source: rtems/bsps/m68k/shared/fpsp/setox.S @ b82a4b4

5
Last change on this file since b82a4b4 was 3cf2bf63, checked in by Sebastian Huber <sebastian.huber@…>, on 03/26/18 at 10:17:06

bsps/m68k: Move fpsp support to bsps

This patch is a part of the BSP source reorganization.

Update #3285.

  • Property mode set to 100644
File size: 28.4 KB
Line 
1#include "fpsp-namespace.h"
2//
3//
4//      setox.sa 3.1 12/10/90
5//
6//      The entry point setox computes the exponential of a value.
7//      setoxd does the same except the input value is a denormalized
8//      number. setoxm1 computes exp(X)-1, and setoxm1d computes
9//      exp(X)-1 for denormalized X.
10//
11//      INPUT
12//      -----
13//      Double-extended value in memory location pointed to by address
14//      register a0.
15//
16//      OUTPUT
17//      ------
18//      exp(X) or exp(X)-1 returned in floating-point register fp0.
19//
20//      ACCURACY and MONOTONICITY
21//      -------------------------
22//      The returned result is within 0.85 ulps in 64 significant bit, i.e.
23//      within 0.5001 ulp to 53 bits if the result is subsequently rounded
24//      to double precision. The result is provably monotonic in double
25//      precision.
26//
27//      SPEED
28//      -----
29//      Two timings are measured, both in the copy-back mode. The
30//      first one is measured when the function is invoked the first time
31//      (so the instructions and data are not in cache), and the
32//      second one is measured when the function is reinvoked at the same
33//      input argument.
34//
35//      The program setox takes approximately 210/190 cycles for input
36//      argument X whose magnitude is less than 16380 log2, which
37//      is the usual situation. For the less common arguments,
38//      depending on their values, the program may run faster or slower --
39//      but no worse than 10% slower even in the extreme cases.
40//
41//      The program setoxm1 takes approximately ???/??? cycles for input
42//      argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
43//      approximately ???/??? cycles. For the less common arguments,
44//      depending on their values, the program may run faster or slower --
45//      but no worse than 10% slower even in the extreme cases.
46//
47//      ALGORITHM and IMPLEMENTATION NOTES
48//      ----------------------------------
49//
50//      setoxd
51//      ------
52//      Step 1. Set ans := 1.0
53//
54//      Step 2. Return  ans := ans + sign(X)*2^(-126). Exit.
55//      Notes:  This will always generate one exception -- inexact.
56//
57//
58//      setox
59//      -----
60//
61//      Step 1. Filter out extreme cases of input argument.
62//              1.1     If |X| >= 2^(-65), go to Step 1.3.
63//              1.2     Go to Step 7.
64//              1.3     If |X| < 16380 log(2), go to Step 2.
65//              1.4     Go to Step 8.
66//      Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
67//               To avoid the use of floating-point comparisons, a
68//               compact representation of |X| is used. This format is a
69//               32-bit integer, the upper (more significant) 16 bits are
70//               the sign and biased exponent field of |X|; the lower 16
71//               bits are the 16 most significant fraction (including the
72//               explicit bit) bits of |X|. Consequently, the comparisons
73//               in Steps 1.1 and 1.3 can be performed by integer comparison.
74//               Note also that the constant 16380 log(2) used in Step 1.3
75//               is also in the compact form. Thus taking the branch
76//               to Step 2 guarantees |X| < 16380 log(2). There is no harm
77//               to have a small number of cases where |X| is less than,
78//               but close to, 16380 log(2) and the branch to Step 9 is
79//               taken.
80//
81//      Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
82//              2.1     Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
83//              2.2     N := round-to-nearest-integer( X * 64/log2 ).
84//              2.3     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
85//              2.4     Calculate       M = (N - J)/64; so N = 64M + J.
86//              2.5     Calculate the address of the stored value of 2^(J/64).
87//              2.6     Create the value Scale = 2^M.
88//      Notes:  The calculation in 2.2 is really performed by
89//
90//                      Z := X * constant
91//                      N := round-to-nearest-integer(Z)
92//
93//               where
94//
95//                      constant := single-precision( 64/log 2 ).
96//
97//               Using a single-precision constant avoids memory access.
98//               Another effect of using a single-precision "constant" is
99//               that the calculated value Z is
100//
101//                      Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
102//
103//               This error has to be considered later in Steps 3 and 4.
104//
105//      Step 3. Calculate X - N*log2/64.
106//              3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
107//              3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
108//      Notes:  a) The way L1 and L2 are chosen ensures L1+L2 approximate
109//               the value      -log2/64        to 88 bits of accuracy.
110//               b) N*L1 is exact because N is no longer than 22 bits and
111//               L1 is no longer than 24 bits.
112//               c) The calculation X+N*L1 is also exact due to cancellation.
113//               Thus, R is practically X+N(L1+L2) to full 64 bits.
114//               d) It is important to estimate how large can |R| be after
115//               Step 3.2.
116//
117//                      N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
118//                      X*64/log2 (1+eps)       =       N + f,  |f| <= 0.5
119//                      X*64/log2 - N   =       f - eps*X 64/log2
120//                      X - N*log2/64   =       f*log2/64 - eps*X
121//
122//
123//               Now |X| <= 16446 log2, thus
124//
125//                      |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
126//                                      <= 0.57 log2/64.
127//               This bound will be used in Step 4.
128//
129//      Step 4. Approximate exp(R)-1 by a polynomial
130//                      p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
131//      Notes:  a) In order to reduce memory access, the coefficients are
132//               made as "short" as possible: A1 (which is 1/2), A4 and A5
133//               are single precision; A2 and A3 are double precision.
134//               b) Even with the restrictions above,
135//                      |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
136//               Note that 0.0062 is slightly bigger than 0.57 log2/64.
137//               c) To fully utilize the pipeline, p is separated into
138//               two independent pieces of roughly equal complexities
139//                      p = [ R + R*S*(A2 + S*A4) ]     +
140//                              [ S*(A1 + S*(A3 + S*A5)) ]
141//               where S = R*R.
142//
143//      Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
144//                              ans := T + ( T*p + t)
145//               where T and t are the stored values for 2^(J/64).
146//      Notes:  2^(J/64) is stored as T and t where T+t approximates
147//               2^(J/64) to roughly 85 bits; T is in extended precision
148//               and t is in single precision. Note also that T is rounded
149//               to 62 bits so that the last two bits of T are zero. The
150//               reason for such a special form is that T-1, T-2, and T-8
151//               will all be exact --- a property that will give much
152//               more accurate computation of the function EXPM1.
153//
154//      Step 6. Reconstruction of exp(X)
155//                      exp(X) = 2^M * 2^(J/64) * exp(R).
156//              6.1     If AdjFlag = 0, go to 6.3
157//              6.2     ans := ans * AdjScale
158//              6.3     Restore the user FPCR
159//              6.4     Return ans := ans * Scale. Exit.
160//      Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
161//               |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
162//               neither overflow nor underflow. If AdjFlag = 1, that
163//               means that
164//                      X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
165//               Hence, exp(X) may overflow or underflow or neither.
166//               When that is the case, AdjScale = 2^(M1) where M1 is
167//               approximately M. Thus 6.2 will never cause over/underflow.
168//               Possible exception in 6.4 is overflow or underflow.
169//               The inexact exception is not generated in 6.4. Although
170//               one can argue that the inexact flag should always be
171//               raised, to simulate that exception cost to much than the
172//               flag is worth in practical uses.
173//
174//      Step 7. Return 1 + X.
175//              7.1     ans := X
176//              7.2     Restore user FPCR.
177//              7.3     Return ans := 1 + ans. Exit
178//      Notes:  For non-zero X, the inexact exception will always be
179//               raised by 7.3. That is the only exception raised by 7.3.
180//               Note also that we use the FMOVEM instruction to move X
181//               in Step 7.1 to avoid unnecessary trapping. (Although
182//               the FMOVEM may not seem relevant since X is normalized,
183//               the precaution will be useful in the library version of
184//               this code where the separate entry for denormalized inputs
185//               will be done away with.)
186//
187//      Step 8. Handle exp(X) where |X| >= 16380log2.
188//              8.1     If |X| > 16480 log2, go to Step 9.
189//              (mimic 2.2 - 2.6)
190//              8.2     N := round-to-integer( X * 64/log2 )
191//              8.3     Calculate J = N mod 64, J = 0,1,...,63
192//              8.4     K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
193//              8.5     Calculate the address of the stored value 2^(J/64).
194//              8.6     Create the values Scale = 2^M, AdjScale = 2^M1.
195//              8.7     Go to Step 3.
196//      Notes:  Refer to notes for 2.2 - 2.6.
197//
198//      Step 9. Handle exp(X), |X| > 16480 log2.
199//              9.1     If X < 0, go to 9.3
200//              9.2     ans := Huge, go to 9.4
201//              9.3     ans := Tiny.
202//              9.4     Restore user FPCR.
203//              9.5     Return ans := ans * ans. Exit.
204//      Notes:  Exp(X) will surely overflow or underflow, depending on
205//               X's sign. "Huge" and "Tiny" are respectively large/tiny
206//               extended-precision numbers whose square over/underflow
207//               with an inexact result. Thus, 9.5 always raises the
208//               inexact together with either overflow or underflow.
209//
210//
211//      setoxm1d
212//      --------
213//
214//      Step 1. Set ans := 0
215//
216//      Step 2. Return  ans := X + ans. Exit.
217//      Notes:  This will return X with the appropriate rounding
218//               precision prescribed by the user FPCR.
219//
220//      setoxm1
221//      -------
222//
223//      Step 1. Check |X|
224//              1.1     If |X| >= 1/4, go to Step 1.3.
225//              1.2     Go to Step 7.
226//              1.3     If |X| < 70 log(2), go to Step 2.
227//              1.4     Go to Step 10.
228//      Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
229//               However, it is conceivable |X| can be small very often
230//               because EXPM1 is intended to evaluate exp(X)-1 accurately
231//               when |X| is small. For further details on the comparisons,
232//               see the notes on Step 1 of setox.
233//
234//      Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
235//              2.1     N := round-to-nearest-integer( X * 64/log2 ).
236//              2.2     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
237//              2.3     Calculate       M = (N - J)/64; so N = 64M + J.
238//              2.4     Calculate the address of the stored value of 2^(J/64).
239//              2.5     Create the values Sc = 2^M and OnebySc := -2^(-M).
240//      Notes:  See the notes on Step 2 of setox.
241//
242//      Step 3. Calculate X - N*log2/64.
243//              3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
244//              3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
245//      Notes:  Applying the analysis of Step 3 of setox in this case
246//               shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
247//               this case).
248//
249//      Step 4. Approximate exp(R)-1 by a polynomial
250//                      p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
251//      Notes:  a) In order to reduce memory access, the coefficients are
252//               made as "short" as possible: A1 (which is 1/2), A5 and A6
253//               are single precision; A2, A3 and A4 are double precision.
254//               b) Even with the restriction above,
255//                      |p - (exp(R)-1)| <      |R| * 2^(-72.7)
256//               for all |R| <= 0.0055.
257//               c) To fully utilize the pipeline, p is separated into
258//               two independent pieces of roughly equal complexity
259//                      p = [ R*S*(A2 + S*(A4 + S*A6)) ]        +
260//                              [ R + S*(A1 + S*(A3 + S*A5)) ]
261//               where S = R*R.
262//
263//      Step 5. Compute 2^(J/64)*p by
264//                              p := T*p
265//               where T and t are the stored values for 2^(J/64).
266//      Notes:  2^(J/64) is stored as T and t where T+t approximates
267//               2^(J/64) to roughly 85 bits; T is in extended precision
268//               and t is in single precision. Note also that T is rounded
269//               to 62 bits so that the last two bits of T are zero. The
270//               reason for such a special form is that T-1, T-2, and T-8
271//               will all be exact --- a property that will be exploited
272//               in Step 6 below. The total relative error in p is no
273//               bigger than 2^(-67.7) compared to the final result.
274//
275//      Step 6. Reconstruction of exp(X)-1
276//                      exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
277//              6.1     If M <= 63, go to Step 6.3.
278//              6.2     ans := T + (p + (t + OnebySc)). Go to 6.6
279//              6.3     If M >= -3, go to 6.5.
280//              6.4     ans := (T + (p + t)) + OnebySc. Go to 6.6
281//              6.5     ans := (T + OnebySc) + (p + t).
282//              6.6     Restore user FPCR.
283//              6.7     Return ans := Sc * ans. Exit.
284//      Notes:  The various arrangements of the expressions give accurate
285//               evaluations.
286//
287//      Step 7. exp(X)-1 for |X| < 1/4.
288//              7.1     If |X| >= 2^(-65), go to Step 9.
289//              7.2     Go to Step 8.
290//
291//      Step 8. Calculate exp(X)-1, |X| < 2^(-65).
292//              8.1     If |X| < 2^(-16312), goto 8.3
293//              8.2     Restore FPCR; return ans := X - 2^(-16382). Exit.
294//              8.3     X := X * 2^(140).
295//              8.4     Restore FPCR; ans := ans - 2^(-16382).
296//               Return ans := ans*2^(140). Exit
297//      Notes:  The idea is to return "X - tiny" under the user
298//               precision and rounding modes. To avoid unnecessary
299//               inefficiency, we stay away from denormalized numbers the
300//               best we can. For |X| >= 2^(-16312), the straightforward
301//               8.2 generates the inexact exception as the case warrants.
302//
303//      Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial
304//                      p = X + X*X*(B1 + X*(B2 + ... + X*B12))
305//      Notes:  a) In order to reduce memory access, the coefficients are
306//               made as "short" as possible: B1 (which is 1/2), B9 to B12
307//               are single precision; B3 to B8 are double precision; and
308//               B2 is double extended.
309//               b) Even with the restriction above,
310//                      |p - (exp(X)-1)| < |X| 2^(-70.6)
311//               for all |X| <= 0.251.
312//               Note that 0.251 is slightly bigger than 1/4.
313//               c) To fully preserve accuracy, the polynomial is computed
314//               as     X + ( S*B1 +    Q ) where S = X*X and
315//                      Q       =       X*S*(B2 + X*(B3 + ... + X*B12))
316//               d) To fully utilize the pipeline, Q is separated into
317//               two independent pieces of roughly equal complexity
318//                      Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
319//                              [ S*S*(B3 + S*(B5 + ... + S*B11)) ]
320//
321//      Step 10.        Calculate exp(X)-1 for |X| >= 70 log 2.
322//              10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
323//               purposes. Therefore, go to Step 1 of setox.
324//              10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
325//               ans := -1
326//               Restore user FPCR
327//               Return ans := ans + 2^(-126). Exit.
328//      Notes:  10.2 will always create an inexact and return -1 + tiny
329//               in the user rounding precision and mode.
330//
331//
332
333//              Copyright (C) Motorola, Inc. 1990
334//                      All Rights Reserved
335//
336//      THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
337//      The copyright notice above does not evidence any
338//      actual or intended publication of such source code.
339
340//setox idnt    2,1 | Motorola 040 Floating Point Software Package
341
342        |section        8
343
344#include "fpsp.defs"
345
346L2:     .long   0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
347
348EXPA3:  .long   0x3FA55555,0x55554431
349EXPA2:  .long   0x3FC55555,0x55554018
350
351HUGE:   .long   0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
352TINY:   .long   0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
353
354EM1A4:  .long   0x3F811111,0x11174385
355EM1A3:  .long   0x3FA55555,0x55554F5A
356
357EM1A2:  .long   0x3FC55555,0x55555555,0x00000000,0x00000000
358
359EM1B8:  .long   0x3EC71DE3,0xA5774682
360EM1B7:  .long   0x3EFA01A0,0x19D7CB68
361
362EM1B6:  .long   0x3F2A01A0,0x1A019DF3
363EM1B5:  .long   0x3F56C16C,0x16C170E2
364
365EM1B4:  .long   0x3F811111,0x11111111
366EM1B3:  .long   0x3FA55555,0x55555555
367
368EM1B2:  .long   0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
369        .long   0x00000000
370
371TWO140: .long   0x48B00000,0x00000000
372TWON140:        .long   0x37300000,0x00000000
373
374EXPTBL:
375        .long   0x3FFF0000,0x80000000,0x00000000,0x00000000
376        .long   0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
377        .long   0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
378        .long   0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
379        .long   0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
380        .long   0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
381        .long   0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
382        .long   0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
383        .long   0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
384        .long   0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
385        .long   0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
386        .long   0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
387        .long   0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
388        .long   0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
389        .long   0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
390        .long   0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
391        .long   0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
392        .long   0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
393        .long   0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
394        .long   0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
395        .long   0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
396        .long   0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
397        .long   0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
398        .long   0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
399        .long   0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
400        .long   0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
401        .long   0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
402        .long   0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
403        .long   0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
404        .long   0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
405        .long   0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
406        .long   0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
407        .long   0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
408        .long   0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
409        .long   0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
410        .long   0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
411        .long   0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
412        .long   0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
413        .long   0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
414        .long   0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
415        .long   0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
416        .long   0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
417        .long   0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
418        .long   0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
419        .long   0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
420        .long   0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
421        .long   0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
422        .long   0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
423        .long   0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
424        .long   0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
425        .long   0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
426        .long   0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
427        .long   0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
428        .long   0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
429        .long   0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
430        .long   0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
431        .long   0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
432        .long   0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
433        .long   0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
434        .long   0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
435        .long   0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
436        .long   0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
437        .long   0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
438        .long   0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
439
440        .set    ADJFLAG,L_SCR2
441        .set    SCALE,FP_SCR1
442        .set    ADJSCALE,FP_SCR2
443        .set    SC,FP_SCR3
444        .set    ONEBYSC,FP_SCR4
445
446        | xref  t_frcinx
447        |xref   t_extdnrm
448        |xref   t_unfl
449        |xref   t_ovfl
450
451        .global setoxd
452setoxd:
453//--entry point for EXP(X), X is denormalized
454        movel           (%a0),%d0
455        andil           #0x80000000,%d0
456        oril            #0x00800000,%d0         // ...sign(X)*2^(-126)
457        movel           %d0,-(%sp)
458        fmoves          #0x3F800000,%fp0
459        fmovel          %d1,%fpcr
460        fadds           (%sp)+,%fp0
461        bra             t_frcinx
462
463        .global setox
464setox:
465//--entry point for EXP(X), here X is finite, non-zero, and not NaN's
466
467//--Step 1.
468        movel           (%a0),%d0        // ...load part of input X
469        andil           #0x7FFF0000,%d0 // ...biased expo. of X
470        cmpil           #0x3FBE0000,%d0 // ...2^(-65)
471        bges            EXPC1           // ...normal case
472        bra             EXPSM
473
474EXPC1:
475//--The case |X| >= 2^(-65)
476        movew           4(%a0),%d0      // ...expo. and partial sig. of |X|
477        cmpil           #0x400CB167,%d0 // ...16380 log2 trunc. 16 bits
478        blts            EXPMAIN  // ...normal case
479        bra             EXPBIG
480
481EXPMAIN:
482//--Step 2.
483//--This is the normal branch:  2^(-65) <= |X| < 16380 log2.
484        fmovex          (%a0),%fp0      // ...load input from (a0)
485
486        fmovex          %fp0,%fp1
487        fmuls           #0x42B8AA3B,%fp0        // ...64/log2 * X
488        fmovemx %fp2-%fp2/%fp3,-(%a7)           // ...save fp2
489        movel           #0,ADJFLAG(%a6)
490        fmovel          %fp0,%d0                // ...N = int( X * 64/log2 )
491        lea             EXPTBL,%a1
492        fmovel          %d0,%fp0                // ...convert to floating-format
493
494        movel           %d0,L_SCR1(%a6) // ...save N temporarily
495        andil           #0x3F,%d0               // ...D0 is J = N mod 64
496        lsll            #4,%d0
497        addal           %d0,%a1         // ...address of 2^(J/64)
498        movel           L_SCR1(%a6),%d0
499        asrl            #6,%d0          // ...D0 is M
500        addiw           #0x3FFF,%d0     // ...biased expo. of 2^(M)
501        movew           L2,L_SCR1(%a6)  // ...prefetch L2, no need in CB
502
503EXPCONT1:
504//--Step 3.
505//--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
506//--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
507        fmovex          %fp0,%fp2
508        fmuls           #0xBC317218,%fp0        // ...N * L1, L1 = lead(-log2/64)
509        fmulx           L2,%fp2         // ...N * L2, L1+L2 = -log2/64
510        faddx           %fp1,%fp0               // ...X + N*L1
511        faddx           %fp2,%fp0               // ...fp0 is R, reduced arg.
512//      MOVE.W          #$3FA5,EXPA3    ...load EXPA3 in cache
513
514//--Step 4.
515//--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
516//-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
517//--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
518//--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
519
520        fmovex          %fp0,%fp1
521        fmulx           %fp1,%fp1               // ...fp1 IS S = R*R
522
523        fmoves          #0x3AB60B70,%fp2        // ...fp2 IS A5
524//      MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
525
526        fmulx           %fp1,%fp2               // ...fp2 IS S*A5
527        fmovex          %fp1,%fp3
528        fmuls           #0x3C088895,%fp3        // ...fp3 IS S*A4
529
530        faddd           EXPA3,%fp2      // ...fp2 IS A3+S*A5
531        faddd           EXPA2,%fp3      // ...fp3 IS A2+S*A4
532
533        fmulx           %fp1,%fp2               // ...fp2 IS S*(A3+S*A5)
534        movew           %d0,SCALE(%a6)  // ...SCALE is 2^(M) in extended
535        clrw            SCALE+2(%a6)
536        movel           #0x80000000,SCALE+4(%a6)
537        clrl            SCALE+8(%a6)
538
539        fmulx           %fp1,%fp3               // ...fp3 IS S*(A2+S*A4)
540
541        fadds           #0x3F000000,%fp2        // ...fp2 IS A1+S*(A3+S*A5)
542        fmulx           %fp0,%fp3               // ...fp3 IS R*S*(A2+S*A4)
543
544        fmulx           %fp1,%fp2               // ...fp2 IS S*(A1+S*(A3+S*A5))
545        faddx           %fp3,%fp0               // ...fp0 IS R+R*S*(A2+S*A4),
546//                                      ...fp3 released
547
548        fmovex          (%a1)+,%fp1     // ...fp1 is lead. pt. of 2^(J/64)
549        faddx           %fp2,%fp0               // ...fp0 is EXP(R) - 1
550//                                      ...fp2 released
551
552//--Step 5
553//--final reconstruction process
554//--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
555
556        fmulx           %fp1,%fp0               // ...2^(J/64)*(Exp(R)-1)
557        fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
558        fadds           (%a1),%fp0      // ...accurate 2^(J/64)
559
560        faddx           %fp1,%fp0               // ...2^(J/64) + 2^(J/64)*...
561        movel           ADJFLAG(%a6),%d0
562
563//--Step 6
564        tstl            %d0
565        beqs            NORMAL
566ADJUST:
567        fmulx           ADJSCALE(%a6),%fp0
568NORMAL:
569        fmovel          %d1,%FPCR               // ...restore user FPCR
570        fmulx           SCALE(%a6),%fp0 // ...multiply 2^(M)
571        bra             t_frcinx
572
573EXPSM:
574//--Step 7
575        fmovemx (%a0),%fp0-%fp0 // ...in case X is denormalized
576        fmovel          %d1,%FPCR
577        fadds           #0x3F800000,%fp0        // ...1+X in user mode
578        bra             t_frcinx
579
580EXPBIG:
581//--Step 8
582        cmpil           #0x400CB27C,%d0 // ...16480 log2
583        bgts            EXP2BIG
584//--Steps 8.2 -- 8.6
585        fmovex          (%a0),%fp0      // ...load input from (a0)
586
587        fmovex          %fp0,%fp1
588        fmuls           #0x42B8AA3B,%fp0        // ...64/log2 * X
589        fmovemx  %fp2-%fp2/%fp3,-(%a7)          // ...save fp2
590        movel           #1,ADJFLAG(%a6)
591        fmovel          %fp0,%d0                // ...N = int( X * 64/log2 )
592        lea             EXPTBL,%a1
593        fmovel          %d0,%fp0                // ...convert to floating-format
594        movel           %d0,L_SCR1(%a6)                 // ...save N temporarily
595        andil           #0x3F,%d0                // ...D0 is J = N mod 64
596        lsll            #4,%d0
597        addal           %d0,%a1                 // ...address of 2^(J/64)
598        movel           L_SCR1(%a6),%d0
599        asrl            #6,%d0                  // ...D0 is K
600        movel           %d0,L_SCR1(%a6)                 // ...save K temporarily
601        asrl            #1,%d0                  // ...D0 is M1
602        subl            %d0,L_SCR1(%a6)                 // ...a1 is M
603        addiw           #0x3FFF,%d0             // ...biased expo. of 2^(M1)
604        movew           %d0,ADJSCALE(%a6)               // ...ADJSCALE := 2^(M1)
605        clrw            ADJSCALE+2(%a6)
606        movel           #0x80000000,ADJSCALE+4(%a6)
607        clrl            ADJSCALE+8(%a6)
608        movel           L_SCR1(%a6),%d0                 // ...D0 is M
609        addiw           #0x3FFF,%d0             // ...biased expo. of 2^(M)
610        bra             EXPCONT1                // ...go back to Step 3
611
612EXP2BIG:
613//--Step 9
614        fmovel          %d1,%FPCR
615        movel           (%a0),%d0
616        bclrb           #sign_bit,(%a0)         // ...setox always returns positive
617        cmpil           #0,%d0
618        blt             t_unfl
619        bra             t_ovfl
620
621        .global setoxm1d
622setoxm1d:
623//--entry point for EXPM1(X), here X is denormalized
624//--Step 0.
625        bra             t_extdnrm
626
627
628        .global setoxm1
629setoxm1:
630//--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
631
632//--Step 1.
633//--Step 1.1
634        movel           (%a0),%d0        // ...load part of input X
635        andil           #0x7FFF0000,%d0 // ...biased expo. of X
636        cmpil           #0x3FFD0000,%d0 // ...1/4
637        bges            EM1CON1  // ...|X| >= 1/4
638        bra             EM1SM
639
640EM1CON1:
641//--Step 1.3
642//--The case |X| >= 1/4
643        movew           4(%a0),%d0      // ...expo. and partial sig. of |X|
644        cmpil           #0x4004C215,%d0 // ...70log2 rounded up to 16 bits
645        bles            EM1MAIN  // ...1/4 <= |X| <= 70log2
646        bra             EM1BIG
647
648EM1MAIN:
649//--Step 2.
650//--This is the case:   1/4 <= |X| <= 70 log2.
651        fmovex          (%a0),%fp0      // ...load input from (a0)
652
653        fmovex          %fp0,%fp1
654        fmuls           #0x42B8AA3B,%fp0        // ...64/log2 * X
655        fmovemx %fp2-%fp2/%fp3,-(%a7)           // ...save fp2
656//      MOVE.W          #$3F81,EM1A4            ...prefetch in CB mode
657        fmovel          %fp0,%d0                // ...N = int( X * 64/log2 )
658        lea             EXPTBL,%a1
659        fmovel          %d0,%fp0                // ...convert to floating-format
660
661        movel           %d0,L_SCR1(%a6)                 // ...save N temporarily
662        andil           #0x3F,%d0                // ...D0 is J = N mod 64
663        lsll            #4,%d0
664        addal           %d0,%a1                 // ...address of 2^(J/64)
665        movel           L_SCR1(%a6),%d0
666        asrl            #6,%d0                  // ...D0 is M
667        movel           %d0,L_SCR1(%a6)                 // ...save a copy of M
668//      MOVE.W          #$3FDC,L2               ...prefetch L2 in CB mode
669
670//--Step 3.
671//--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
672//--a0 points to 2^(J/64), D0 and a1 both contain M
673        fmovex          %fp0,%fp2
674        fmuls           #0xBC317218,%fp0        // ...N * L1, L1 = lead(-log2/64)
675        fmulx           L2,%fp2         // ...N * L2, L1+L2 = -log2/64
676        faddx           %fp1,%fp0        // ...X + N*L1
677        faddx           %fp2,%fp0        // ...fp0 is R, reduced arg.
678//      MOVE.W          #$3FC5,EM1A2            ...load EM1A2 in cache
679        addiw           #0x3FFF,%d0             // ...D0 is biased expo. of 2^M
680
681//--Step 4.
682//--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
683//-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
684//--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
685//--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
686
687        fmovex          %fp0,%fp1
688        fmulx           %fp1,%fp1               // ...fp1 IS S = R*R
689
690        fmoves          #0x3950097B,%fp2        // ...fp2 IS a6
691//      MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
692
693        fmulx           %fp1,%fp2               // ...fp2 IS S*A6
694        fmovex          %fp1,%fp3
695        fmuls           #0x3AB60B6A,%fp3        // ...fp3 IS S*A5
696
697        faddd           EM1A4,%fp2      // ...fp2 IS A4+S*A6
698        faddd           EM1A3,%fp3      // ...fp3 IS A3+S*A5
699        movew           %d0,SC(%a6)             // ...SC is 2^(M) in extended
700        clrw            SC+2(%a6)
701        movel           #0x80000000,SC+4(%a6)
702        clrl            SC+8(%a6)
703
704        fmulx           %fp1,%fp2               // ...fp2 IS S*(A4+S*A6)
705        movel           L_SCR1(%a6),%d0         // ...D0 is     M
706        negw            %d0             // ...D0 is -M
707        fmulx           %fp1,%fp3               // ...fp3 IS S*(A3+S*A5)
708        addiw           #0x3FFF,%d0     // ...biased expo. of 2^(-M)
709        faddd           EM1A2,%fp2      // ...fp2 IS A2+S*(A4+S*A6)
710        fadds           #0x3F000000,%fp3        // ...fp3 IS A1+S*(A3+S*A5)
711
712        fmulx           %fp1,%fp2               // ...fp2 IS S*(A2+S*(A4+S*A6))
713        oriw            #0x8000,%d0     // ...signed/expo. of -2^(-M)
714        movew           %d0,ONEBYSC(%a6)        // ...OnebySc is -2^(-M)
715        clrw            ONEBYSC+2(%a6)
716        movel           #0x80000000,ONEBYSC+4(%a6)
717        clrl            ONEBYSC+8(%a6)
718        fmulx           %fp3,%fp1               // ...fp1 IS S*(A1+S*(A3+S*A5))
719//                                      ...fp3 released
720
721        fmulx           %fp0,%fp2               // ...fp2 IS R*S*(A2+S*(A4+S*A6))
722        faddx           %fp1,%fp0               // ...fp0 IS R+S*(A1+S*(A3+S*A5))
723//                                      ...fp1 released
724
725        faddx           %fp2,%fp0               // ...fp0 IS EXP(R)-1
726//                                      ...fp2 released
727        fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
728
729//--Step 5
730//--Compute 2^(J/64)*p
731
732        fmulx           (%a1),%fp0      // ...2^(J/64)*(Exp(R)-1)
733
734//--Step 6
735//--Step 6.1
736        movel           L_SCR1(%a6),%d0         // ...retrieve M
737        cmpil           #63,%d0
738        bles            MLE63
739//--Step 6.2    M >= 64
740        fmoves          12(%a1),%fp1    // ...fp1 is t
741        faddx           ONEBYSC(%a6),%fp1       // ...fp1 is t+OnebySc
742        faddx           %fp1,%fp0               // ...p+(t+OnebySc), fp1 released
743        faddx           (%a1),%fp0      // ...T+(p+(t+OnebySc))
744        bras            EM1SCALE
745MLE63:
746//--Step 6.3    M <= 63
747        cmpil           #-3,%d0
748        bges            MGEN3
749MLTN3:
750//--Step 6.4    M <= -4
751        fadds           12(%a1),%fp0    // ...p+t
752        faddx           (%a1),%fp0      // ...T+(p+t)
753        faddx           ONEBYSC(%a6),%fp0       // ...OnebySc + (T+(p+t))
754        bras            EM1SCALE
755MGEN3:
756//--Step 6.5    -3 <= M <= 63
757        fmovex          (%a1)+,%fp1     // ...fp1 is T
758        fadds           (%a1),%fp0      // ...fp0 is p+t
759        faddx           ONEBYSC(%a6),%fp1       // ...fp1 is T+OnebySc
760        faddx           %fp1,%fp0               // ...(T+OnebySc)+(p+t)
761
762EM1SCALE:
763//--Step 6.6
764        fmovel          %d1,%FPCR
765        fmulx           SC(%a6),%fp0
766
767        bra             t_frcinx
768
769EM1SM:
770//--Step 7      |X| < 1/4.
771        cmpil           #0x3FBE0000,%d0 // ...2^(-65)
772        bges            EM1POLY
773
774EM1TINY:
775//--Step 8      |X| < 2^(-65)
776        cmpil           #0x00330000,%d0 // ...2^(-16312)
777        blts            EM12TINY
778//--Step 8.2
779        movel           #0x80010000,SC(%a6)     // ...SC is -2^(-16382)
780        movel           #0x80000000,SC+4(%a6)
781        clrl            SC+8(%a6)
782        fmovex          (%a0),%fp0
783        fmovel          %d1,%FPCR
784        faddx           SC(%a6),%fp0
785
786        bra             t_frcinx
787
788EM12TINY:
789//--Step 8.3
790        fmovex          (%a0),%fp0
791        fmuld           TWO140,%fp0
792        movel           #0x80010000,SC(%a6)
793        movel           #0x80000000,SC+4(%a6)
794        clrl            SC+8(%a6)
795        faddx           SC(%a6),%fp0
796        fmovel          %d1,%FPCR
797        fmuld           TWON140,%fp0
798
799        bra             t_frcinx
800
801EM1POLY:
802//--Step 9      exp(X)-1 by a simple polynomial
803        fmovex          (%a0),%fp0      // ...fp0 is X
804        fmulx           %fp0,%fp0               // ...fp0 is S := X*X
805        fmovemx %fp2-%fp2/%fp3,-(%a7)   // ...save fp2
806        fmoves          #0x2F30CAA8,%fp1        // ...fp1 is B12
807        fmulx           %fp0,%fp1               // ...fp1 is S*B12
808        fmoves          #0x310F8290,%fp2        // ...fp2 is B11
809        fadds           #0x32D73220,%fp1        // ...fp1 is B10+S*B12
810
811        fmulx           %fp0,%fp2               // ...fp2 is S*B11
812        fmulx           %fp0,%fp1               // ...fp1 is S*(B10 + ...
813
814        fadds           #0x3493F281,%fp2        // ...fp2 is B9+S*...
815        faddd           EM1B8,%fp1      // ...fp1 is B8+S*...
816
817        fmulx           %fp0,%fp2               // ...fp2 is S*(B9+...
818        fmulx           %fp0,%fp1               // ...fp1 is S*(B8+...
819
820        faddd           EM1B7,%fp2      // ...fp2 is B7+S*...
821        faddd           EM1B6,%fp1      // ...fp1 is B6+S*...
822
823        fmulx           %fp0,%fp2               // ...fp2 is S*(B7+...
824        fmulx           %fp0,%fp1               // ...fp1 is S*(B6+...
825
826        faddd           EM1B5,%fp2      // ...fp2 is B5+S*...
827        faddd           EM1B4,%fp1      // ...fp1 is B4+S*...
828
829        fmulx           %fp0,%fp2               // ...fp2 is S*(B5+...
830        fmulx           %fp0,%fp1               // ...fp1 is S*(B4+...
831
832        faddd           EM1B3,%fp2      // ...fp2 is B3+S*...
833        faddx           EM1B2,%fp1      // ...fp1 is B2+S*...
834
835        fmulx           %fp0,%fp2               // ...fp2 is S*(B3+...
836        fmulx           %fp0,%fp1               // ...fp1 is S*(B2+...
837
838        fmulx           %fp0,%fp2               // ...fp2 is S*S*(B3+...)
839        fmulx           (%a0),%fp1      // ...fp1 is X*S*(B2...
840
841        fmuls           #0x3F000000,%fp0        // ...fp0 is S*B1
842        faddx           %fp2,%fp1               // ...fp1 is Q
843//                                      ...fp2 released
844
845        fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
846
847        faddx           %fp1,%fp0               // ...fp0 is S*B1+Q
848//                                      ...fp1 released
849
850        fmovel          %d1,%FPCR
851        faddx           (%a0),%fp0
852
853        bra             t_frcinx
854
855EM1BIG:
856//--Step 10     |X| > 70 log2
857        movel           (%a0),%d0
858        cmpil           #0,%d0
859        bgt             EXPC1
860//--Step 10.2
861        fmoves          #0xBF800000,%fp0        // ...fp0 is -1
862        fmovel          %d1,%FPCR
863        fadds           #0x00800000,%fp0        // ...-1 + 2^(-126)
864
865        bra             t_frcinx
866
867        |end
Note: See TracBrowser for help on using the repository browser.