1 | #include "fpsp-namespace.h" |
---|
2 | // |
---|
3 | // |
---|
4 | // setox.sa 3.1 12/10/90 |
---|
5 | // |
---|
6 | // The entry point setox computes the exponential of a value. |
---|
7 | // setoxd does the same except the input value is a denormalized |
---|
8 | // number. setoxm1 computes exp(X)-1, and setoxm1d computes |
---|
9 | // exp(X)-1 for denormalized X. |
---|
10 | // |
---|
11 | // INPUT |
---|
12 | // ----- |
---|
13 | // Double-extended value in memory location pointed to by address |
---|
14 | // register a0. |
---|
15 | // |
---|
16 | // OUTPUT |
---|
17 | // ------ |
---|
18 | // exp(X) or exp(X)-1 returned in floating-point register fp0. |
---|
19 | // |
---|
20 | // ACCURACY and MONOTONICITY |
---|
21 | // ------------------------- |
---|
22 | // The returned result is within 0.85 ulps in 64 significant bit, i.e. |
---|
23 | // within 0.5001 ulp to 53 bits if the result is subsequently rounded |
---|
24 | // to double precision. The result is provably monotonic in double |
---|
25 | // precision. |
---|
26 | // |
---|
27 | // SPEED |
---|
28 | // ----- |
---|
29 | // Two timings are measured, both in the copy-back mode. The |
---|
30 | // first one is measured when the function is invoked the first time |
---|
31 | // (so the instructions and data are not in cache), and the |
---|
32 | // second one is measured when the function is reinvoked at the same |
---|
33 | // input argument. |
---|
34 | // |
---|
35 | // The program setox takes approximately 210/190 cycles for input |
---|
36 | // argument X whose magnitude is less than 16380 log2, which |
---|
37 | // is the usual situation. For the less common arguments, |
---|
38 | // depending on their values, the program may run faster or slower -- |
---|
39 | // but no worse than 10% slower even in the extreme cases. |
---|
40 | // |
---|
41 | // The program setoxm1 takes approximately ???/??? cycles for input |
---|
42 | // argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes |
---|
43 | // approximately ???/??? cycles. For the less common arguments, |
---|
44 | // depending on their values, the program may run faster or slower -- |
---|
45 | // but no worse than 10% slower even in the extreme cases. |
---|
46 | // |
---|
47 | // ALGORITHM and IMPLEMENTATION NOTES |
---|
48 | // ---------------------------------- |
---|
49 | // |
---|
50 | // setoxd |
---|
51 | // ------ |
---|
52 | // Step 1. Set ans := 1.0 |
---|
53 | // |
---|
54 | // Step 2. Return ans := ans + sign(X)*2^(-126). Exit. |
---|
55 | // Notes: This will always generate one exception -- inexact. |
---|
56 | // |
---|
57 | // |
---|
58 | // setox |
---|
59 | // ----- |
---|
60 | // |
---|
61 | // Step 1. Filter out extreme cases of input argument. |
---|
62 | // 1.1 If |X| >= 2^(-65), go to Step 1.3. |
---|
63 | // 1.2 Go to Step 7. |
---|
64 | // 1.3 If |X| < 16380 log(2), go to Step 2. |
---|
65 | // 1.4 Go to Step 8. |
---|
66 | // Notes: The usual case should take the branches 1.1 -> 1.3 -> 2. |
---|
67 | // To avoid the use of floating-point comparisons, a |
---|
68 | // compact representation of |X| is used. This format is a |
---|
69 | // 32-bit integer, the upper (more significant) 16 bits are |
---|
70 | // the sign and biased exponent field of |X|; the lower 16 |
---|
71 | // bits are the 16 most significant fraction (including the |
---|
72 | // explicit bit) bits of |X|. Consequently, the comparisons |
---|
73 | // in Steps 1.1 and 1.3 can be performed by integer comparison. |
---|
74 | // Note also that the constant 16380 log(2) used in Step 1.3 |
---|
75 | // is also in the compact form. Thus taking the branch |
---|
76 | // to Step 2 guarantees |X| < 16380 log(2). There is no harm |
---|
77 | // to have a small number of cases where |X| is less than, |
---|
78 | // but close to, 16380 log(2) and the branch to Step 9 is |
---|
79 | // taken. |
---|
80 | // |
---|
81 | // Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ). |
---|
82 | // 2.1 Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken) |
---|
83 | // 2.2 N := round-to-nearest-integer( X * 64/log2 ). |
---|
84 | // 2.3 Calculate J = N mod 64; so J = 0,1,2,..., or 63. |
---|
85 | // 2.4 Calculate M = (N - J)/64; so N = 64M + J. |
---|
86 | // 2.5 Calculate the address of the stored value of 2^(J/64). |
---|
87 | // 2.6 Create the value Scale = 2^M. |
---|
88 | // Notes: The calculation in 2.2 is really performed by |
---|
89 | // |
---|
90 | // Z := X * constant |
---|
91 | // N := round-to-nearest-integer(Z) |
---|
92 | // |
---|
93 | // where |
---|
94 | // |
---|
95 | // constant := single-precision( 64/log 2 ). |
---|
96 | // |
---|
97 | // Using a single-precision constant avoids memory access. |
---|
98 | // Another effect of using a single-precision "constant" is |
---|
99 | // that the calculated value Z is |
---|
100 | // |
---|
101 | // Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24). |
---|
102 | // |
---|
103 | // This error has to be considered later in Steps 3 and 4. |
---|
104 | // |
---|
105 | // Step 3. Calculate X - N*log2/64. |
---|
106 | // 3.1 R := X + N*L1, where L1 := single-precision(-log2/64). |
---|
107 | // 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1). |
---|
108 | // Notes: a) The way L1 and L2 are chosen ensures L1+L2 approximate |
---|
109 | // the value -log2/64 to 88 bits of accuracy. |
---|
110 | // b) N*L1 is exact because N is no longer than 22 bits and |
---|
111 | // L1 is no longer than 24 bits. |
---|
112 | // c) The calculation X+N*L1 is also exact due to cancellation. |
---|
113 | // Thus, R is practically X+N(L1+L2) to full 64 bits. |
---|
114 | // d) It is important to estimate how large can |R| be after |
---|
115 | // Step 3.2. |
---|
116 | // |
---|
117 | // N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24) |
---|
118 | // X*64/log2 (1+eps) = N + f, |f| <= 0.5 |
---|
119 | // X*64/log2 - N = f - eps*X 64/log2 |
---|
120 | // X - N*log2/64 = f*log2/64 - eps*X |
---|
121 | // |
---|
122 | // |
---|
123 | // Now |X| <= 16446 log2, thus |
---|
124 | // |
---|
125 | // |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64 |
---|
126 | // <= 0.57 log2/64. |
---|
127 | // This bound will be used in Step 4. |
---|
128 | // |
---|
129 | // Step 4. Approximate exp(R)-1 by a polynomial |
---|
130 | // p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5)))) |
---|
131 | // Notes: a) In order to reduce memory access, the coefficients are |
---|
132 | // made as "short" as possible: A1 (which is 1/2), A4 and A5 |
---|
133 | // are single precision; A2 and A3 are double precision. |
---|
134 | // b) Even with the restrictions above, |
---|
135 | // |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062. |
---|
136 | // Note that 0.0062 is slightly bigger than 0.57 log2/64. |
---|
137 | // c) To fully utilize the pipeline, p is separated into |
---|
138 | // two independent pieces of roughly equal complexities |
---|
139 | // p = [ R + R*S*(A2 + S*A4) ] + |
---|
140 | // [ S*(A1 + S*(A3 + S*A5)) ] |
---|
141 | // where S = R*R. |
---|
142 | // |
---|
143 | // Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by |
---|
144 | // ans := T + ( T*p + t) |
---|
145 | // where T and t are the stored values for 2^(J/64). |
---|
146 | // Notes: 2^(J/64) is stored as T and t where T+t approximates |
---|
147 | // 2^(J/64) to roughly 85 bits; T is in extended precision |
---|
148 | // and t is in single precision. Note also that T is rounded |
---|
149 | // to 62 bits so that the last two bits of T are zero. The |
---|
150 | // reason for such a special form is that T-1, T-2, and T-8 |
---|
151 | // will all be exact --- a property that will give much |
---|
152 | // more accurate computation of the function EXPM1. |
---|
153 | // |
---|
154 | // Step 6. Reconstruction of exp(X) |
---|
155 | // exp(X) = 2^M * 2^(J/64) * exp(R). |
---|
156 | // 6.1 If AdjFlag = 0, go to 6.3 |
---|
157 | // 6.2 ans := ans * AdjScale |
---|
158 | // 6.3 Restore the user FPCR |
---|
159 | // 6.4 Return ans := ans * Scale. Exit. |
---|
160 | // Notes: If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R, |
---|
161 | // |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will |
---|
162 | // neither overflow nor underflow. If AdjFlag = 1, that |
---|
163 | // means that |
---|
164 | // X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380. |
---|
165 | // Hence, exp(X) may overflow or underflow or neither. |
---|
166 | // When that is the case, AdjScale = 2^(M1) where M1 is |
---|
167 | // approximately M. Thus 6.2 will never cause over/underflow. |
---|
168 | // Possible exception in 6.4 is overflow or underflow. |
---|
169 | // The inexact exception is not generated in 6.4. Although |
---|
170 | // one can argue that the inexact flag should always be |
---|
171 | // raised, to simulate that exception cost to much than the |
---|
172 | // flag is worth in practical uses. |
---|
173 | // |
---|
174 | // Step 7. Return 1 + X. |
---|
175 | // 7.1 ans := X |
---|
176 | // 7.2 Restore user FPCR. |
---|
177 | // 7.3 Return ans := 1 + ans. Exit |
---|
178 | // Notes: For non-zero X, the inexact exception will always be |
---|
179 | // raised by 7.3. That is the only exception raised by 7.3. |
---|
180 | // Note also that we use the FMOVEM instruction to move X |
---|
181 | // in Step 7.1 to avoid unnecessary trapping. (Although |
---|
182 | // the FMOVEM may not seem relevant since X is normalized, |
---|
183 | // the precaution will be useful in the library version of |
---|
184 | // this code where the separate entry for denormalized inputs |
---|
185 | // will be done away with.) |
---|
186 | // |
---|
187 | // Step 8. Handle exp(X) where |X| >= 16380log2. |
---|
188 | // 8.1 If |X| > 16480 log2, go to Step 9. |
---|
189 | // (mimic 2.2 - 2.6) |
---|
190 | // 8.2 N := round-to-integer( X * 64/log2 ) |
---|
191 | // 8.3 Calculate J = N mod 64, J = 0,1,...,63 |
---|
192 | // 8.4 K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1. |
---|
193 | // 8.5 Calculate the address of the stored value 2^(J/64). |
---|
194 | // 8.6 Create the values Scale = 2^M, AdjScale = 2^M1. |
---|
195 | // 8.7 Go to Step 3. |
---|
196 | // Notes: Refer to notes for 2.2 - 2.6. |
---|
197 | // |
---|
198 | // Step 9. Handle exp(X), |X| > 16480 log2. |
---|
199 | // 9.1 If X < 0, go to 9.3 |
---|
200 | // 9.2 ans := Huge, go to 9.4 |
---|
201 | // 9.3 ans := Tiny. |
---|
202 | // 9.4 Restore user FPCR. |
---|
203 | // 9.5 Return ans := ans * ans. Exit. |
---|
204 | // Notes: Exp(X) will surely overflow or underflow, depending on |
---|
205 | // X's sign. "Huge" and "Tiny" are respectively large/tiny |
---|
206 | // extended-precision numbers whose square over/underflow |
---|
207 | // with an inexact result. Thus, 9.5 always raises the |
---|
208 | // inexact together with either overflow or underflow. |
---|
209 | // |
---|
210 | // |
---|
211 | // setoxm1d |
---|
212 | // -------- |
---|
213 | // |
---|
214 | // Step 1. Set ans := 0 |
---|
215 | // |
---|
216 | // Step 2. Return ans := X + ans. Exit. |
---|
217 | // Notes: This will return X with the appropriate rounding |
---|
218 | // precision prescribed by the user FPCR. |
---|
219 | // |
---|
220 | // setoxm1 |
---|
221 | // ------- |
---|
222 | // |
---|
223 | // Step 1. Check |X| |
---|
224 | // 1.1 If |X| >= 1/4, go to Step 1.3. |
---|
225 | // 1.2 Go to Step 7. |
---|
226 | // 1.3 If |X| < 70 log(2), go to Step 2. |
---|
227 | // 1.4 Go to Step 10. |
---|
228 | // Notes: The usual case should take the branches 1.1 -> 1.3 -> 2. |
---|
229 | // However, it is conceivable |X| can be small very often |
---|
230 | // because EXPM1 is intended to evaluate exp(X)-1 accurately |
---|
231 | // when |X| is small. For further details on the comparisons, |
---|
232 | // see the notes on Step 1 of setox. |
---|
233 | // |
---|
234 | // Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ). |
---|
235 | // 2.1 N := round-to-nearest-integer( X * 64/log2 ). |
---|
236 | // 2.2 Calculate J = N mod 64; so J = 0,1,2,..., or 63. |
---|
237 | // 2.3 Calculate M = (N - J)/64; so N = 64M + J. |
---|
238 | // 2.4 Calculate the address of the stored value of 2^(J/64). |
---|
239 | // 2.5 Create the values Sc = 2^M and OnebySc := -2^(-M). |
---|
240 | // Notes: See the notes on Step 2 of setox. |
---|
241 | // |
---|
242 | // Step 3. Calculate X - N*log2/64. |
---|
243 | // 3.1 R := X + N*L1, where L1 := single-precision(-log2/64). |
---|
244 | // 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1). |
---|
245 | // Notes: Applying the analysis of Step 3 of setox in this case |
---|
246 | // shows that |R| <= 0.0055 (note that |X| <= 70 log2 in |
---|
247 | // this case). |
---|
248 | // |
---|
249 | // Step 4. Approximate exp(R)-1 by a polynomial |
---|
250 | // p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6))))) |
---|
251 | // Notes: a) In order to reduce memory access, the coefficients are |
---|
252 | // made as "short" as possible: A1 (which is 1/2), A5 and A6 |
---|
253 | // are single precision; A2, A3 and A4 are double precision. |
---|
254 | // b) Even with the restriction above, |
---|
255 | // |p - (exp(R)-1)| < |R| * 2^(-72.7) |
---|
256 | // for all |R| <= 0.0055. |
---|
257 | // c) To fully utilize the pipeline, p is separated into |
---|
258 | // two independent pieces of roughly equal complexity |
---|
259 | // p = [ R*S*(A2 + S*(A4 + S*A6)) ] + |
---|
260 | // [ R + S*(A1 + S*(A3 + S*A5)) ] |
---|
261 | // where S = R*R. |
---|
262 | // |
---|
263 | // Step 5. Compute 2^(J/64)*p by |
---|
264 | // p := T*p |
---|
265 | // where T and t are the stored values for 2^(J/64). |
---|
266 | // Notes: 2^(J/64) is stored as T and t where T+t approximates |
---|
267 | // 2^(J/64) to roughly 85 bits; T is in extended precision |
---|
268 | // and t is in single precision. Note also that T is rounded |
---|
269 | // to 62 bits so that the last two bits of T are zero. The |
---|
270 | // reason for such a special form is that T-1, T-2, and T-8 |
---|
271 | // will all be exact --- a property that will be exploited |
---|
272 | // in Step 6 below. The total relative error in p is no |
---|
273 | // bigger than 2^(-67.7) compared to the final result. |
---|
274 | // |
---|
275 | // Step 6. Reconstruction of exp(X)-1 |
---|
276 | // exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ). |
---|
277 | // 6.1 If M <= 63, go to Step 6.3. |
---|
278 | // 6.2 ans := T + (p + (t + OnebySc)). Go to 6.6 |
---|
279 | // 6.3 If M >= -3, go to 6.5. |
---|
280 | // 6.4 ans := (T + (p + t)) + OnebySc. Go to 6.6 |
---|
281 | // 6.5 ans := (T + OnebySc) + (p + t). |
---|
282 | // 6.6 Restore user FPCR. |
---|
283 | // 6.7 Return ans := Sc * ans. Exit. |
---|
284 | // Notes: The various arrangements of the expressions give accurate |
---|
285 | // evaluations. |
---|
286 | // |
---|
287 | // Step 7. exp(X)-1 for |X| < 1/4. |
---|
288 | // 7.1 If |X| >= 2^(-65), go to Step 9. |
---|
289 | // 7.2 Go to Step 8. |
---|
290 | // |
---|
291 | // Step 8. Calculate exp(X)-1, |X| < 2^(-65). |
---|
292 | // 8.1 If |X| < 2^(-16312), goto 8.3 |
---|
293 | // 8.2 Restore FPCR; return ans := X - 2^(-16382). Exit. |
---|
294 | // 8.3 X := X * 2^(140). |
---|
295 | // 8.4 Restore FPCR; ans := ans - 2^(-16382). |
---|
296 | // Return ans := ans*2^(140). Exit |
---|
297 | // Notes: The idea is to return "X - tiny" under the user |
---|
298 | // precision and rounding modes. To avoid unnecessary |
---|
299 | // inefficiency, we stay away from denormalized numbers the |
---|
300 | // best we can. For |X| >= 2^(-16312), the straightforward |
---|
301 | // 8.2 generates the inexact exception as the case warrants. |
---|
302 | // |
---|
303 | // Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial |
---|
304 | // p = X + X*X*(B1 + X*(B2 + ... + X*B12)) |
---|
305 | // Notes: a) In order to reduce memory access, the coefficients are |
---|
306 | // made as "short" as possible: B1 (which is 1/2), B9 to B12 |
---|
307 | // are single precision; B3 to B8 are double precision; and |
---|
308 | // B2 is double extended. |
---|
309 | // b) Even with the restriction above, |
---|
310 | // |p - (exp(X)-1)| < |X| 2^(-70.6) |
---|
311 | // for all |X| <= 0.251. |
---|
312 | // Note that 0.251 is slightly bigger than 1/4. |
---|
313 | // c) To fully preserve accuracy, the polynomial is computed |
---|
314 | // as X + ( S*B1 + Q ) where S = X*X and |
---|
315 | // Q = X*S*(B2 + X*(B3 + ... + X*B12)) |
---|
316 | // d) To fully utilize the pipeline, Q is separated into |
---|
317 | // two independent pieces of roughly equal complexity |
---|
318 | // Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] + |
---|
319 | // [ S*S*(B3 + S*(B5 + ... + S*B11)) ] |
---|
320 | // |
---|
321 | // Step 10. Calculate exp(X)-1 for |X| >= 70 log 2. |
---|
322 | // 10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical |
---|
323 | // purposes. Therefore, go to Step 1 of setox. |
---|
324 | // 10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes. |
---|
325 | // ans := -1 |
---|
326 | // Restore user FPCR |
---|
327 | // Return ans := ans + 2^(-126). Exit. |
---|
328 | // Notes: 10.2 will always create an inexact and return -1 + tiny |
---|
329 | // in the user rounding precision and mode. |
---|
330 | // |
---|
331 | // |
---|
332 | |
---|
333 | // Copyright (C) Motorola, Inc. 1990 |
---|
334 | // All Rights Reserved |
---|
335 | // |
---|
336 | // THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA |
---|
337 | // The copyright notice above does not evidence any |
---|
338 | // actual or intended publication of such source code. |
---|
339 | |
---|
340 | //setox idnt 2,1 | Motorola 040 Floating Point Software Package |
---|
341 | |
---|
342 | |section 8 |
---|
343 | |
---|
344 | #include "fpsp.defs" |
---|
345 | |
---|
346 | L2: .long 0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000 |
---|
347 | |
---|
348 | EXPA3: .long 0x3FA55555,0x55554431 |
---|
349 | EXPA2: .long 0x3FC55555,0x55554018 |
---|
350 | |
---|
351 | HUGE: .long 0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000 |
---|
352 | TINY: .long 0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000 |
---|
353 | |
---|
354 | EM1A4: .long 0x3F811111,0x11174385 |
---|
355 | EM1A3: .long 0x3FA55555,0x55554F5A |
---|
356 | |
---|
357 | EM1A2: .long 0x3FC55555,0x55555555,0x00000000,0x00000000 |
---|
358 | |
---|
359 | EM1B8: .long 0x3EC71DE3,0xA5774682 |
---|
360 | EM1B7: .long 0x3EFA01A0,0x19D7CB68 |
---|
361 | |
---|
362 | EM1B6: .long 0x3F2A01A0,0x1A019DF3 |
---|
363 | EM1B5: .long 0x3F56C16C,0x16C170E2 |
---|
364 | |
---|
365 | EM1B4: .long 0x3F811111,0x11111111 |
---|
366 | EM1B3: .long 0x3FA55555,0x55555555 |
---|
367 | |
---|
368 | EM1B2: .long 0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB |
---|
369 | .long 0x00000000 |
---|
370 | |
---|
371 | TWO140: .long 0x48B00000,0x00000000 |
---|
372 | TWON140: .long 0x37300000,0x00000000 |
---|
373 | |
---|
374 | EXPTBL: |
---|
375 | .long 0x3FFF0000,0x80000000,0x00000000,0x00000000 |
---|
376 | .long 0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B |
---|
377 | .long 0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9 |
---|
378 | .long 0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369 |
---|
379 | .long 0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C |
---|
380 | .long 0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F |
---|
381 | .long 0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729 |
---|
382 | .long 0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF |
---|
383 | .long 0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF |
---|
384 | .long 0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA |
---|
385 | .long 0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051 |
---|
386 | .long 0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029 |
---|
387 | .long 0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494 |
---|
388 | .long 0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0 |
---|
389 | .long 0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D |
---|
390 | .long 0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537 |
---|
391 | .long 0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD |
---|
392 | .long 0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087 |
---|
393 | .long 0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818 |
---|
394 | .long 0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D |
---|
395 | .long 0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890 |
---|
396 | .long 0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C |
---|
397 | .long 0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05 |
---|
398 | .long 0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126 |
---|
399 | .long 0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140 |
---|
400 | .long 0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA |
---|
401 | .long 0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A |
---|
402 | .long 0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC |
---|
403 | .long 0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC |
---|
404 | .long 0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610 |
---|
405 | .long 0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90 |
---|
406 | .long 0x3FFF0000,0xB311C412,0xA9112488,0x201F678A |
---|
407 | .long 0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13 |
---|
408 | .long 0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30 |
---|
409 | .long 0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC |
---|
410 | .long 0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6 |
---|
411 | .long 0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70 |
---|
412 | .long 0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518 |
---|
413 | .long 0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41 |
---|
414 | .long 0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B |
---|
415 | .long 0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568 |
---|
416 | .long 0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E |
---|
417 | .long 0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03 |
---|
418 | .long 0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D |
---|
419 | .long 0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4 |
---|
420 | .long 0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C |
---|
421 | .long 0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9 |
---|
422 | .long 0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21 |
---|
423 | .long 0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F |
---|
424 | .long 0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F |
---|
425 | .long 0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207 |
---|
426 | .long 0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175 |
---|
427 | .long 0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B |
---|
428 | .long 0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5 |
---|
429 | .long 0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A |
---|
430 | .long 0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22 |
---|
431 | .long 0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945 |
---|
432 | .long 0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B |
---|
433 | .long 0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3 |
---|
434 | .long 0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05 |
---|
435 | .long 0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19 |
---|
436 | .long 0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5 |
---|
437 | .long 0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22 |
---|
438 | .long 0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A |
---|
439 | |
---|
440 | .set ADJFLAG,L_SCR2 |
---|
441 | .set SCALE,FP_SCR1 |
---|
442 | .set ADJSCALE,FP_SCR2 |
---|
443 | .set SC,FP_SCR3 |
---|
444 | .set ONEBYSC,FP_SCR4 |
---|
445 | |
---|
446 | | xref t_frcinx |
---|
447 | |xref t_extdnrm |
---|
448 | |xref t_unfl |
---|
449 | |xref t_ovfl |
---|
450 | |
---|
451 | .global setoxd |
---|
452 | setoxd: |
---|
453 | //--entry point for EXP(X), X is denormalized |
---|
454 | movel (%a0),%d0 |
---|
455 | andil #0x80000000,%d0 |
---|
456 | oril #0x00800000,%d0 // ...sign(X)*2^(-126) |
---|
457 | movel %d0,-(%sp) |
---|
458 | fmoves #0x3F800000,%fp0 |
---|
459 | fmovel %d1,%fpcr |
---|
460 | fadds (%sp)+,%fp0 |
---|
461 | bra t_frcinx |
---|
462 | |
---|
463 | .global setox |
---|
464 | setox: |
---|
465 | //--entry point for EXP(X), here X is finite, non-zero, and not NaN's |
---|
466 | |
---|
467 | //--Step 1. |
---|
468 | movel (%a0),%d0 // ...load part of input X |
---|
469 | andil #0x7FFF0000,%d0 // ...biased expo. of X |
---|
470 | cmpil #0x3FBE0000,%d0 // ...2^(-65) |
---|
471 | bges EXPC1 // ...normal case |
---|
472 | bra EXPSM |
---|
473 | |
---|
474 | EXPC1: |
---|
475 | //--The case |X| >= 2^(-65) |
---|
476 | movew 4(%a0),%d0 // ...expo. and partial sig. of |X| |
---|
477 | cmpil #0x400CB167,%d0 // ...16380 log2 trunc. 16 bits |
---|
478 | blts EXPMAIN // ...normal case |
---|
479 | bra EXPBIG |
---|
480 | |
---|
481 | EXPMAIN: |
---|
482 | //--Step 2. |
---|
483 | //--This is the normal branch: 2^(-65) <= |X| < 16380 log2. |
---|
484 | fmovex (%a0),%fp0 // ...load input from (a0) |
---|
485 | |
---|
486 | fmovex %fp0,%fp1 |
---|
487 | fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X |
---|
488 | fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2 |
---|
489 | movel #0,ADJFLAG(%a6) |
---|
490 | fmovel %fp0,%d0 // ...N = int( X * 64/log2 ) |
---|
491 | lea EXPTBL,%a1 |
---|
492 | fmovel %d0,%fp0 // ...convert to floating-format |
---|
493 | |
---|
494 | movel %d0,L_SCR1(%a6) // ...save N temporarily |
---|
495 | andil #0x3F,%d0 // ...D0 is J = N mod 64 |
---|
496 | lsll #4,%d0 |
---|
497 | addal %d0,%a1 // ...address of 2^(J/64) |
---|
498 | movel L_SCR1(%a6),%d0 |
---|
499 | asrl #6,%d0 // ...D0 is M |
---|
500 | addiw #0x3FFF,%d0 // ...biased expo. of 2^(M) |
---|
501 | movew L2,L_SCR1(%a6) // ...prefetch L2, no need in CB |
---|
502 | |
---|
503 | EXPCONT1: |
---|
504 | //--Step 3. |
---|
505 | //--fp1,fp2 saved on the stack. fp0 is N, fp1 is X, |
---|
506 | //--a0 points to 2^(J/64), D0 is biased expo. of 2^(M) |
---|
507 | fmovex %fp0,%fp2 |
---|
508 | fmuls #0xBC317218,%fp0 // ...N * L1, L1 = lead(-log2/64) |
---|
509 | fmulx L2,%fp2 // ...N * L2, L1+L2 = -log2/64 |
---|
510 | faddx %fp1,%fp0 // ...X + N*L1 |
---|
511 | faddx %fp2,%fp0 // ...fp0 is R, reduced arg. |
---|
512 | // MOVE.W #$3FA5,EXPA3 ...load EXPA3 in cache |
---|
513 | |
---|
514 | //--Step 4. |
---|
515 | //--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL |
---|
516 | //-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5)))) |
---|
517 | //--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R |
---|
518 | //--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))] |
---|
519 | |
---|
520 | fmovex %fp0,%fp1 |
---|
521 | fmulx %fp1,%fp1 // ...fp1 IS S = R*R |
---|
522 | |
---|
523 | fmoves #0x3AB60B70,%fp2 // ...fp2 IS A5 |
---|
524 | // MOVE.W #0,2(%a1) ...load 2^(J/64) in cache |
---|
525 | |
---|
526 | fmulx %fp1,%fp2 // ...fp2 IS S*A5 |
---|
527 | fmovex %fp1,%fp3 |
---|
528 | fmuls #0x3C088895,%fp3 // ...fp3 IS S*A4 |
---|
529 | |
---|
530 | faddd EXPA3,%fp2 // ...fp2 IS A3+S*A5 |
---|
531 | faddd EXPA2,%fp3 // ...fp3 IS A2+S*A4 |
---|
532 | |
---|
533 | fmulx %fp1,%fp2 // ...fp2 IS S*(A3+S*A5) |
---|
534 | movew %d0,SCALE(%a6) // ...SCALE is 2^(M) in extended |
---|
535 | clrw SCALE+2(%a6) |
---|
536 | movel #0x80000000,SCALE+4(%a6) |
---|
537 | clrl SCALE+8(%a6) |
---|
538 | |
---|
539 | fmulx %fp1,%fp3 // ...fp3 IS S*(A2+S*A4) |
---|
540 | |
---|
541 | fadds #0x3F000000,%fp2 // ...fp2 IS A1+S*(A3+S*A5) |
---|
542 | fmulx %fp0,%fp3 // ...fp3 IS R*S*(A2+S*A4) |
---|
543 | |
---|
544 | fmulx %fp1,%fp2 // ...fp2 IS S*(A1+S*(A3+S*A5)) |
---|
545 | faddx %fp3,%fp0 // ...fp0 IS R+R*S*(A2+S*A4), |
---|
546 | // ...fp3 released |
---|
547 | |
---|
548 | fmovex (%a1)+,%fp1 // ...fp1 is lead. pt. of 2^(J/64) |
---|
549 | faddx %fp2,%fp0 // ...fp0 is EXP(R) - 1 |
---|
550 | // ...fp2 released |
---|
551 | |
---|
552 | //--Step 5 |
---|
553 | //--final reconstruction process |
---|
554 | //--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) ) |
---|
555 | |
---|
556 | fmulx %fp1,%fp0 // ...2^(J/64)*(Exp(R)-1) |
---|
557 | fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored |
---|
558 | fadds (%a1),%fp0 // ...accurate 2^(J/64) |
---|
559 | |
---|
560 | faddx %fp1,%fp0 // ...2^(J/64) + 2^(J/64)*... |
---|
561 | movel ADJFLAG(%a6),%d0 |
---|
562 | |
---|
563 | //--Step 6 |
---|
564 | tstl %d0 |
---|
565 | beqs NORMAL |
---|
566 | ADJUST: |
---|
567 | fmulx ADJSCALE(%a6),%fp0 |
---|
568 | NORMAL: |
---|
569 | fmovel %d1,%FPCR // ...restore user FPCR |
---|
570 | fmulx SCALE(%a6),%fp0 // ...multiply 2^(M) |
---|
571 | bra t_frcinx |
---|
572 | |
---|
573 | EXPSM: |
---|
574 | //--Step 7 |
---|
575 | fmovemx (%a0),%fp0-%fp0 // ...in case X is denormalized |
---|
576 | fmovel %d1,%FPCR |
---|
577 | fadds #0x3F800000,%fp0 // ...1+X in user mode |
---|
578 | bra t_frcinx |
---|
579 | |
---|
580 | EXPBIG: |
---|
581 | //--Step 8 |
---|
582 | cmpil #0x400CB27C,%d0 // ...16480 log2 |
---|
583 | bgts EXP2BIG |
---|
584 | //--Steps 8.2 -- 8.6 |
---|
585 | fmovex (%a0),%fp0 // ...load input from (a0) |
---|
586 | |
---|
587 | fmovex %fp0,%fp1 |
---|
588 | fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X |
---|
589 | fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2 |
---|
590 | movel #1,ADJFLAG(%a6) |
---|
591 | fmovel %fp0,%d0 // ...N = int( X * 64/log2 ) |
---|
592 | lea EXPTBL,%a1 |
---|
593 | fmovel %d0,%fp0 // ...convert to floating-format |
---|
594 | movel %d0,L_SCR1(%a6) // ...save N temporarily |
---|
595 | andil #0x3F,%d0 // ...D0 is J = N mod 64 |
---|
596 | lsll #4,%d0 |
---|
597 | addal %d0,%a1 // ...address of 2^(J/64) |
---|
598 | movel L_SCR1(%a6),%d0 |
---|
599 | asrl #6,%d0 // ...D0 is K |
---|
600 | movel %d0,L_SCR1(%a6) // ...save K temporarily |
---|
601 | asrl #1,%d0 // ...D0 is M1 |
---|
602 | subl %d0,L_SCR1(%a6) // ...a1 is M |
---|
603 | addiw #0x3FFF,%d0 // ...biased expo. of 2^(M1) |
---|
604 | movew %d0,ADJSCALE(%a6) // ...ADJSCALE := 2^(M1) |
---|
605 | clrw ADJSCALE+2(%a6) |
---|
606 | movel #0x80000000,ADJSCALE+4(%a6) |
---|
607 | clrl ADJSCALE+8(%a6) |
---|
608 | movel L_SCR1(%a6),%d0 // ...D0 is M |
---|
609 | addiw #0x3FFF,%d0 // ...biased expo. of 2^(M) |
---|
610 | bra EXPCONT1 // ...go back to Step 3 |
---|
611 | |
---|
612 | EXP2BIG: |
---|
613 | //--Step 9 |
---|
614 | fmovel %d1,%FPCR |
---|
615 | movel (%a0),%d0 |
---|
616 | bclrb #sign_bit,(%a0) // ...setox always returns positive |
---|
617 | cmpil #0,%d0 |
---|
618 | blt t_unfl |
---|
619 | bra t_ovfl |
---|
620 | |
---|
621 | .global setoxm1d |
---|
622 | setoxm1d: |
---|
623 | //--entry point for EXPM1(X), here X is denormalized |
---|
624 | //--Step 0. |
---|
625 | bra t_extdnrm |
---|
626 | |
---|
627 | |
---|
628 | .global setoxm1 |
---|
629 | setoxm1: |
---|
630 | //--entry point for EXPM1(X), here X is finite, non-zero, non-NaN |
---|
631 | |
---|
632 | //--Step 1. |
---|
633 | //--Step 1.1 |
---|
634 | movel (%a0),%d0 // ...load part of input X |
---|
635 | andil #0x7FFF0000,%d0 // ...biased expo. of X |
---|
636 | cmpil #0x3FFD0000,%d0 // ...1/4 |
---|
637 | bges EM1CON1 // ...|X| >= 1/4 |
---|
638 | bra EM1SM |
---|
639 | |
---|
640 | EM1CON1: |
---|
641 | //--Step 1.3 |
---|
642 | //--The case |X| >= 1/4 |
---|
643 | movew 4(%a0),%d0 // ...expo. and partial sig. of |X| |
---|
644 | cmpil #0x4004C215,%d0 // ...70log2 rounded up to 16 bits |
---|
645 | bles EM1MAIN // ...1/4 <= |X| <= 70log2 |
---|
646 | bra EM1BIG |
---|
647 | |
---|
648 | EM1MAIN: |
---|
649 | //--Step 2. |
---|
650 | //--This is the case: 1/4 <= |X| <= 70 log2. |
---|
651 | fmovex (%a0),%fp0 // ...load input from (a0) |
---|
652 | |
---|
653 | fmovex %fp0,%fp1 |
---|
654 | fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X |
---|
655 | fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2 |
---|
656 | // MOVE.W #$3F81,EM1A4 ...prefetch in CB mode |
---|
657 | fmovel %fp0,%d0 // ...N = int( X * 64/log2 ) |
---|
658 | lea EXPTBL,%a1 |
---|
659 | fmovel %d0,%fp0 // ...convert to floating-format |
---|
660 | |
---|
661 | movel %d0,L_SCR1(%a6) // ...save N temporarily |
---|
662 | andil #0x3F,%d0 // ...D0 is J = N mod 64 |
---|
663 | lsll #4,%d0 |
---|
664 | addal %d0,%a1 // ...address of 2^(J/64) |
---|
665 | movel L_SCR1(%a6),%d0 |
---|
666 | asrl #6,%d0 // ...D0 is M |
---|
667 | movel %d0,L_SCR1(%a6) // ...save a copy of M |
---|
668 | // MOVE.W #$3FDC,L2 ...prefetch L2 in CB mode |
---|
669 | |
---|
670 | //--Step 3. |
---|
671 | //--fp1,fp2 saved on the stack. fp0 is N, fp1 is X, |
---|
672 | //--a0 points to 2^(J/64), D0 and a1 both contain M |
---|
673 | fmovex %fp0,%fp2 |
---|
674 | fmuls #0xBC317218,%fp0 // ...N * L1, L1 = lead(-log2/64) |
---|
675 | fmulx L2,%fp2 // ...N * L2, L1+L2 = -log2/64 |
---|
676 | faddx %fp1,%fp0 // ...X + N*L1 |
---|
677 | faddx %fp2,%fp0 // ...fp0 is R, reduced arg. |
---|
678 | // MOVE.W #$3FC5,EM1A2 ...load EM1A2 in cache |
---|
679 | addiw #0x3FFF,%d0 // ...D0 is biased expo. of 2^M |
---|
680 | |
---|
681 | //--Step 4. |
---|
682 | //--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL |
---|
683 | //-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6))))) |
---|
684 | //--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R |
---|
685 | //--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))] |
---|
686 | |
---|
687 | fmovex %fp0,%fp1 |
---|
688 | fmulx %fp1,%fp1 // ...fp1 IS S = R*R |
---|
689 | |
---|
690 | fmoves #0x3950097B,%fp2 // ...fp2 IS a6 |
---|
691 | // MOVE.W #0,2(%a1) ...load 2^(J/64) in cache |
---|
692 | |
---|
693 | fmulx %fp1,%fp2 // ...fp2 IS S*A6 |
---|
694 | fmovex %fp1,%fp3 |
---|
695 | fmuls #0x3AB60B6A,%fp3 // ...fp3 IS S*A5 |
---|
696 | |
---|
697 | faddd EM1A4,%fp2 // ...fp2 IS A4+S*A6 |
---|
698 | faddd EM1A3,%fp3 // ...fp3 IS A3+S*A5 |
---|
699 | movew %d0,SC(%a6) // ...SC is 2^(M) in extended |
---|
700 | clrw SC+2(%a6) |
---|
701 | movel #0x80000000,SC+4(%a6) |
---|
702 | clrl SC+8(%a6) |
---|
703 | |
---|
704 | fmulx %fp1,%fp2 // ...fp2 IS S*(A4+S*A6) |
---|
705 | movel L_SCR1(%a6),%d0 // ...D0 is M |
---|
706 | negw %d0 // ...D0 is -M |
---|
707 | fmulx %fp1,%fp3 // ...fp3 IS S*(A3+S*A5) |
---|
708 | addiw #0x3FFF,%d0 // ...biased expo. of 2^(-M) |
---|
709 | faddd EM1A2,%fp2 // ...fp2 IS A2+S*(A4+S*A6) |
---|
710 | fadds #0x3F000000,%fp3 // ...fp3 IS A1+S*(A3+S*A5) |
---|
711 | |
---|
712 | fmulx %fp1,%fp2 // ...fp2 IS S*(A2+S*(A4+S*A6)) |
---|
713 | oriw #0x8000,%d0 // ...signed/expo. of -2^(-M) |
---|
714 | movew %d0,ONEBYSC(%a6) // ...OnebySc is -2^(-M) |
---|
715 | clrw ONEBYSC+2(%a6) |
---|
716 | movel #0x80000000,ONEBYSC+4(%a6) |
---|
717 | clrl ONEBYSC+8(%a6) |
---|
718 | fmulx %fp3,%fp1 // ...fp1 IS S*(A1+S*(A3+S*A5)) |
---|
719 | // ...fp3 released |
---|
720 | |
---|
721 | fmulx %fp0,%fp2 // ...fp2 IS R*S*(A2+S*(A4+S*A6)) |
---|
722 | faddx %fp1,%fp0 // ...fp0 IS R+S*(A1+S*(A3+S*A5)) |
---|
723 | // ...fp1 released |
---|
724 | |
---|
725 | faddx %fp2,%fp0 // ...fp0 IS EXP(R)-1 |
---|
726 | // ...fp2 released |
---|
727 | fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored |
---|
728 | |
---|
729 | //--Step 5 |
---|
730 | //--Compute 2^(J/64)*p |
---|
731 | |
---|
732 | fmulx (%a1),%fp0 // ...2^(J/64)*(Exp(R)-1) |
---|
733 | |
---|
734 | //--Step 6 |
---|
735 | //--Step 6.1 |
---|
736 | movel L_SCR1(%a6),%d0 // ...retrieve M |
---|
737 | cmpil #63,%d0 |
---|
738 | bles MLE63 |
---|
739 | //--Step 6.2 M >= 64 |
---|
740 | fmoves 12(%a1),%fp1 // ...fp1 is t |
---|
741 | faddx ONEBYSC(%a6),%fp1 // ...fp1 is t+OnebySc |
---|
742 | faddx %fp1,%fp0 // ...p+(t+OnebySc), fp1 released |
---|
743 | faddx (%a1),%fp0 // ...T+(p+(t+OnebySc)) |
---|
744 | bras EM1SCALE |
---|
745 | MLE63: |
---|
746 | //--Step 6.3 M <= 63 |
---|
747 | cmpil #-3,%d0 |
---|
748 | bges MGEN3 |
---|
749 | MLTN3: |
---|
750 | //--Step 6.4 M <= -4 |
---|
751 | fadds 12(%a1),%fp0 // ...p+t |
---|
752 | faddx (%a1),%fp0 // ...T+(p+t) |
---|
753 | faddx ONEBYSC(%a6),%fp0 // ...OnebySc + (T+(p+t)) |
---|
754 | bras EM1SCALE |
---|
755 | MGEN3: |
---|
756 | //--Step 6.5 -3 <= M <= 63 |
---|
757 | fmovex (%a1)+,%fp1 // ...fp1 is T |
---|
758 | fadds (%a1),%fp0 // ...fp0 is p+t |
---|
759 | faddx ONEBYSC(%a6),%fp1 // ...fp1 is T+OnebySc |
---|
760 | faddx %fp1,%fp0 // ...(T+OnebySc)+(p+t) |
---|
761 | |
---|
762 | EM1SCALE: |
---|
763 | //--Step 6.6 |
---|
764 | fmovel %d1,%FPCR |
---|
765 | fmulx SC(%a6),%fp0 |
---|
766 | |
---|
767 | bra t_frcinx |
---|
768 | |
---|
769 | EM1SM: |
---|
770 | //--Step 7 |X| < 1/4. |
---|
771 | cmpil #0x3FBE0000,%d0 // ...2^(-65) |
---|
772 | bges EM1POLY |
---|
773 | |
---|
774 | EM1TINY: |
---|
775 | //--Step 8 |X| < 2^(-65) |
---|
776 | cmpil #0x00330000,%d0 // ...2^(-16312) |
---|
777 | blts EM12TINY |
---|
778 | //--Step 8.2 |
---|
779 | movel #0x80010000,SC(%a6) // ...SC is -2^(-16382) |
---|
780 | movel #0x80000000,SC+4(%a6) |
---|
781 | clrl SC+8(%a6) |
---|
782 | fmovex (%a0),%fp0 |
---|
783 | fmovel %d1,%FPCR |
---|
784 | faddx SC(%a6),%fp0 |
---|
785 | |
---|
786 | bra t_frcinx |
---|
787 | |
---|
788 | EM12TINY: |
---|
789 | //--Step 8.3 |
---|
790 | fmovex (%a0),%fp0 |
---|
791 | fmuld TWO140,%fp0 |
---|
792 | movel #0x80010000,SC(%a6) |
---|
793 | movel #0x80000000,SC+4(%a6) |
---|
794 | clrl SC+8(%a6) |
---|
795 | faddx SC(%a6),%fp0 |
---|
796 | fmovel %d1,%FPCR |
---|
797 | fmuld TWON140,%fp0 |
---|
798 | |
---|
799 | bra t_frcinx |
---|
800 | |
---|
801 | EM1POLY: |
---|
802 | //--Step 9 exp(X)-1 by a simple polynomial |
---|
803 | fmovex (%a0),%fp0 // ...fp0 is X |
---|
804 | fmulx %fp0,%fp0 // ...fp0 is S := X*X |
---|
805 | fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2 |
---|
806 | fmoves #0x2F30CAA8,%fp1 // ...fp1 is B12 |
---|
807 | fmulx %fp0,%fp1 // ...fp1 is S*B12 |
---|
808 | fmoves #0x310F8290,%fp2 // ...fp2 is B11 |
---|
809 | fadds #0x32D73220,%fp1 // ...fp1 is B10+S*B12 |
---|
810 | |
---|
811 | fmulx %fp0,%fp2 // ...fp2 is S*B11 |
---|
812 | fmulx %fp0,%fp1 // ...fp1 is S*(B10 + ... |
---|
813 | |
---|
814 | fadds #0x3493F281,%fp2 // ...fp2 is B9+S*... |
---|
815 | faddd EM1B8,%fp1 // ...fp1 is B8+S*... |
---|
816 | |
---|
817 | fmulx %fp0,%fp2 // ...fp2 is S*(B9+... |
---|
818 | fmulx %fp0,%fp1 // ...fp1 is S*(B8+... |
---|
819 | |
---|
820 | faddd EM1B7,%fp2 // ...fp2 is B7+S*... |
---|
821 | faddd EM1B6,%fp1 // ...fp1 is B6+S*... |
---|
822 | |
---|
823 | fmulx %fp0,%fp2 // ...fp2 is S*(B7+... |
---|
824 | fmulx %fp0,%fp1 // ...fp1 is S*(B6+... |
---|
825 | |
---|
826 | faddd EM1B5,%fp2 // ...fp2 is B5+S*... |
---|
827 | faddd EM1B4,%fp1 // ...fp1 is B4+S*... |
---|
828 | |
---|
829 | fmulx %fp0,%fp2 // ...fp2 is S*(B5+... |
---|
830 | fmulx %fp0,%fp1 // ...fp1 is S*(B4+... |
---|
831 | |
---|
832 | faddd EM1B3,%fp2 // ...fp2 is B3+S*... |
---|
833 | faddx EM1B2,%fp1 // ...fp1 is B2+S*... |
---|
834 | |
---|
835 | fmulx %fp0,%fp2 // ...fp2 is S*(B3+... |
---|
836 | fmulx %fp0,%fp1 // ...fp1 is S*(B2+... |
---|
837 | |
---|
838 | fmulx %fp0,%fp2 // ...fp2 is S*S*(B3+...) |
---|
839 | fmulx (%a0),%fp1 // ...fp1 is X*S*(B2... |
---|
840 | |
---|
841 | fmuls #0x3F000000,%fp0 // ...fp0 is S*B1 |
---|
842 | faddx %fp2,%fp1 // ...fp1 is Q |
---|
843 | // ...fp2 released |
---|
844 | |
---|
845 | fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored |
---|
846 | |
---|
847 | faddx %fp1,%fp0 // ...fp0 is S*B1+Q |
---|
848 | // ...fp1 released |
---|
849 | |
---|
850 | fmovel %d1,%FPCR |
---|
851 | faddx (%a0),%fp0 |
---|
852 | |
---|
853 | bra t_frcinx |
---|
854 | |
---|
855 | EM1BIG: |
---|
856 | //--Step 10 |X| > 70 log2 |
---|
857 | movel (%a0),%d0 |
---|
858 | cmpil #0,%d0 |
---|
859 | bgt EXPC1 |
---|
860 | //--Step 10.2 |
---|
861 | fmoves #0xBF800000,%fp0 // ...fp0 is -1 |
---|
862 | fmovel %d1,%FPCR |
---|
863 | fadds #0x00800000,%fp0 // ...-1 + 2^(-126) |
---|
864 | |
---|
865 | bra t_frcinx |
---|
866 | |
---|
867 | |end |
---|