Context Navigation

setox.S @ b82a4b4

Last change on this file since b82a4b4 was 3cf2bf63, checked in by Sebastian Huber <sebastian.huber@…>, on 03/26/18 at 10:17:06

bsps/m68k: Move fpsp support to bsps

This patch is a part of the BSP source reorganization.

Update #3285.

Property mode set to 100644

File size: 28.4 KB

Line
1	#include "fpsp-namespace.h"
2	//
3	//
4	// setox.sa 3.1 12/10/90
5	//
6	// The entry point setox computes the exponential of a value.
7	// setoxd does the same except the input value is a denormalized
8	// number. setoxm1 computes exp(X)-1, and setoxm1d computes
9	// exp(X)-1 for denormalized X.
10	//
11	// INPUT
12	// -----
13	// Double-extended value in memory location pointed to by address
14	// register a0.
15	//
16	// OUTPUT
17	// ------
18	// exp(X) or exp(X)-1 returned in floating-point register fp0.
19	//
20	// ACCURACY and MONOTONICITY
21	// -------------------------
22	// The returned result is within 0.85 ulps in 64 significant bit, i.e.
23	// within 0.5001 ulp to 53 bits if the result is subsequently rounded
24	// to double precision. The result is provably monotonic in double
25	// precision.
26	//
27	// SPEED
28	// -----
29	// Two timings are measured, both in the copy-back mode. The
30	// first one is measured when the function is invoked the first time
31	// (so the instructions and data are not in cache), and the
32	// second one is measured when the function is reinvoked at the same
33	// input argument.
34	//
35	// The program setox takes approximately 210/190 cycles for input
36	// argument X whose magnitude is less than 16380 log2, which
37	// is the usual situation. For the less common arguments,
38	// depending on their values, the program may run faster or slower --
39	// but no worse than 10% slower even in the extreme cases.
40	//
41	// The program setoxm1 takes approximately ???/??? cycles for input
42	// argument X, 0.25 <= \|X\| < 70log2. For \|X\| < 0.25, it takes
43	// approximately ???/??? cycles. For the less common arguments,
44	// depending on their values, the program may run faster or slower --
45	// but no worse than 10% slower even in the extreme cases.
46	//
47	// ALGORITHM and IMPLEMENTATION NOTES
48	// ----------------------------------
49	//
50	// setoxd
51	// ------
52	// Step 1. Set ans := 1.0
53	//
54	// Step 2. Return ans := ans + sign(X)*2^(-126). Exit.
55	// Notes: This will always generate one exception -- inexact.
56	//
57	//
58	// setox
59	// -----
60	//
61	// Step 1. Filter out extreme cases of input argument.
62	// 1.1 If \|X\| >= 2^(-65), go to Step 1.3.
63	// 1.2 Go to Step 7.
64	// 1.3 If \|X\| < 16380 log(2), go to Step 2.
65	// 1.4 Go to Step 8.
66	// Notes: The usual case should take the branches 1.1 -> 1.3 -> 2.
67	// To avoid the use of floating-point comparisons, a
68	// compact representation of \|X\| is used. This format is a
69	// 32-bit integer, the upper (more significant) 16 bits are
70	// the sign and biased exponent field of \|X\|; the lower 16
71	// bits are the 16 most significant fraction (including the
72	// explicit bit) bits of \|X\|. Consequently, the comparisons
73	// in Steps 1.1 and 1.3 can be performed by integer comparison.
74	// Note also that the constant 16380 log(2) used in Step 1.3
75	// is also in the compact form. Thus taking the branch
76	// to Step 2 guarantees \|X\| < 16380 log(2). There is no harm
77	// to have a small number of cases where \|X\| is less than,
78	// but close to, 16380 log(2) and the branch to Step 9 is
79	// taken.
80	//
81	// Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
82	// 2.1 Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
83	// 2.2 N := round-to-nearest-integer( X * 64/log2 ).
84	// 2.3 Calculate J = N mod 64; so J = 0,1,2,..., or 63.
85	// 2.4 Calculate M = (N - J)/64; so N = 64M + J.
86	// 2.5 Calculate the address of the stored value of 2^(J/64).
87	// 2.6 Create the value Scale = 2^M.
88	// Notes: The calculation in 2.2 is really performed by
89	//
90	// Z := X * constant
91	// N := round-to-nearest-integer(Z)
92	//
93	// where
94	//
95	// constant := single-precision( 64/log 2 ).
96	//
97	// Using a single-precision constant avoids memory access.
98	// Another effect of using a single-precision "constant" is
99	// that the calculated value Z is
100	//
101	// Z = X(64/log2)(1+eps), \|eps\| <= 2^(-24).
102	//
103	// This error has to be considered later in Steps 3 and 4.
104	//
105	// Step 3. Calculate X - N*log2/64.
106	// 3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
107	// 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
108	// Notes: a) The way L1 and L2 are chosen ensures L1+L2 approximate
109	// the value -log2/64 to 88 bits of accuracy.
110	// b) N*L1 is exact because N is no longer than 22 bits and
111	// L1 is no longer than 24 bits.
112	// c) The calculation X+N*L1 is also exact due to cancellation.
113	// Thus, R is practically X+N(L1+L2) to full 64 bits.
114	// d) It is important to estimate how large can \|R\| be after
115	// Step 3.2.
116	//
117	// N = rnd-to-int( X*64/log2 (1+eps) ), \|eps\|<=2^(-24)
118	// X*64/log2 (1+eps) = N + f, \|f\| <= 0.5
119	// X64/log2 - N = f - epsX 64/log2
120	// X - Nlog2/64 = flog2/64 - eps*X
121	//
122	//
123	// Now \|X\| <= 16446 log2, thus
124	//
125	// \|X - Nlog2/64\| <= (0.5 + 16446/2^(18))log2/64
126	// <= 0.57 log2/64.
127	// This bound will be used in Step 4.
128	//
129	// Step 4. Approximate exp(R)-1 by a polynomial
130	// p = R + RR(A1 + R(A2 + R(A3 + R(A4 + RA5))))
131	// Notes: a) In order to reduce memory access, the coefficients are
132	// made as "short" as possible: A1 (which is 1/2), A4 and A5
133	// are single precision; A2 and A3 are double precision.
134	// b) Even with the restrictions above,
135	// \|p - (exp(R)-1)\| < 2^(-68.8) for all \|R\| <= 0.0062.
136	// Note that 0.0062 is slightly bigger than 0.57 log2/64.
137	// c) To fully utilize the pipeline, p is separated into
138	// two independent pieces of roughly equal complexities
139	// p = [ R + RS(A2 + S*A4) ] +
140	// [ S(A1 + S(A3 + S*A5)) ]
141	// where S = R*R.
142	//
143	// Step 5. Compute 2^(J/64)exp(R) = 2^(J/64)(1+p) by
144	// ans := T + ( T*p + t)
145	// where T and t are the stored values for 2^(J/64).
146	// Notes: 2^(J/64) is stored as T and t where T+t approximates
147	// 2^(J/64) to roughly 85 bits; T is in extended precision
148	// and t is in single precision. Note also that T is rounded
149	// to 62 bits so that the last two bits of T are zero. The
150	// reason for such a special form is that T-1, T-2, and T-8
151	// will all be exact --- a property that will give much
152	// more accurate computation of the function EXPM1.
153	//
154	// Step 6. Reconstruction of exp(X)
155	// exp(X) = 2^M * 2^(J/64) * exp(R).
156	// 6.1 If AdjFlag = 0, go to 6.3
157	// 6.2 ans := ans * AdjScale
158	// 6.3 Restore the user FPCR
159	// 6.4 Return ans := ans * Scale. Exit.
160	// Notes: If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
161	// \|M\| <= 16380, and Scale = 2^M. Moreover, exp(X) will
162	// neither overflow nor underflow. If AdjFlag = 1, that
163	// means that
164	// X = (M1+M)log2 + Jlog2/64 + R, \|M1+M\| >= 16380.
165	// Hence, exp(X) may overflow or underflow or neither.
166	// When that is the case, AdjScale = 2^(M1) where M1 is
167	// approximately M. Thus 6.2 will never cause over/underflow.
168	// Possible exception in 6.4 is overflow or underflow.
169	// The inexact exception is not generated in 6.4. Although
170	// one can argue that the inexact flag should always be
171	// raised, to simulate that exception cost to much than the
172	// flag is worth in practical uses.
173	//
174	// Step 7. Return 1 + X.
175	// 7.1 ans := X
176	// 7.2 Restore user FPCR.
177	// 7.3 Return ans := 1 + ans. Exit
178	// Notes: For non-zero X, the inexact exception will always be
179	// raised by 7.3. That is the only exception raised by 7.3.
180	// Note also that we use the FMOVEM instruction to move X
181	// in Step 7.1 to avoid unnecessary trapping. (Although
182	// the FMOVEM may not seem relevant since X is normalized,
183	// the precaution will be useful in the library version of
184	// this code where the separate entry for denormalized inputs
185	// will be done away with.)
186	//
187	// Step 8. Handle exp(X) where \|X\| >= 16380log2.
188	// 8.1 If \|X\| > 16480 log2, go to Step 9.
189	// (mimic 2.2 - 2.6)
190	// 8.2 N := round-to-integer( X * 64/log2 )
191	// 8.3 Calculate J = N mod 64, J = 0,1,...,63
192	// 8.4 K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
193	// 8.5 Calculate the address of the stored value 2^(J/64).
194	// 8.6 Create the values Scale = 2^M, AdjScale = 2^M1.
195	// 8.7 Go to Step 3.
196	// Notes: Refer to notes for 2.2 - 2.6.
197	//
198	// Step 9. Handle exp(X), \|X\| > 16480 log2.
199	// 9.1 If X < 0, go to 9.3
200	// 9.2 ans := Huge, go to 9.4
201	// 9.3 ans := Tiny.
202	// 9.4 Restore user FPCR.
203	// 9.5 Return ans := ans * ans. Exit.
204	// Notes: Exp(X) will surely overflow or underflow, depending on
205	// X's sign. "Huge" and "Tiny" are respectively large/tiny
206	// extended-precision numbers whose square over/underflow
207	// with an inexact result. Thus, 9.5 always raises the
208	// inexact together with either overflow or underflow.
209	//
210	//
211	// setoxm1d
212	// --------
213	//
214	// Step 1. Set ans := 0
215	//
216	// Step 2. Return ans := X + ans. Exit.
217	// Notes: This will return X with the appropriate rounding
218	// precision prescribed by the user FPCR.
219	//
220	// setoxm1
221	// -------
222	//
223	// Step 1. Check \|X\|
224	// 1.1 If \|X\| >= 1/4, go to Step 1.3.
225	// 1.2 Go to Step 7.
226	// 1.3 If \|X\| < 70 log(2), go to Step 2.
227	// 1.4 Go to Step 10.
228	// Notes: The usual case should take the branches 1.1 -> 1.3 -> 2.
229	// However, it is conceivable \|X\| can be small very often
230	// because EXPM1 is intended to evaluate exp(X)-1 accurately
231	// when \|X\| is small. For further details on the comparisons,
232	// see the notes on Step 1 of setox.
233	//
234	// Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
235	// 2.1 N := round-to-nearest-integer( X * 64/log2 ).
236	// 2.2 Calculate J = N mod 64; so J = 0,1,2,..., or 63.
237	// 2.3 Calculate M = (N - J)/64; so N = 64M + J.
238	// 2.4 Calculate the address of the stored value of 2^(J/64).
239	// 2.5 Create the values Sc = 2^M and OnebySc := -2^(-M).
240	// Notes: See the notes on Step 2 of setox.
241	//
242	// Step 3. Calculate X - N*log2/64.
243	// 3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
244	// 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
245	// Notes: Applying the analysis of Step 3 of setox in this case
246	// shows that \|R\| <= 0.0055 (note that \|X\| <= 70 log2 in
247	// this case).
248	//
249	// Step 4. Approximate exp(R)-1 by a polynomial
250	// p = R+RR(A1+R(A2+R(A3+R(A4+R(A5+R*A6)))))
251	// Notes: a) In order to reduce memory access, the coefficients are
252	// made as "short" as possible: A1 (which is 1/2), A5 and A6
253	// are single precision; A2, A3 and A4 are double precision.
254	// b) Even with the restriction above,
255	// \|p - (exp(R)-1)\| < \|R\| * 2^(-72.7)
256	// for all \|R\| <= 0.0055.
257	// c) To fully utilize the pipeline, p is separated into
258	// two independent pieces of roughly equal complexity
259	// p = [ RS(A2 + S(A4 + SA6)) ] +
260	// [ R + S(A1 + S(A3 + S*A5)) ]
261	// where S = R*R.
262	//
263	// Step 5. Compute 2^(J/64)*p by
264	// p := T*p
265	// where T and t are the stored values for 2^(J/64).
266	// Notes: 2^(J/64) is stored as T and t where T+t approximates
267	// 2^(J/64) to roughly 85 bits; T is in extended precision
268	// and t is in single precision. Note also that T is rounded
269	// to 62 bits so that the last two bits of T are zero. The
270	// reason for such a special form is that T-1, T-2, and T-8
271	// will all be exact --- a property that will be exploited
272	// in Step 6 below. The total relative error in p is no
273	// bigger than 2^(-67.7) compared to the final result.
274	//
275	// Step 6. Reconstruction of exp(X)-1
276	// exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
277	// 6.1 If M <= 63, go to Step 6.3.
278	// 6.2 ans := T + (p + (t + OnebySc)). Go to 6.6
279	// 6.3 If M >= -3, go to 6.5.
280	// 6.4 ans := (T + (p + t)) + OnebySc. Go to 6.6
281	// 6.5 ans := (T + OnebySc) + (p + t).
282	// 6.6 Restore user FPCR.
283	// 6.7 Return ans := Sc * ans. Exit.
284	// Notes: The various arrangements of the expressions give accurate
285	// evaluations.
286	//
287	// Step 7. exp(X)-1 for \|X\| < 1/4.
288	// 7.1 If \|X\| >= 2^(-65), go to Step 9.
289	// 7.2 Go to Step 8.
290	//
291	// Step 8. Calculate exp(X)-1, \|X\| < 2^(-65).
292	// 8.1 If \|X\| < 2^(-16312), goto 8.3
293	// 8.2 Restore FPCR; return ans := X - 2^(-16382). Exit.
294	// 8.3 X := X * 2^(140).
295	// 8.4 Restore FPCR; ans := ans - 2^(-16382).
296	// Return ans := ans*2^(140). Exit
297	// Notes: The idea is to return "X - tiny" under the user
298	// precision and rounding modes. To avoid unnecessary
299	// inefficiency, we stay away from denormalized numbers the
300	// best we can. For \|X\| >= 2^(-16312), the straightforward
301	// 8.2 generates the inexact exception as the case warrants.
302	//
303	// Step 9. Calculate exp(X)-1, \|X\| < 1/4, by a polynomial
304	// p = X + XX(B1 + X(B2 + ... + XB12))
305	// Notes: a) In order to reduce memory access, the coefficients are
306	// made as "short" as possible: B1 (which is 1/2), B9 to B12
307	// are single precision; B3 to B8 are double precision; and
308	// B2 is double extended.
309	// b) Even with the restriction above,
310	// \|p - (exp(X)-1)\| < \|X\| 2^(-70.6)
311	// for all \|X\| <= 0.251.
312	// Note that 0.251 is slightly bigger than 1/4.
313	// c) To fully preserve accuracy, the polynomial is computed
314	// as X + ( SB1 + Q ) where S = XX and
315	// Q = XS(B2 + X(B3 + ... + XB12))
316	// d) To fully utilize the pipeline, Q is separated into
317	// two independent pieces of roughly equal complexity
318	// Q = [ XS(B2 + S(B4 + ... + SB12)) ] +
319	// [ SS(B3 + S(B5 + ... + SB11)) ]
320	//
321	// Step 10. Calculate exp(X)-1 for \|X\| >= 70 log 2.
322	// 10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
323	// purposes. Therefore, go to Step 1 of setox.
324	// 10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
325	// ans := -1
326	// Restore user FPCR
327	// Return ans := ans + 2^(-126). Exit.
328	// Notes: 10.2 will always create an inexact and return -1 + tiny
329	// in the user rounding precision and mode.
330	//
331	//
332
333	// Copyright (C) Motorola, Inc. 1990
334	// All Rights Reserved
335	//
336	// THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
337	// The copyright notice above does not evidence any
338	// actual or intended publication of such source code.
339
340	//setox idnt 2,1 \| Motorola 040 Floating Point Software Package
341
342	\|section 8
343
344	#include "fpsp.defs"
345
346	L2: .long 0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
347
348	EXPA3: .long 0x3FA55555,0x55554431
349	EXPA2: .long 0x3FC55555,0x55554018
350
351	HUGE: .long 0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
352	TINY: .long 0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
353
354	EM1A4: .long 0x3F811111,0x11174385
355	EM1A3: .long 0x3FA55555,0x55554F5A
356
357	EM1A2: .long 0x3FC55555,0x55555555,0x00000000,0x00000000
358
359	EM1B8: .long 0x3EC71DE3,0xA5774682
360	EM1B7: .long 0x3EFA01A0,0x19D7CB68
361
362	EM1B6: .long 0x3F2A01A0,0x1A019DF3
363	EM1B5: .long 0x3F56C16C,0x16C170E2
364
365	EM1B4: .long 0x3F811111,0x11111111
366	EM1B3: .long 0x3FA55555,0x55555555
367
368	EM1B2: .long 0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
369	.long 0x00000000
370
371	TWO140: .long 0x48B00000,0x00000000
372	TWON140: .long 0x37300000,0x00000000
373
374	EXPTBL:
375	.long 0x3FFF0000,0x80000000,0x00000000,0x00000000
376	.long 0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
377	.long 0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
378	.long 0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
379	.long 0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
380	.long 0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
381	.long 0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
382	.long 0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
383	.long 0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
384	.long 0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
385	.long 0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
386	.long 0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
387	.long 0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
388	.long 0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
389	.long 0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
390	.long 0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
391	.long 0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
392	.long 0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
393	.long 0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
394	.long 0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
395	.long 0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
396	.long 0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
397	.long 0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
398	.long 0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
399	.long 0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
400	.long 0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
401	.long 0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
402	.long 0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
403	.long 0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
404	.long 0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
405	.long 0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
406	.long 0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
407	.long 0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
408	.long 0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
409	.long 0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
410	.long 0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
411	.long 0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
412	.long 0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
413	.long 0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
414	.long 0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
415	.long 0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
416	.long 0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
417	.long 0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
418	.long 0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
419	.long 0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
420	.long 0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
421	.long 0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
422	.long 0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
423	.long 0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
424	.long 0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
425	.long 0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
426	.long 0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
427	.long 0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
428	.long 0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
429	.long 0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
430	.long 0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
431	.long 0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
432	.long 0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
433	.long 0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
434	.long 0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
435	.long 0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
436	.long 0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
437	.long 0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
438	.long 0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
439
440	.set ADJFLAG,L_SCR2
441	.set SCALE,FP_SCR1
442	.set ADJSCALE,FP_SCR2
443	.set SC,FP_SCR3
444	.set ONEBYSC,FP_SCR4
445
446	\| xref t_frcinx
447	\|xref t_extdnrm
448	\|xref t_unfl
449	\|xref t_ovfl
450
451	.global setoxd
452	setoxd:
453	//--entry point for EXP(X), X is denormalized
454	movel (%a0),%d0
455	andil #0x80000000,%d0
456	oril #0x00800000,%d0 // ...sign(X)*2^(-126)
457	movel %d0,-(%sp)
458	fmoves #0x3F800000,%fp0
459	fmovel %d1,%fpcr
460	fadds (%sp)+,%fp0
461	bra t_frcinx
462
463	.global setox
464	setox:
465	//--entry point for EXP(X), here X is finite, non-zero, and not NaN's
466
467	//--Step 1.
468	movel (%a0),%d0 // ...load part of input X
469	andil #0x7FFF0000,%d0 // ...biased expo. of X
470	cmpil #0x3FBE0000,%d0 // ...2^(-65)
471	bges EXPC1 // ...normal case
472	bra EXPSM
473
474	EXPC1:
475	//--The case \|X\| >= 2^(-65)
476	movew 4(%a0),%d0 // ...expo. and partial sig. of \|X\|
477	cmpil #0x400CB167,%d0 // ...16380 log2 trunc. 16 bits
478	blts EXPMAIN // ...normal case
479	bra EXPBIG
480
481	EXPMAIN:
482	//--Step 2.
483	//--This is the normal branch: 2^(-65) <= \|X\| < 16380 log2.
484	fmovex (%a0),%fp0 // ...load input from (a0)
485
486	fmovex %fp0,%fp1
487	fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X
488	fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
489	movel #0,ADJFLAG(%a6)
490	fmovel %fp0,%d0 // ...N = int( X * 64/log2 )
491	lea EXPTBL,%a1
492	fmovel %d0,%fp0 // ...convert to floating-format
493
494	movel %d0,L_SCR1(%a6) // ...save N temporarily
495	andil #0x3F,%d0 // ...D0 is J = N mod 64
496	lsll #4,%d0
497	addal %d0,%a1 // ...address of 2^(J/64)
498	movel L_SCR1(%a6),%d0
499	asrl #6,%d0 // ...D0 is M
500	addiw #0x3FFF,%d0 // ...biased expo. of 2^(M)
501	movew L2,L_SCR1(%a6) // ...prefetch L2, no need in CB
502
503	EXPCONT1:
504	//--Step 3.
505	//--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
506	//--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
507	fmovex %fp0,%fp2
508	fmuls #0xBC317218,%fp0 // ...N * L1, L1 = lead(-log2/64)
509	fmulx L2,%fp2 // ...N * L2, L1+L2 = -log2/64
510	faddx %fp1,%fp0 // ...X + N*L1
511	faddx %fp2,%fp0 // ...fp0 is R, reduced arg.
512	// MOVE.W #$3FA5,EXPA3 ...load EXPA3 in cache
513
514	//--Step 4.
515	//--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
516	//-- R + RR(A1 + R(A2 + R(A3 + R(A4 + RA5))))
517	//--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
518	//--[R+RS(A2+SA4)] + [S(A1+S(A3+SA5))]
519
520	fmovex %fp0,%fp1
521	fmulx %fp1,%fp1 // ...fp1 IS S = R*R
522
523	fmoves #0x3AB60B70,%fp2 // ...fp2 IS A5
524	// MOVE.W #0,2(%a1) ...load 2^(J/64) in cache
525
526	fmulx %fp1,%fp2 // ...fp2 IS S*A5
527	fmovex %fp1,%fp3
528	fmuls #0x3C088895,%fp3 // ...fp3 IS S*A4
529
530	faddd EXPA3,%fp2 // ...fp2 IS A3+S*A5
531	faddd EXPA2,%fp3 // ...fp3 IS A2+S*A4
532
533	fmulx %fp1,%fp2 // ...fp2 IS S(A3+SA5)
534	movew %d0,SCALE(%a6) // ...SCALE is 2^(M) in extended
535	clrw SCALE+2(%a6)
536	movel #0x80000000,SCALE+4(%a6)
537	clrl SCALE+8(%a6)
538
539	fmulx %fp1,%fp3 // ...fp3 IS S(A2+SA4)
540
541	fadds #0x3F000000,%fp2 // ...fp2 IS A1+S(A3+SA5)
542	fmulx %fp0,%fp3 // ...fp3 IS RS(A2+S*A4)
543
544	fmulx %fp1,%fp2 // ...fp2 IS S(A1+S(A3+S*A5))
545	faddx %fp3,%fp0 // ...fp0 IS R+RS(A2+S*A4),
546	// ...fp3 released
547
548	fmovex (%a1)+,%fp1 // ...fp1 is lead. pt. of 2^(J/64)
549	faddx %fp2,%fp0 // ...fp0 is EXP(R) - 1
550	// ...fp2 released
551
552	//--Step 5
553	//--final reconstruction process
554	//--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
555
556	fmulx %fp1,%fp0 // ...2^(J/64)*(Exp(R)-1)
557	fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored
558	fadds (%a1),%fp0 // ...accurate 2^(J/64)
559
560	faddx %fp1,%fp0 // ...2^(J/64) + 2^(J/64)*...
561	movel ADJFLAG(%a6),%d0
562
563	//--Step 6
564	tstl %d0
565	beqs NORMAL
566	ADJUST:
567	fmulx ADJSCALE(%a6),%fp0
568	NORMAL:
569	fmovel %d1,%FPCR // ...restore user FPCR
570	fmulx SCALE(%a6),%fp0 // ...multiply 2^(M)
571	bra t_frcinx
572
573	EXPSM:
574	//--Step 7
575	fmovemx (%a0),%fp0-%fp0 // ...in case X is denormalized
576	fmovel %d1,%FPCR
577	fadds #0x3F800000,%fp0 // ...1+X in user mode
578	bra t_frcinx
579
580	EXPBIG:
581	//--Step 8
582	cmpil #0x400CB27C,%d0 // ...16480 log2
583	bgts EXP2BIG
584	//--Steps 8.2 -- 8.6
585	fmovex (%a0),%fp0 // ...load input from (a0)
586
587	fmovex %fp0,%fp1
588	fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X
589	fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
590	movel #1,ADJFLAG(%a6)
591	fmovel %fp0,%d0 // ...N = int( X * 64/log2 )
592	lea EXPTBL,%a1
593	fmovel %d0,%fp0 // ...convert to floating-format
594	movel %d0,L_SCR1(%a6) // ...save N temporarily
595	andil #0x3F,%d0 // ...D0 is J = N mod 64
596	lsll #4,%d0
597	addal %d0,%a1 // ...address of 2^(J/64)
598	movel L_SCR1(%a6),%d0
599	asrl #6,%d0 // ...D0 is K
600	movel %d0,L_SCR1(%a6) // ...save K temporarily
601	asrl #1,%d0 // ...D0 is M1
602	subl %d0,L_SCR1(%a6) // ...a1 is M
603	addiw #0x3FFF,%d0 // ...biased expo. of 2^(M1)
604	movew %d0,ADJSCALE(%a6) // ...ADJSCALE := 2^(M1)
605	clrw ADJSCALE+2(%a6)
606	movel #0x80000000,ADJSCALE+4(%a6)
607	clrl ADJSCALE+8(%a6)
608	movel L_SCR1(%a6),%d0 // ...D0 is M
609	addiw #0x3FFF,%d0 // ...biased expo. of 2^(M)
610	bra EXPCONT1 // ...go back to Step 3
611
612	EXP2BIG:
613	//--Step 9
614	fmovel %d1,%FPCR
615	movel (%a0),%d0
616	bclrb #sign_bit,(%a0) // ...setox always returns positive
617	cmpil #0,%d0
618	blt t_unfl
619	bra t_ovfl
620
621	.global setoxm1d
622	setoxm1d:
623	//--entry point for EXPM1(X), here X is denormalized
624	//--Step 0.
625	bra t_extdnrm
626
627
628	.global setoxm1
629	setoxm1:
630	//--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
631
632	//--Step 1.
633	//--Step 1.1
634	movel (%a0),%d0 // ...load part of input X
635	andil #0x7FFF0000,%d0 // ...biased expo. of X
636	cmpil #0x3FFD0000,%d0 // ...1/4
637	bges EM1CON1 // ...\|X\| >= 1/4
638	bra EM1SM
639
640	EM1CON1:
641	//--Step 1.3
642	//--The case \|X\| >= 1/4
643	movew 4(%a0),%d0 // ...expo. and partial sig. of \|X\|
644	cmpil #0x4004C215,%d0 // ...70log2 rounded up to 16 bits
645	bles EM1MAIN // ...1/4 <= \|X\| <= 70log2
646	bra EM1BIG
647
648	EM1MAIN:
649	//--Step 2.
650	//--This is the case: 1/4 <= \|X\| <= 70 log2.
651	fmovex (%a0),%fp0 // ...load input from (a0)
652
653	fmovex %fp0,%fp1
654	fmuls #0x42B8AA3B,%fp0 // ...64/log2 * X
655	fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
656	// MOVE.W #$3F81,EM1A4 ...prefetch in CB mode
657	fmovel %fp0,%d0 // ...N = int( X * 64/log2 )
658	lea EXPTBL,%a1
659	fmovel %d0,%fp0 // ...convert to floating-format
660
661	movel %d0,L_SCR1(%a6) // ...save N temporarily
662	andil #0x3F,%d0 // ...D0 is J = N mod 64
663	lsll #4,%d0
664	addal %d0,%a1 // ...address of 2^(J/64)
665	movel L_SCR1(%a6),%d0
666	asrl #6,%d0 // ...D0 is M
667	movel %d0,L_SCR1(%a6) // ...save a copy of M
668	// MOVE.W #$3FDC,L2 ...prefetch L2 in CB mode
669
670	//--Step 3.
671	//--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
672	//--a0 points to 2^(J/64), D0 and a1 both contain M
673	fmovex %fp0,%fp2
674	fmuls #0xBC317218,%fp0 // ...N * L1, L1 = lead(-log2/64)
675	fmulx L2,%fp2 // ...N * L2, L1+L2 = -log2/64
676	faddx %fp1,%fp0 // ...X + N*L1
677	faddx %fp2,%fp0 // ...fp0 is R, reduced arg.
678	// MOVE.W #$3FC5,EM1A2 ...load EM1A2 in cache
679	addiw #0x3FFF,%d0 // ...D0 is biased expo. of 2^M
680
681	//--Step 4.
682	//--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
683	//-- R + RR(A1 + R(A2 + R(A3 + R(A4 + R(A5 + R*A6)))))
684	//--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
685	//--[RS(A2+S(A4+SA6))] + [R+S(A1+S(A3+S*A5))]
686
687	fmovex %fp0,%fp1
688	fmulx %fp1,%fp1 // ...fp1 IS S = R*R
689
690	fmoves #0x3950097B,%fp2 // ...fp2 IS a6
691	// MOVE.W #0,2(%a1) ...load 2^(J/64) in cache
692
693	fmulx %fp1,%fp2 // ...fp2 IS S*A6
694	fmovex %fp1,%fp3
695	fmuls #0x3AB60B6A,%fp3 // ...fp3 IS S*A5
696
697	faddd EM1A4,%fp2 // ...fp2 IS A4+S*A6
698	faddd EM1A3,%fp3 // ...fp3 IS A3+S*A5
699	movew %d0,SC(%a6) // ...SC is 2^(M) in extended
700	clrw SC+2(%a6)
701	movel #0x80000000,SC+4(%a6)
702	clrl SC+8(%a6)
703
704	fmulx %fp1,%fp2 // ...fp2 IS S(A4+SA6)
705	movel L_SCR1(%a6),%d0 // ...D0 is M
706	negw %d0 // ...D0 is -M
707	fmulx %fp1,%fp3 // ...fp3 IS S(A3+SA5)
708	addiw #0x3FFF,%d0 // ...biased expo. of 2^(-M)
709	faddd EM1A2,%fp2 // ...fp2 IS A2+S(A4+SA6)
710	fadds #0x3F000000,%fp3 // ...fp3 IS A1+S(A3+SA5)
711
712	fmulx %fp1,%fp2 // ...fp2 IS S(A2+S(A4+S*A6))
713	oriw #0x8000,%d0 // ...signed/expo. of -2^(-M)
714	movew %d0,ONEBYSC(%a6) // ...OnebySc is -2^(-M)
715	clrw ONEBYSC+2(%a6)
716	movel #0x80000000,ONEBYSC+4(%a6)
717	clrl ONEBYSC+8(%a6)
718	fmulx %fp3,%fp1 // ...fp1 IS S(A1+S(A3+S*A5))
719	// ...fp3 released
720
721	fmulx %fp0,%fp2 // ...fp2 IS RS(A2+S(A4+SA6))
722	faddx %fp1,%fp0 // ...fp0 IS R+S(A1+S(A3+S*A5))
723	// ...fp1 released
724
725	faddx %fp2,%fp0 // ...fp0 IS EXP(R)-1
726	// ...fp2 released
727	fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored
728
729	//--Step 5
730	//--Compute 2^(J/64)*p
731
732	fmulx (%a1),%fp0 // ...2^(J/64)*(Exp(R)-1)
733
734	//--Step 6
735	//--Step 6.1
736	movel L_SCR1(%a6),%d0 // ...retrieve M
737	cmpil #63,%d0
738	bles MLE63
739	//--Step 6.2 M >= 64
740	fmoves 12(%a1),%fp1 // ...fp1 is t
741	faddx ONEBYSC(%a6),%fp1 // ...fp1 is t+OnebySc
742	faddx %fp1,%fp0 // ...p+(t+OnebySc), fp1 released
743	faddx (%a1),%fp0 // ...T+(p+(t+OnebySc))
744	bras EM1SCALE
745	MLE63:
746	//--Step 6.3 M <= 63
747	cmpil #-3,%d0
748	bges MGEN3
749	MLTN3:
750	//--Step 6.4 M <= -4
751	fadds 12(%a1),%fp0 // ...p+t
752	faddx (%a1),%fp0 // ...T+(p+t)
753	faddx ONEBYSC(%a6),%fp0 // ...OnebySc + (T+(p+t))
754	bras EM1SCALE
755	MGEN3:
756	//--Step 6.5 -3 <= M <= 63
757	fmovex (%a1)+,%fp1 // ...fp1 is T
758	fadds (%a1),%fp0 // ...fp0 is p+t
759	faddx ONEBYSC(%a6),%fp1 // ...fp1 is T+OnebySc
760	faddx %fp1,%fp0 // ...(T+OnebySc)+(p+t)
761
762	EM1SCALE:
763	//--Step 6.6
764	fmovel %d1,%FPCR
765	fmulx SC(%a6),%fp0
766
767	bra t_frcinx
768
769	EM1SM:
770	//--Step 7 \|X\| < 1/4.
771	cmpil #0x3FBE0000,%d0 // ...2^(-65)
772	bges EM1POLY
773
774	EM1TINY:
775	//--Step 8 \|X\| < 2^(-65)
776	cmpil #0x00330000,%d0 // ...2^(-16312)
777	blts EM12TINY
778	//--Step 8.2
779	movel #0x80010000,SC(%a6) // ...SC is -2^(-16382)
780	movel #0x80000000,SC+4(%a6)
781	clrl SC+8(%a6)
782	fmovex (%a0),%fp0
783	fmovel %d1,%FPCR
784	faddx SC(%a6),%fp0
785
786	bra t_frcinx
787
788	EM12TINY:
789	//--Step 8.3
790	fmovex (%a0),%fp0
791	fmuld TWO140,%fp0
792	movel #0x80010000,SC(%a6)
793	movel #0x80000000,SC+4(%a6)
794	clrl SC+8(%a6)
795	faddx SC(%a6),%fp0
796	fmovel %d1,%FPCR
797	fmuld TWON140,%fp0
798
799	bra t_frcinx
800
801	EM1POLY:
802	//--Step 9 exp(X)-1 by a simple polynomial
803	fmovex (%a0),%fp0 // ...fp0 is X
804	fmulx %fp0,%fp0 // ...fp0 is S := X*X
805	fmovemx %fp2-%fp2/%fp3,-(%a7) // ...save fp2
806	fmoves #0x2F30CAA8,%fp1 // ...fp1 is B12
807	fmulx %fp0,%fp1 // ...fp1 is S*B12
808	fmoves #0x310F8290,%fp2 // ...fp2 is B11
809	fadds #0x32D73220,%fp1 // ...fp1 is B10+S*B12
810
811	fmulx %fp0,%fp2 // ...fp2 is S*B11
812	fmulx %fp0,%fp1 // ...fp1 is S*(B10 + ...
813
814	fadds #0x3493F281,%fp2 // ...fp2 is B9+S*...
815	faddd EM1B8,%fp1 // ...fp1 is B8+S*...
816
817	fmulx %fp0,%fp2 // ...fp2 is S*(B9+...
818	fmulx %fp0,%fp1 // ...fp1 is S*(B8+...
819
820	faddd EM1B7,%fp2 // ...fp2 is B7+S*...
821	faddd EM1B6,%fp1 // ...fp1 is B6+S*...
822
823	fmulx %fp0,%fp2 // ...fp2 is S*(B7+...
824	fmulx %fp0,%fp1 // ...fp1 is S*(B6+...
825
826	faddd EM1B5,%fp2 // ...fp2 is B5+S*...
827	faddd EM1B4,%fp1 // ...fp1 is B4+S*...
828
829	fmulx %fp0,%fp2 // ...fp2 is S*(B5+...
830	fmulx %fp0,%fp1 // ...fp1 is S*(B4+...
831
832	faddd EM1B3,%fp2 // ...fp2 is B3+S*...
833	faddx EM1B2,%fp1 // ...fp1 is B2+S*...
834
835	fmulx %fp0,%fp2 // ...fp2 is S*(B3+...
836	fmulx %fp0,%fp1 // ...fp1 is S*(B2+...
837
838	fmulx %fp0,%fp2 // ...fp2 is SS(B3+...)
839	fmulx (%a0),%fp1 // ...fp1 is XS(B2...
840
841	fmuls #0x3F000000,%fp0 // ...fp0 is S*B1
842	faddx %fp2,%fp1 // ...fp1 is Q
843	// ...fp2 released
844
845	fmovemx (%a7)+,%fp2-%fp2/%fp3 // ...fp2 restored
846
847	faddx %fp1,%fp0 // ...fp0 is S*B1+Q
848	// ...fp1 released
849
850	fmovel %d1,%FPCR
851	faddx (%a0),%fp0
852
853	bra t_frcinx
854
855	EM1BIG:
856	//--Step 10 \|X\| > 70 log2
857	movel (%a0),%d0
858	cmpil #0,%d0
859	bgt EXPC1
860	//--Step 10.2
861	fmoves #0xBF800000,%fp0 // ...fp0 is -1
862	fmovel %d1,%FPCR
863	fadds #0x00800000,%fp0 // ...-1 + 2^(-126)
864
865	bra t_frcinx
866
867	\|end

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format