source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ 8f511f05

Last change on this file since 8f511f05 was 8f511f05, checked in by Till Straumann <strauman@…>, on 12/02/09 at 01:57:45
  • added SLAC copyright disclaimer.
  • Property mode set to 100644
File size: 24.8 KB
/* $Id$ */
#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management.  */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *         Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *         under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

        .set   v0,   0
        .set   v8,   8
        .set   v16, 16
        .set   v20, 20
        .set   v24, 24
        .set   v28, 28

        .set   r0,   0
        .set   r3,   3
        .set   r4,   4
        .set   r5,   5
        .set   r6,   6
        .set   r7,   7

        .set   r10, 10
        .set   r11, 11
        .set   r12, 12

        .set   cr5,  5

        .set   VECSIZE,    16

        .set   VRSAVE_INIT_VAL, 0
        .set   VSCR_INIT_VAL,   0

        .set   VRSAVE_OFF, 16
        .set   VSCR_OFF,   16+12

        .set   ds0,  0

        /* Block size for dst -- in units of 16-bytes */
        .set   BSIZE,   2       /* = 32 bytes */
        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
        .set   BSTRIDE, 32      /*      bytes */
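
        /*
         * Editor's note (illustrative, derived from the 'dstt' setup in
         * _CPU_Context_switch_altivec below): the three values above are
         * combined into the data-stream control word
         *
         *     (BSIZE << 24) | (BCNT << 16) | BSTRIDE
         *   = (2 << 24) | (7 << 16) | 32
         *
         * i.e., prefetch 7 blocks of 2 vectors (32 bytes) each, 32 bytes
         * apart: one 32-byte line for vrsave/vscr plus 12 * 16 bytes for
         * the non-volatile vector registers (224 bytes total).
         */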

        .data

        .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
        .long   0

        .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
        .long   0

        .text

        .extern _CPU_altivec_psim_cpu
        .extern _CPU_altivec_ctxt_off

        .macro  CMPOFF _B0
        lis     \_B0, _CPU_altivec_ctxt_off@ha
        lwz     \_B0, _CPU_altivec_ctxt_off@l(\_B0)
        .endm

        /* Conditionally load or store a vector _VR to
         * EA(_R1|0 + _R2).
         * If bit _VR (the bit corresponding to register _VR) is set in CRC
         * then the load/store is performed; otherwise it is skipped.
         * If compiled with IGNORE_VRSAVE defined then
         * the load/store is done unconditionally.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _R2    : 'offset' register for address computation.
         *
         * MODIFIES:      _VR on output if a load operation is performed.
         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
         *                defined).
         */
        .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
        bc       4, \_VR, 111f
#endif
        \_OPCODE \_VR, \_R1, \_R2
111:
        .endm
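
        /*
         * Illustrative expansion (editor's sketch, not assembled): with
         * IGNORE_VRSAVE undefined,
         *
         *     LDST _OPCODE=stvx _VR=v2 _R1=r3 _R2=r10
         *
         * expands to
         *
         *     bc       4, 2, 111f    # skip if CR bit 2 (vrsave bit of v2) is clear
         *     stvx     2, 3, 10      # store v2 to EA = (r3|0) + r10
         * 111:
         */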

        /*
         * Load or store four 'adjacent' vector registers.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _B0    : base register 0
         * _B1    : base register 1
         * _B2    : base register 2
         * _B3    : base register 3
         * _RO    : offset register
         *
         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
         *
         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
         *                operation is performed.
         * IMPLICIT USE:  see LDST
         */
        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
        .endm

        /*
         * Preload/zero two cache lines and save 4 vector registers
         * to memory.
         * Note that the cache operation targets memory *past* the
         * current storage area, which should hopefully hit when
         * this same code is executed on the next two cache lines...
         *
         * This code effectively does
         *   dcbz (_B0 + 64)
         *   dcbz (_B0 + 64 + 32)
         *   stvx _VR+0, (_B0+ 0)
         *   stvx _VR+1, (_B0+16)
         *   stvx _VR+2, (_B0+32)
         *   stvx _VR+3, (_B0+48)
         *
         * _LRU:  may be 'l' or empty. The former variant should be
         *        used when it is conceivable that the memory area is
         *        unlikely to be used in the near future thus making
         *        it a candidate for early eviction from the caches.
         *
         *        If it is likely that the memory area is reused soon
         *        (e.g., save/restore across ISR execution) then the
         *        'stvx' opcode (w/o 'l' suffix) should be used.
         *
         * _VR:   first of four target vector registers; _VR+0,
         *        _VR+1, _VR+2, _VR+3 are saved.
         *
         * _B0:   base address of memory area.
         * _B1:   should contain _B0+16 on entry
         * _B2:   should contain _B0+32 on entry
         * _B3:   should contain _B0+48 on entry
         *
         * _O1:   contains the offset where the four vectors are
         *        stored.
         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
         *        used to address the two cache-lines past the
         *        current memory area.
         *
         * MODIFIES: _O2; contains _O1 + 64 after execution of this
         *        code.
         *
         * NOTES: a different set of four vectors can be addressed
         *        simply by changing the one offset register _O1.
         *
         *        Saving more than 4 registers can simply be
         *        achieved by expanding this macro multiple
         *        times with _O1 and _O2 swapped (new _O1
         *        becomes _O2 = old _O1 + 64) thus stepping
         *        through the memory area.
         *
         */
        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        dcbz  \_B0, \_O2
        dcbz  \_B2, \_O2
        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save eight vector registers by expanding S4VEC_P twice.
         * See notes for S4VEC_P above.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         *
         * MODIFIES: After execution,
         *           _O2 contains original _O1 +  64,
         *           _O1 contains original _O1 + 128
         *
         * NOTES:    Expanding this macro multiple times lets you save
         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
         */
        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        /* Note that the roles of _O1 and _O2 are swapped here */
        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
        .endm
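
        /*
         * Editor's sketch of the net effect (illustrative, not assembled):
         * after a CMP_BASES/PREP_FOR_SAVE style setup with _O1 = 0 (r10 = 0,
         * r4 = r3+16, r5 = r3+32, r6 = r3+48),
         *
         *     S8VEC_P   l  v0  r3 r4 r5 r6 r10 r11
         *
         * stores v0..v3 (stvxl) at r3+0,16,32,48 and v4..v7 at
         * r3+64,80,96,112, zeroing the cache lines covering r3+64..r3+191
         * along the way; on exit r11 (_O2) = 64 and r10 (_O1) = 128, so a
         * following expansion continues seamlessly at r3+128.
         */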

        /*
         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 256
         *           _O2 contains original _O1 + 256 - 64
         */
        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P, S_V0TOV19).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 128
         *           _O2 contains original _O1 + 128 - 64
         */
        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
        .endm

        /*
         * Save all registers to memory area
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 384
         *           _O2 contains original _O1 + 448 (= + 512 - 64)
         */
        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
        .endm


        /*
         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
         * We can pass either of them as arguments to another macro which
         * allows us to decide if the main macro uses dcbt or not when
         * we expand it...
         */
        .macro DO_DCBT _RA, _RB
        dcbt \_RA, \_RB
        .endm

        .macro NO_DCBT _RA, _RB
        .endm

        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on a mpc7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) obviously makes no sense because after
         * 1) the register area is most likely in the cache already.
         *
         * Starting preload after 2) doesn't make much sense either.
         * If the ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch we use DST (and no dcbt).
         */
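
        /*
         * Editor's sketch (illustrative) contrasting the two preload
         * flavours discussed above:
         *
         *     dcbt  rA, rB        # touch the single cache line at EA = (rA|0) + rB
         *     dstt  rA, rCTL, 0   # start transient prefetch stream 0 at (rA);
         *                         # block size/count/stride taken from rCTL
         */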

        /*
         * Symmetric to S4VEC_P above but addresses loading four
         * vector registers from memory.
         *
         * Touches two cache lines past the current memory area
         * and loads four vectors from the current area.
         *
         * Optionally, the DCBT operation may be omitted
         * (when expanding with _DCBT=NO_DCBT).
         * This is useful if the cache was already preloaded
         * by another means (dst instruction).
         *
         * NOTE: We always use the 'LRU' form of lvx: lvxl,
         *       because we deem it unlikely that the context
         *       that was just loaded has to be saved again
         *       to memory in the immediate future.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O2 contains original _O1 + 64.
         *           _VR.._VR+3 loaded from memory.
         */
        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        /* preload/touch 2 lines at offset 64 from _B0 */
        \_DCBT   \_B0, \_O2
        \_DCBT   \_B2, \_O2
        /* load four vectors at offset 0 from _B0      */
        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Symmetric to S8VEC_P; loads 8 vector registers
         * from memory -- see comments above...
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 +  64.
         *           _VR.._VR+7 loaded from memory.
         */
        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
        .endm

        /*
         * Load volatile vector registers v0..v19 employing
         * the DCBT to preload the cache. The rationale for
         * using DCBT here but not when restoring non-volatile
         * registers is explained above, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 256.
         *           _O2 contains original _O1 + 256 - 64.
         *           VR0..VR19 loaded from memory.
         */
        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load non-volatile vector registers v20..v31.
         * Note that no DCBT is performed since we use
         * DST for preloading the cache during a context
         * switch, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 + 128 - 64.
         *           VR20..VR31 loaded from memory.
         */
        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load all registers from memory area.
         */
        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
        .endm

        /*
         * Compute
         *     _B1 = _B0 + 16
         *     _B2 = _B0 + 32
         *     _B3 = _B0 + 48
         * and load
         *     _RO = 0
         *
         * Convenience macro to be expanded before
         * any of the load/store macros that use
         * four base addresses etc.
         *
         * INPUT: _B0 = cache-aligned start of memory area
         *
         * MODIFIES: _B1, _B2, _B3, _RO as described above.
         */
        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
        addi       \_B1, \_B0, 1*VECSIZE
        addi       \_B2, \_B0, 2*VECSIZE
        addi       \_B3, \_B0, 3*VECSIZE
        li         \_RO, 0
        .endm

        /*
         * Prepare for saving general vector registers.
         *
         * If not built with #define IGNORE_VRSAVE then
         *
         *  1) copy vrsave to CRC
         *
         * endif
         *
         *  2) copy vrsave to _VRSAVE_REG
         *  3) preload/zero cache line where vrsave and vscr are stored.
         *  4) compute base addresses from _B0
         *  5) preload/zero first two cache lines (remember that the
         *     first S8VEC_P starts preloading/zeroing at offset 64).
         *
         * INPUT:    'vrsave' register, _B0 (base address of memory area)
         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
         *           _B0 = original _B0 + 32
         *           _B1 = original _B0 + 32 + 16,
         *           _B2 = original _B0 + 32 + 32,
         *           _B3 = original _B0 + 32 + 48,
         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
         */
        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
        mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
        mtcr       \_VRSAVE_REG
#endif
        dcbz       0, \_B0
        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
        dcbz       0, \_B0
        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
        dcbz       0, \_B2
        .endm
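
        /*
         * Editor's note (layout implied by the offsets above; illustrative):
         * relative to the cache-aligned start of the save area, PREP_FOR_SAVE
         * and S_VSCR_VRSAVE (below) produce
         *
         *     +  0 .. + 15   unused (zeroed)
         *     + 16           VRSAVE_OFF: 'vrsave' (word)
         *     + 28           VSCR_OFF:   'vscr'   (word)
         *     + 32 ...       vector registers, 16 bytes each, at _O1 = 0, 16, ...
         */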

        /*
         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
         * must have been loaded from 'vrsave' and 'vscr', respectively,
         * prior to expanding this macro.
         *
         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
         *           _VSCR_VREG  VR  holding 'vscr'   contents
         *           _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG
         */
        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
        .endm

        /*
         * Load 'vrsave' and 'vscr' from memory.
         *
         * INPUTS:   _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
         *           'vscr', 'vrsave'.
         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
         *           with IGNORE_VRSAVE undefined).
         */
        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
        mtcr      \_SCRATCH_REG
#endif
        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
        mtvscr    \_SCRATCH_VREG
        .endm

        /*
         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
         *
         * INPUT:    _B0
         * MODIFIES: _B0 (as stated above)
         */
        .macro CACHE_DOWNALGN _B0
        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
        .endm
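
        /*
         * Editor's example (illustrative): with PPC_CACHE_ALIGNMENT = 32,
         * CACHE_DOWNALGN clears the low 5 address bits, e.g.
         *
         *     0x0001234f  ->  0x00012340
         */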

        .text

        .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 (still cache-aligned) points past the
         * vrsave/vscr line to the register area, and r4, r5, r6 are offset
         * by 16, 32, and 48 bytes from r3, respectively. r10 holds zero.
         */
        S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr

        .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_restore_altivec
_CPU_Context_restore_altivec:
        /* Restore is like 'switch' but we don't have
         * to save an old context.
         * Move the argument to the second argument register, load a NULL
         * pointer into the first one, then jump to the 'switch' routine.
         */
        mr        r4, r3
        li        r3, 0
        b _CPU_Context_switch_altivec

        .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

        /* fetch offset of altivec area in context                   */
        CMPOFF    r5
        /* down-align 'to' area to cache-line boundary               */
        add       r4, r4, r5
        CACHE_DOWNALGN r4

        /* Check for PSIM                                            */
        lis       r6, _CPU_altivec_psim_cpu@ha
        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
        cmpli     0, r6, 0
        /* Skip the data-stream instructions on PSIM (not implemented there) */
        bne       1f
        dssall
        /* Pre-load new context into cache                           */
        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
        ori       r6, r6, BSTRIDE
        dstt      r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Is 'from' context == NULL ? (then we just do a 'restore') */
        cmpli     0, r3, 0
        beq       1f           /* yes: skip saving 'from' context    */

        /* SAVE NON-VOLATILE REGISTERS                               */

        /* Compute aligned destination pointer (r5 still holds offset
         * to 'altivec' area in context)
         */
        add       r3, r3, r5
        CACHE_DOWNALGN r3

        PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
        /* The manual says reading vscr can take some time, so
         * read it here (into a volatile vector register) while
         * we wait for cache blocks to be allocated.
         */
        mfvscr    v0
        S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
        S_VSCR_VRSAVE r0, v0, r3, r5

1:

        /* LOAD NON-VOLATILE REGISTERS                               */

        /* Advance past vrsave/vscr area                             */
        addi      r4, r4, PPC_CACHE_ALIGNMENT
        L_VSCR_VRSAVE r4, r0, v0
        CMP_BASES r4, r5, r6, r7, r10
        L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
        CMPOFF    r5
        add       r3, r3, r5
        CACHE_DOWNALGN r3
        lis       r5, _CPU_altivec_vrsave_initval@ha
        lwz       r5, _CPU_altivec_vrsave_initval@l(r5)
        stw       r5, VRSAVE_OFF(r3)
        lis       r6, _CPU_altivec_vscr_initval@ha
        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
        stw       r6, VSCR_OFF(r3)
        blr
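
        /*
         * Editor's sketch of an assumed C-level view of the entry points in
         * this file (calling convention only; r3/r4 carry the arguments).
         * The pointer types are illustrative, not taken from a header:
         *
         *   void _CPU_save_altivec_volatile(void *buf);
         *   void _CPU_load_altivec_volatile(void *buf);
         *   void _CPU_Context_switch_altivec(void *from_ctxt, void *to_ctxt);
         *   void _CPU_Context_restore_altivec(void *to_ctxt);
         *   void _CPU_Context_initialize_altivec(void *ctxt);
         *   void _CPU_altivec_set_vrsave_initval(unsigned long val);
         *
         * The three context routines expect a task context pointer; the
         * altivec area is located _CPU_altivec_ctxt_off bytes into it.
         */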

        /*
         * Change the initial value of VRSAVE.
         * Can be used by initialization code if
         * it is determined that code was compiled
         * with -mvrsave=no. In this case, VRSAVE
         * must be set to all-ones which causes this
         * support code to save/restore *all* registers
         * (only has an effect if IGNORE_VRSAVE is
         * not defined -- otherwise all registers are
         * saved/restored anyway).
         */
        .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
        lis       r5, _CPU_altivec_vrsave_initval@ha
        stw       r3, _CPU_altivec_vrsave_initval@l(r5)
        mtvrsave  r3
        blr

#ifdef ALTIVEC_TESTING
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr
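
        /*
         * Editor's note (illustrative): 1<<(31-6-16) = 0x0200, so the 'oris'
         * and 'andc' above set/clear MSR bit 6 (mask 0x02000000), i.e. the
         * AltiVec 'vector unit available' (VE) bit toggled by these helpers.
         */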


        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr

        /* Load all vector registers from memory area.
         * NOTE: This routine is not strictly ABI compliant --
         *       it guarantees that volatile vector registers
         *       have certain values on exit!
         */
        .global _CPU_altivec_load_all
_CPU_altivec_load_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_altivec_save_all
_CPU_altivec_save_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 (still cache-aligned) points past the
         * vrsave/vscr line to the register area, and r4, r5, r6 are offset
         * by 16, 32, and 48 bytes from r3, respectively. r10 holds zero.
         */
        S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr


#if 0
        .gnu_attribute 4,1
        .gnu_attribute 8,1
#endif

#endif
#endif