source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ fbee4ff

Last change on this file since fbee4ff was fbee4ff, checked in by Till Straumann <strauman@…>, on 12/02/09 at 01:33:51

2009-12-01 Till Straumann <strauman@…>

  • Makefile.am, mpc6xx/altivec: new directory implementing support for AltiVec context saving/restoring.
#ifdef __ALTIVEC__

#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

        .set   v0,   0
        .set   v8,   8
        .set   v16, 16
        .set   v20, 20
        .set   v24, 24
        .set   v28, 28

        .set   r0,   0
        .set   r3,   3
        .set   r4,   4
        .set   r5,   5
        .set   r6,   6
        .set   r7,   7

        .set   r10, 10
        .set   r11, 11
        .set   r12, 12

        .set   cr5,  5

        .set   VECSIZE,    16

        .set   VRSAVE_INIT_VAL, 0
        .set   VSCR_INIT_VAL,   0

        .set   VRSAVE_OFF, 16
        .set   VSCR_OFF,   16+12

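
        /*
         * For illustration, the first cache line of a save area is
         * laid out as follows (see S_VSCR_VRSAVE / L_VSCR_VRSAVE below):
         *   bytes  0..15 : unused
         *   bytes 16..19 : 'vrsave'           (VRSAVE_OFF)
         *   bytes 28..31 : 'vscr' (low word)  (VSCR_OFF = 16 + 12)
         * The vector registers themselves start on the next cache line.
         */
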
        .set   ds0,  0

        /* Block size for dst -- in units of 16 bytes */
        .set   BSIZE,   2       /* = 32 bytes */
        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
        .set   BSTRIDE, 32      /*      bytes */
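
        /*
         * For reference: the three values above are combined into the
         * control word of the 'dstt' instruction issued in
         * _CPU_Context_switch_altivec below. Per the AltiVec
         * architecture that control operand encodes
         *   bits  3.. 7 : block size (in units of 16-byte vectors)
         *   bits  8..15 : block count
         *   bits 16..31 : signed block stride (in bytes)
         * which is why the code builds (BSIZE<<24) | (BCNT<<16) | BSTRIDE
         * with an lis/ori pair.
         */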

        .data

        .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
        .long   0

        .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
        .long   0

        .text

        .extern _CPU_altivec_psim_cpu
        .extern _CPU_altivec_ctxt_off

        .macro  CMPOFF _B0
        lis     \_B0, _CPU_altivec_ctxt_off@ha
        lwz     \_B0, _CPU_altivec_ctxt_off@l(\_B0)
        .endm

        /* Conditionally load or store a vector _VR to
         * EA(_R1|0 + _R2).
         * If the CRC bit corresponding to _VR is set
         * then the load/store is performed; otherwise
         * it is skipped.
         * If compiled with IGNORE_VRSAVE defined then
         * the load/store is done unconditionally.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _R2    : 'offset' register for address computation.
         *
         * MODIFIES:      _VR on output if a load operation is performed.
         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
         *                defined).
         */
        .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
        bc       4, \_VR, 111f
#endif
        \_OPCODE \_VR, \_R1, \_R2
111:
        .endm
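
        /*
         * For illustration, 'LDST _OPCODE=stvxl _VR=v0 _R1=r3 _R2=r10'
         * expands (without IGNORE_VRSAVE) to roughly
         *
         *      bc    4, 0, 111f   # skip if CR bit 0 (vrsave bit of v0) is clear
         *      stvxl 0, 3, 10
         * 111:
         *
         * i.e. only vectors whose vrsave bit is set are actually transferred.
         */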

        /*
         * Load or store four 'adjacent' vector registers.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : first target vector register
         * _B0    : base register 0 (NOTE: passing r0 as a base
         *          register uses an implicit ZERO constant, not
         *          the contents of r0, for address computation)
         * _B1    : base register 1
         * _B2    : base register 2
         * _B3    : base register 3
         * _RO    : offset register
         *
         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
         *
         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
         *                operation is performed.
         * IMPLICIT USE:  see LDST
         */
        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
        .endm

        /*
         * Preload/zero two cache lines and save 4 vector registers
         * to memory.
         * Note that the cache operation targets memory *past* the
         * current storage area, which should hopefully hit when
         * this same code is executed on the next two cache lines...
         *
         * This code effectively does
         *   dcbz (_B0 + 64)
         *   dcbz (_B0 + 64 + 32)
         *   stvx _VR+0, (_B0+ 0)
         *   stvx _VR+1, (_B0+16)
         *   stvx _VR+2, (_B0+32)
         *   stvx _VR+3, (_B0+48)
         *
         * _LRU:  may be 'l' or empty. The former variant should be
         *        used when it is conceivable that the memory area is
         *        unlikely to be used in the near future thus making
         *        it a candidate for early eviction from the caches.
         *
         *        If it is likely that the memory area is reused soon
         *        (e.g., save/restore across ISR execution) then the
         *        'stvx' opcode (w/o 'l' suffix) should be used.
         *
         * _VR:   first of four target vector registers; _VR+0,
         *        _VR+1, _VR+2, _VR+3 are saved.
         *
         * _B0:   base address of memory area.
         * _B1:   should contain _B0+16 on entry
         * _B2:   should contain _B0+32 on entry
         * _B3:   should contain _B0+48 on entry
         *
         * _O1:   contains the offset where the four vectors are
         *        stored.
         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
         *        used to address the two cache lines past the
         *        current memory area.
         *
         * MODIFIES: _O2; contains _O1 + 64 after execution of this
         *        code.
         *
         * NOTES: a different set of four vectors can be addressed
         *        simply by changing the one offset register _O1.
         *
         *        Saving more than 4 registers can simply be
         *        achieved by expanding this macro multiple
         *        times with _O1 and _O2 swapped (new _O1
         *        becomes _O2 = old _O1 + 64) thus stepping
         *        through the memory area.
         *
         */
        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        dcbz  \_B0, \_O2
        dcbz  \_B2, \_O2
        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save eight vector registers by expanding S4VEC_P twice.
         * See notes for S4VEC_P above.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         *
         * MODIFIES: After execution,
         *           _O2 contains original _O1 +  64,
         *           _O1 contains original _O1 + 128
         *
         * NOTES:    Expanding this macro multiple times lets you save
         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
         */
        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        /* Note that the roles of _O1 and _O2 are swapped here */
        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
        .endm

        /*
         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 256
         *           _O2 contains original _O1 + 256 - 64
         */
        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P, S_V0TOV19).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 128
         *           _O2 contains original _O1 + 128 - 64
         */
        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
        .endm

        /*
         * Save all vector registers to memory area
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 512 - 128
         *           _O2 contains original _O1 + 512 - 64
         */
        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
        .endm
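
        /*
         * For illustration, the save area built by the macros above
         * (as used by _CPU_altivec_save_all / _CPU_altivec_load_all)
         * looks like this; offsets are relative to the cache-aligned
         * start of the area:
         *
         *   0x000          vrsave/vscr cache line (see VRSAVE_OFF / VSCR_OFF)
         *   0x020 + n*16   v<n>, n = 0..31  (v0 at 0x020, v31 at 0x210)
         *
         * The per-task context area handled by _CPU_Context_switch_altivec
         * has the same header line but is followed directly by v20..v31,
         * i.e. it spans the 7 cache lines described by the
         * BSIZE/BCNT/BSTRIDE stream parameters above.
         */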


        /*
         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
         * We can pass either of them as arguments to another macro which
         * allows us to decide if the main macro uses dcbt or not when
         * we expand it...
         */
        .macro DO_DCBT _RA, _RB
        dcbt \_RA, \_RB
        .endm

        .macro NO_DCBT _RA, _RB
        .endm

        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on an MPC7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) obviously makes no sense because after
         * 1) the register area is most likely in the cache already.
         *
         * Starting the preload after 2) doesn't make much sense either.
         * If the ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch we use DST (and no dcbt).
         */

        /*
         * Symmetric to S4VEC_P above but addresses loading four
         * vector registers from memory.
         *
         * Touches two cache lines past the current memory area
         * and loads four vectors from the current area.
         *
         * Optionally, the DCBT operation may be omitted
         * (when expanding with _DCBT=NO_DCBT).
         * This is useful if the cache was already preloaded
         * by another means (dst instruction).
         *
         * NOTE: We always use the 'LRU' form of lvx: lvxl,
         *       because we deem it unlikely that the context
         *       that was just loaded has to be saved again
         *       to memory in the immediate future.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O2 contains original _O1 + 64.
         *           _VR.._VR+3 loaded from memory.
         */
        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        /* preload/touch 2 lines at offset 64 from _B0 */
        \_DCBT   \_B0, \_O2
        \_DCBT   \_B2, \_O2
        /* load four vectors at offset 0 from _B0      */
        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Symmetric to S8VEC_P; loads 8 vector registers
         * from memory -- see comments above...
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 +  64.
         *           _VR.._VR+7 loaded from memory.
         */
        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
        .endm

        /*
         * Load volatile vector registers v0..v19 employing
         * the DCBT to preload the cache. The rationale for
         * using DCBT here but not when restoring non-volatile
         * registers is explained above, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 256.
         *           _O2 contains original _O1 + 256 - 64.
         *           VR0..VR19 loaded from memory.
         */
        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load non-volatile vector registers v20..v31.
         * Note that no DCBT is performed since we use
         * DST for preloading the cache during a context
         * switch, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 + 128 - 64.
         *           VR20..VR31 loaded from memory.
         */
        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load all registers from memory area.
         */
        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
        .endm

        /*
         * Compute
         *     _B1 = _B0 + 16
         *     _B2 = _B0 + 32
         *     _B3 = _B0 + 48
         * and load
         *     _RO = 0
         *
         * convenience macro to be expanded before
         * any of the load/store macros that use
         * four base addresses etc.
         *
         * INPUT: _B0 = cache-aligned start of memory area
         *
         * MODIFIES: _B1, _B2, _B3, _RO as described above.
         */
        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
        addi       \_B1, \_B0, 1*VECSIZE
        addi       \_B2, \_B0, 2*VECSIZE
        addi       \_B3, \_B0, 3*VECSIZE
        li         \_RO, 0
        .endm
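
        /* For example, 'CMP_BASES r3, r4, r5, r6, r10' leaves
         * r4 = r3+16, r5 = r3+32, r6 = r3+48 and r10 = 0, which is
         * how the entry points below set up their base registers.
         */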

        /*
         * Prepare for saving general vector registers.
         *
         * If not built with #define IGNORE_VRSAVE then
         *
         *  1) copy vrsave to CRC
         *
         * endif
         *
         *  2) copy vrsave to _VRSAVE_REG
         *  3) preload/zero cache line where vrsave and vscr are stored.
         *  4) compute base addresses from _B0
         *  5) preload/zero first two cache lines (remember that the
         *     first S8VEC_P starts preloading/zeroing at offset 64).
         *
         * INPUT:    'vrsave' register, _B0 (base address of memory area)
         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
         *           _B0 = original _B0 + 32
         *           _B1 = original _B0 + 32 + 16,
         *           _B2 = original _B0 + 32 + 32,
         *           _B3 = original _B0 + 32 + 48,
         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
         */
        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
        mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
        mtcr       \_VRSAVE_REG
#endif
        dcbz       0, \_B0
        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
        dcbz       0, \_B0
        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
        dcbz       0, \_B2
        .endm
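
        /* For illustration: with _B0 initially pointing at the
         * cache-aligned start of the area, the three dcbz above zero
         * the lines at area offsets 0 (vrsave/vscr), 32 and 64 (the
         * first two vector lines); each subsequent S4VEC_P expansion
         * then stores to two cache lines and zeroes the two lines
         * immediately following them.
         */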

        /*
         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
         * must have been loaded from 'vrsave' and 'vscr', respectively,
         * prior to expanding this macro.
         *
         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
         *           _VSCR_VREG  VR  holding 'vscr'   contents
         *           _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG
         */
        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
        .endm
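
        /* Worked example: when expanded with _B0 already advanced past
         * the vrsave/vscr line (as PREP_FOR_SAVE leaves it), the stw
         * above lands at header offset VRSAVE_OFF (16) and the stvewx
         * at header offset VSCR_OFF (28). stvewx stores the word
         * element of _VSCR_VREG selected by the effective address --
         * element 3 here, which is where mfvscr deposits VSCR.
         */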

        /*
         * Load 'vrsave' and 'vscr' from memory.
         *
         * INPUTS:   _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
         *           'vscr', 'vrsave'.
         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
         *           with IGNORE_VRSAVE undefined).
         */
        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
        mtcr      \_SCRATCH_REG
#endif
        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
        mtvscr    \_SCRATCH_VREG
        .endm

        /*
         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
         *
         * INPUT:    _B0
         * MODIFIES: _B0 (as stated above)
         */
        .macro CACHE_DOWNALGN _B0
        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
        .endm
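
        /* For example, with LD_PPC_CACHE_ALIGNMENT = 5 this expands to
         *     rlwinm _B0, _B0, 0, 0, 26
         * which keeps bits 0..26 (big-endian bit numbering), clearing
         * the five least-significant bits, i.e. _B0 &= ~31.
         */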

        .text

        .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 has been advanced past the
         * vrsave/vscr cache line to the start of the vector save area,
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero.
         */
        S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr

        .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_restore_altivec
_CPU_Context_restore_altivec:
        /* Restore is like 'switch' but we don't have
         * to save an old context.
         * Move the argument to the second arg, load a NULL pointer
         * into the first one, then jump to the 'switch' routine.
         */
        mr        r4, r3
        li        r3, 0
        b _CPU_Context_switch_altivec

        .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

        /* fetch offset of altivec area in context                   */
        CMPOFF    r5
        /* down-align 'to' area to cache-line boundary               */
        add       r4, r4, r5
        CACHE_DOWNALGN r4

        /* Check for PSIM: the data-stream instructions below are
         * not implemented there, so skip them on PSIM.
         */
        lis       r6, _CPU_altivec_psim_cpu@ha
        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
        cmpli     0, r6, 0
        bne       1f
        dssall
        /* Pre-load new context into cache                           */
        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
        ori       r6, r6, BSTRIDE
        dstt      r4, r6, ds0
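        /* The stream programmed above (BSIZE = 2 vectors, BCNT = 7
         * blocks, BSTRIDE = 32 bytes) touches 7 consecutive cache
         * lines starting at r4: the vrsave/vscr line plus the 12
         * non-volatile vectors v20..v31 that are loaded below.
         */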
1:

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Is the 'from' context NULL? (then we just do a 'restore')  */
        cmpli     0, r3, 0
        beq       1f           /* yes: skip saving 'from' context    */

        /* SAVE NON-VOLATILE REGISTERS                               */

        /* Compute aligned destination pointer (r5 still holds offset
         * to 'altivec' area in context)
         */
        add       r3, r3, r5
        CACHE_DOWNALGN r3

        PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
        /* The manual says reading vscr can take some time -- do
         * read it here (into a volatile vector register) while
         * we wait for cache blocks to be allocated.
         */
        mfvscr    v0
        S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
        S_VSCR_VRSAVE r0, v0, r3, r5

1:

        /* LOAD NON-VOLATILE REGISTERS                               */

        /* Advance past the vrsave/vscr area                         */
        addi      r4, r4, PPC_CACHE_ALIGNMENT
        L_VSCR_VRSAVE r4, r0, v0
        CMP_BASES r4, r5, r6, r7, r10
        L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
        CMPOFF    r5
        add       r3, r3, r5
        CACHE_DOWNALGN r3
        lis       r5, _CPU_altivec_vrsave_initval@ha
        lwz       r5, _CPU_altivec_vrsave_initval@l(r5)
        stw       r5, VRSAVE_OFF(r3)
        lis       r6, _CPU_altivec_vscr_initval@ha
        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
        stw       r6, VSCR_OFF(r3)
        blr

        /*
         * Change the initial value of VRSAVE.
         * Can be used by initialization code if
         * it is determined that code was compiled
         * with -mvrsave=no. In this case, VRSAVE
         * must be set to all-ones, which causes this
         * support code to save/restore *all* registers
         * (this only has an effect if IGNORE_VRSAVE is
         * not defined -- otherwise all registers are
         * saved/restored anyway).
         */
        .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
        lis       r5, _CPU_altivec_vrsave_initval@ha
        stw       r3, _CPU_altivec_vrsave_initval@l(r5)
        mtvrsave  r3
        blr

#ifdef ALTIVEC_TESTING
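        /* The msr_VE_on/msr_VE_off helpers below toggle MSR[VE], the
         * AltiVec-enable bit (mask 0x02000000, i.e. MSR bit 6 in
         * big-endian bit numbering). Since lis/oris operate on the
         * upper halfword, the mask is written as 1<<(31-6-16).
         */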
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr


        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr

        /* Load all vector registers from memory area.
         * NOTE: This routine is not strictly ABI compliant --
         *       it guarantees that volatile vector registers
         *       have certain values on exit!
         */
        .global _CPU_altivec_load_all
_CPU_altivec_load_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_altivec_save_all
_CPU_altivec_save_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 has been advanced past the
         * vrsave/vscr cache line to the start of the vector save area,
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero.
         */
        S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr


#if 0
        .gnu_attribute 4,1
        .gnu_attribute 8,1
#endif

#endif
#endif