source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ 5826a1b

Last change on this file since 5826a1b was 1869bb7, checked in by Sebastian Huber <sebastian.huber@…>, on 05/18/12 at 13:47:23

powerpc: Simplify context switch

PowerPC cores with the SPE (Signal Processing Extension) have 64-bit
general-purpose registers. The SPE context switch code has been merged
with the standard context switch code. The context switch may use cache
operations to increase performance. The context is ensured to be
32-byte aligned (PPC_DEFAULT_CACHE_LINE_SIZE). This
increases the overall memory size of the context area in the thread
control block slightly. The general-purpose registers GPR2 and GPR13
are no longer part of the context. The BSP must initialize these
registers during startup (usually initialized by the eabi() function).

The new BSP option BSP_USE_DATA_CACHE_BLOCK_TOUCH can be used to enable
the dcbt instruction in the context switch.

The new BSP option BSP_USE_SYNC_IN_CONTEXT_SWITCH can be used to enable
sync and isync instructions in the context switch. This should not be
necessary in most cases.

  • Property mode set to 100644
File size: 24.5 KB
#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management.  */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *         Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *         under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

        .set   v0,   0
        .set   v8,   8
        .set   v16, 16
        .set   v20, 20
        .set   v24, 24
        .set   v28, 28

        .set   r0,   0
        .set   r3,   3
        .set   r4,   4
        .set   r5,   5
        .set   r6,   6
        .set   r7,   7

        .set   r10, 10
        .set   r11, 11
        .set   r12, 12

        .set   cr5,  5

        .set   VECSIZE,    16

        .set   VRSAVE_INIT_VAL, 0
        .set   VSCR_INIT_VAL,   0

        .set   VRSAVE_OFF, 16
        .set   VSCR_OFF,   16+12

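        /*
         * Resulting layout of the cache-aligned save area used below:
         * 'vrsave' is stored at offset VRSAVE_OFF (16) and the VSCR word
         * at offset VSCR_OFF (28) within the first 32-byte cache line;
         * the vector registers themselves follow from offset 32 onwards,
         * 16 bytes (VECSIZE) each.
         */
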
        .set   ds0,  0

        /* Block size for dst -- in units of 16-bytes */
        .set   BSIZE,   2       /* = 32 bytes */
        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
        .set   BSTRIDE, 32      /*      bytes */

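        /*
         * These three constants are combined into the control word for
         * the 'dstt' instruction in _CPU_Context_switch_altivec below,
         * i.e., (BSIZE<<24) | (BCNT<<16) | BSTRIDE: a stream of 7 blocks
         * of 32 bytes with a 32-byte stride, covering the vrsave/vscr
         * line plus the 12 non-volatile vector registers (32 + 12*16
         * = 224 bytes).
         */
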
        .data

        .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
        .long   0

        .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
        .long   0

        .text

        .extern _CPU_altivec_psim_cpu
        .extern _CPU_altivec_ctxt_off

        .macro  CMPOFF _B0
        lis     \_B0, _CPU_altivec_ctxt_off@ha
        lwz     \_B0, _CPU_altivec_ctxt_off@l(\_B0)
        .endm

        /* Conditionally load or store a vector _VR to
         *  EA(_R1|0 + _R2)
         * If the bit corresponding to _VR is set in CRC
         * then the load/store is performed, otherwise
         * it is skipped.
         * If compiled with IGNORE_VRSAVE defined then
         * the load/store is done unconditionally.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _R2    : 'offset' register for address computation.
         *
         * MODIFIES:      _VR on output if a load operation is performed.
         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
         *                defined).
         */
        .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
        bc       4, \_VR, 111f
#endif
        \_OPCODE \_VR, \_R1, \_R2
111:
        .endm

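        /*
         * Example (with IGNORE_VRSAVE undefined):
         *     LDST _OPCODE=lvx _VR=v0 _R1=r3 _R2=r10
         * expands to
         *     bc   4, 0, 111f    (skip unless CR bit 0, the vrsave bit of v0, is set)
         *     lvx  0, 3, 10      (v0 is loaded from address r3 + r10)
         * 111:
         */
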
        /*
         * Load or store four 'adjacent' vector registers.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : first target vector register
         * _B0    : base register 0 (NOTE: passing r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0, for address computation)
         * _B1    : base register 1
         * _B2    : base register 2
         * _B3    : base register 3
         * _RO    : offset register
         *
         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
         *
         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
         *                operation is performed.
         * IMPLICIT USE:  see LDST
         */
        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
        .endm

        /*
         * Preload/zero two cache lines and save 4 vector registers
         * to memory.
         * Note that the cache operation targets memory *past* the
         * current storage area, which should hopefully hit when
         * this same code is executed on the next two cache lines...
         *
         * This code effectively does
         *   dcbz (_B0 + 64)
         *   dcbz (_B0 + 64 + 32)
         *   stvx _VR+0, (_B0+ 0)
         *   stvx _VR+1, (_B0+16)
         *   stvx _VR+2, (_B0+32)
         *   stvx _VR+3, (_B0+48)
         *
         * _LRU:  may be 'l' or empty. The former variant should be
         *        used when it is conceivable that the memory area is
         *        unlikely to be used in the near future thus making
         *        it a candidate for early eviction from the caches.
         *
         *        If it is likely that the memory area is reused soon
         *        (e.g., save/restore across ISR execution) then the
         *        'stvx' opcode (w/o 'l' suffix) should be used.
         *
         * _VR:   first of four target vector registers; _VR+0,
         *        _VR+1, _VR+2, _VR+3 are saved.
         *
         * _B0:   base address of memory area.
         * _B1:   should contain _B0+16 on entry
         * _B2:   should contain _B0+32 on entry
         * _B3:   should contain _B0+48 on entry
         *
         * _O1:   contains the offset where the four vectors are
         *        stored.
         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
         *        used to address the two cache-lines past the
         *        current memory area.
         *
         * MODIFIES: _O2; contains _O1 + 64 after execution of this
         *        code.
         *
         * NOTES: a different set of four vectors can be addressed
         *        simply by changing the one offset register _O1.
         *
         *        Saving more than 4 registers can simply be
         *        achieved by expanding this macro multiple
         *        times with _O1 and _O2 swapped (new _O1
         *        becomes _O2 = old _O1 + 64) thus stepping
         *        through the memory area.
         *
         */
        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        dcbz  \_B0, \_O2
        dcbz  \_B2, \_O2
        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save eight vector registers by expanding S4VEC_P twice.
         * See notes for S4VEC_P above.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         *
         * MODIFIES: After execution,
         *           _O2 contains original _O1 +  64,
         *           _O1 contains original _O1 + 128
         *
         * NOTES:    Expanding this macro multiple times lets you save
         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
         */
        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        /* Note that the roles of _O1 and _O2 are swapped here */
        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
        .endm

        /*
         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 256
         *           _O2 contains original _O1 + 256 - 64
         */
        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P, S_V0TOV19).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 128
         *           _O2 contains original _O1 + 128 - 64
         */
        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
        .endm

        /*
         * Save all registers to memory area
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 384
         *           _O2 contains original _O1 + 512 - 64
         */
        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
        .endm


        /*
         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
         * We can pass either of them as arguments to another macro which
         * allows us to decide if the main macro uses dcbt or not when
         * we expand it...
         */
        .macro DO_DCBT _RA, _RB
        dcbt \_RA, \_RB
        .endm

        .macro NO_DCBT _RA, _RB
        .endm

        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on a mpc7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) makes obviously no sense because after
         * 1) the registers area is most likely in the cache already.
         *
         * Starting preload after 2) doesn't make much sense either.
         * If the ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch then we shall use DST (and no dcbt).
         */

        /*
         * Symmetric to S4VEC_P above but addresses loading four
         * vector registers from memory.
         *
         * Touches two cache lines past the current memory area
         * and loads four vectors from the current area.
         *
         * Optionally, the DCBT operation may be omitted
         * (when expanding with _DCBT=NO_DCBT).
         * This is useful if the cache was already preloaded
         * by another means (dst instruction).
         *
         * NOTE: We always use the 'LRU' form of lvx: lvxl,
         *       because we deem it unlikely that the context
         *       that was just loaded has to be saved again
         *       to memory in the immediate future.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O2 contains original _O1 + 64.
         *           _VR.._VR+3 loaded from memory.
         */
        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        /* preload/touch 2 lines at offset 64 from _B0 */
        \_DCBT   \_B0, \_O2
        \_DCBT   \_B2, \_O2
        /* load four vectors at offset _O1 from _B0    */
        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Symmetric to S8VEC_P; loads 8 vector registers
         * from memory -- see comments above...
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 +  64.
         *           _VR.._VR+7 loaded from memory.
         */
        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
        .endm

        /*
         * Load volatile vector registers v0..v19 employing
         * the DCBT to preload the cache. The rationale for
         * using DCBT here but not when restoring non-volatile
         * registers is explained above, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 256.
         *           _O2 contains original _O1 + 256 - 64.
         *           VR0..VR19 loaded from memory.
         */
        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load non-volatile vector registers v20..v31.
         * Note that no DCBT is performed since we use
         * DST for preloading the cache during a context
         * switch, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 + 128 - 64.
         *           VR20..VR31 loaded from memory.
         */
        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load all registers from memory area.
         */
        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
        .endm

        /*
         * Compute
         *     _B1 = _B0 + 16
         *     _B2 = _B0 + 32
         *     _B3 = _B0 + 48
         * and load
         *     _RO = 0
         *
         * convenience macro to be expanded before
         * any of the load/store macros that use
         * four base addresses etc.
         *
         * INPUT: _B0 = cache-aligned start of memory area
         *
         * MODIFIES: _B1, _B2, _B3, _RO as described above.
         */
        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
        addi       \_B1, \_B0, 1*VECSIZE
        addi       \_B2, \_B0, 2*VECSIZE
        addi       \_B3, \_B0, 3*VECSIZE
        li         \_RO, 0
        .endm

        /*
         * Prepare for saving general vector registers.
         *
         * If not built with #define IGNORE_VRSAVE then
         *
         *  1) copy vrsave to CRC
         *
         * endif
         *
         *  2) copy vrsave to _VRSAVE_REG
         *  3) preload/zero cache line where vrsave and vscr are stored.
         *  4) compute base addresses from _B0
         *  5) preload/zero first two cache lines (remember that the
         *     first S8VEC_P starts preloading/zeroing at offset 64).
         *
         * INPUT:    'vrsave' register, _B0 (base address of memory area)
         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
         *           _B0 = original _B0 + 32
         *           _B1 = original _B0 + 32 + 16,
         *           _B2 = original _B0 + 32 + 32,
         *           _B3 = original _B0 + 32 + 48,
         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
         */
        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
        mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
        mtcr       \_VRSAVE_REG
#endif
        dcbz       0, \_B0
        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
        dcbz       0, \_B0
        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
        dcbz       0, \_B2
        .endm

        /*
         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
         * must have been loaded from 'vrsave' and 'vscr', respectively,
         * prior to expanding this macro.
         *
         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
         *           _VSCR_VREG  VR  holding 'vscr'   contents
         *           _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG
         */
        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
        .endm

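        /*
         * Note on the VSCR_OFF value (16+12): 'mfvscr' places the 32-bit
         * VSCR in the last word element of the target vector register
         * (element 3 on these big-endian cores), and 'stvewx' stores the
         * word element selected by the low-order bits of the effective
         * address; an offset of 12 modulo 16 therefore picks exactly
         * that word.
         */
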
        /*
         * Load 'vrsave' and 'vscr' from memory.
         *
         * INPUTS:   _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
         *           'vscr', 'vrsave'.
         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
         *           with IGNORE_VRSAVE undefined).
         */
        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
        mtcr      \_SCRATCH_REG
#endif
        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
        mtvscr    \_SCRATCH_VREG
        .endm

        /*
         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
         *
         * INPUT:    _B0
         * MODIFIES: _B0 (as stated above)
         */
        .macro CACHE_DOWNALGN _B0
        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
        .endm

        .text

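        /*
         * C-level view of the two routines below (assumed prototypes;
         * r3 carries the single argument):
         *
         *   void _CPU_save_altivec_volatile(void *buf);
         *   void _CPU_load_altivec_volatile(void *buf);
         *
         * 'buf' is rounded up to the next cache-line boundary, so the
         * caller must provide PPC_CACHE_ALIGNMENT - 1 bytes of slack in
         * addition to the area actually used: one 32-byte line for
         * vrsave/vscr plus 20 * 16 bytes for v0..v19.
         */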
        .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE, r3 still the aligned memory area
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero
         */
        S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr

        .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

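        /*
         * Context-switch entry point: r3 = address of the 'from' thread
         * context (may be NULL, in which case only the restore of the
         * 'to' context is performed), r4 = address of the 'to' context.
         * The offset of the AltiVec area within the context structure
         * is fetched from _CPU_altivec_ctxt_off via CMPOFF.
         */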
        .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

        /* fetch offset of altivec area in context                   */
        CMPOFF    r5
        /* down-align 'to' area to cache-line boundary               */
        add       r4, r4, r5
        CACHE_DOWNALGN r4

        /* Check for PSIM                                            */
        lis       r6, _CPU_altivec_psim_cpu@ha
        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
        cmpli     0, r6, 0
        bne       1f
        /* Skip data-stream instructions on PSIM (not implemented)   */
        dssall
        /* Pre-load new context into cache                           */
        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
        ori       r6, r6, BSTRIDE
        dstt      r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Is 'from' context == NULL ? (then we just do a 'restore') */
        cmpli     0, r3, 0
        beq       1f           /* yes: skip saving 'from' context    */

        /* SAVE NON-VOLATILE REGISTERS                               */

        /* Compute aligned destination pointer (r5 still holds offset
         * to 'altivec' area in context)
         */
        add       r3, r3, r5
        CACHE_DOWNALGN r3

        PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
        /* The manual says reading vscr can take some time -- so read
         * it here (into a volatile vector register) while we wait for
         * cache blocks to be allocated
         */
        mfvscr    v0
        S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
        S_VSCR_VRSAVE r0, v0, r3, r5

1:

        /* LOAD NON-VOLATILE REGISTERS                               */

        /* Advance past vrsave/vscr area                             */
        addi      r4, r4, PPC_CACHE_ALIGNMENT
        L_VSCR_VRSAVE r4, r0, v0
        CMP_BASES r4, r5, r6, r7, r10
        L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
        CMPOFF    r5
        add       r3, r3, r5
        CACHE_DOWNALGN r3
        lis       r5, _CPU_altivec_vrsave_initval@ha
        lwz       r5, _CPU_altivec_vrsave_initval@l(r5)
        stw       r5, VRSAVE_OFF(r3)
        lis       r6, _CPU_altivec_vscr_initval@ha
        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
        stw       r6, VSCR_OFF(r3)
        blr

        /*
         * Change the initial value of VRSAVE.
         * Can be used by initialization code if
         * it is determined that code was compiled
         * with -mvrsave=no. In this case, VRSAVE
         * must be set to all-ones which causes this
         * support code to save/restore *all* registers
         * (only has an effect if IGNORE_VRSAVE is
         * not defined -- otherwise all registers are
         * saved/restored anyway).
         */
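        /*
         * From C this would be called roughly as (assumed prototype)
         *     _CPU_altivec_set_vrsave_initval( -1 );
         * i.e., with the desired initial VRSAVE value in the first
         * argument (r3); the routine also loads VRSAVE itself with
         * that value.
         */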
        .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
        lis       r5, _CPU_altivec_vrsave_initval@ha
        stw       r3, _CPU_altivec_vrsave_initval@l(r5)
        mtvrsave  r3
        blr

#ifdef ALTIVEC_TESTING
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr


        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr

        /* Load all vector registers from memory area.
         * NOTE: This routine is not strictly ABI compliant --
         *       it guarantees that volatile vector registers
         *       have certain values on exit!
         */
        .global _CPU_altivec_load_all
_CPU_altivec_load_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_altivec_save_all
_CPU_altivec_save_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE, r3 still the aligned memory area
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero
         */
        S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr


#if 0
        .gnu_attribute 4,1
        .gnu_attribute 8,1
#endif

#endif
#endif