source: rtems/bsps/powerpc/shared/altivec/vec_sup_asm.S @ 9964895

Last change on this file since 9964895 was 4fd1ff0f, checked in by Sebastian Huber <sebastian.huber@…>, on 03/26/18 at 04:57:10

bsps/powerpc: Move AltiVec support to bsps

This patch is a part of the BSP source reorganization.

Update #3285.

#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management.  */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *         Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *         under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

        .set   v0,   0
        .set   v8,   8
        .set   v16, 16
        .set   v20, 20
        .set   v24, 24
        .set   v28, 28

        .set   r0,   0
        .set   r3,   3
        .set   r4,   4
        /* Do not use r5, since this is used by _CPU_Context_switch() */
        .set   r6,   6
        .set   r7,   7
        .set   r8,   8
        .set   r9,   9
        .set   r10, 10
        .set   r11, 11
        /* Do not use r12, since this is used by _CPU_Context_switch() */

        .set   cr5,  5

        .set   VECSIZE,    16

        .set   VRSAVE_INIT_VAL, 0
        .set   VSCR_INIT_VAL,   0

        .set   VRSAVE_OFF, 16
        .set   VSCR_OFF,   16+12

        .set   ds0,  0

        /* Block size for dst -- in units of 16-bytes */
        .set   BSIZE,   2       /* = 32 bytes */
        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
        .set   BSTRIDE, 32      /*      bytes */

        .data

        .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
        .long   0

        .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
        .long   0

        .text

        .extern _CPU_altivec_psim_cpu
        .extern _CPU_altivec_ctxt_off

        .macro  CMPOFF _B0
        lis     \_B0, _CPU_altivec_ctxt_off@ha
        lwz     \_B0, _CPU_altivec_ctxt_off@l(\_B0)
        .endm
        /* Conditionally load or store a vector _VR to
         * EA(_R1|0 + _R2).
         * If the bit corresponding to _VR is set in CRC
         * then the load/store is performed; otherwise
         * it is skipped.
         * If compiled with IGNORE_VRSAVE defined then
         * the load/store is done unconditionally.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _R2    : 'offset' register for address computation.
         *
         * MODIFIES:      _VR on output if a load operation is performed.
         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
         *                defined).
         */
        .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
        bc       4, \_VR, 111f
#endif
        \_OPCODE \_VR, \_R1, \_R2
111:
        .endm
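        /*
         * Example (illustrative expansion only; operands are the ones
         * used by the save/restore code below):
         *
         *     LDST _OPCODE=stvx _VR=v20 _R1=r3 _R2=r10
         *
         * expands, with IGNORE_VRSAVE undefined, to
         *
         *     bc       4, 20, 111f    # skip if CR bit 20 is clear, i.e.,
         *                             # VRSAVE says v20 is not in use
         *     stvx     20, 3, 10      # store v20 to EA = r3 + r10
         * 111:
         */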

        /*
         * Load or store four 'adjacent' vector registers.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : first target vector register
         * _B0    : base register 0 (each _Bx is passed to LDST as _R1;
         *          NOTE: r0 as a base register uses an implicit ZERO
         *          constant, not the contents of r0, for address
         *          computation)
         * _B1    : base register 1
         * _B2    : base register 2
         * _B3    : base register 3
         * _RO    : offset register
         *
         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
         *
         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
         *                operation is performed.
         * IMPLICIT USE:  see LDST
         */
        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
        .endm
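        /*
         * Example (illustrative only): as expanded by S_V20TOV31 in the
         * context-switch path below,
         *
         *     LDST4 stvxl v28 r3 r8 r6 r7 r10
         *
         * conditionally stores v28, v29, v30 and v31 to r3+r10, r8+r10,
         * r6+r10 and r7+r10, i.e., to four consecutive 16-byte slots
         * when the base registers were prepared by CMP_BASES.
         */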

        /*
         * Preload/zero two cache lines and save 4 vector registers
         * to memory.
         * Note that the cache operation targets memory *past* the
         * current storage area which should hopefully hit when
         * this same code is executed on the next two cache lines...
         *
         * This code effectively does
         *   dcbz (_B0 + 64)
         *   dcbz (_B0 + 64 + 32)
         *   stvx _VR+0, (_B0+ 0)
         *   stvx _VR+1, (_B0+16)
         *   stvx _VR+2, (_B0+32)
         *   stvx _VR+3, (_B0+48)
         *
         * _LRU:  may be 'l' or empty. The former variant should be
         *        used when it is conceivable that the memory area is
         *        unlikely to be used in the near future thus making
         *        it a candidate for early eviction from the caches.
         *
         *        If it is likely that the memory area is reused soon
         *        (e.g., save/restore across ISR execution) then the
         *        'stvx' opcode (w/o 'l' suffix) should be used.
         *
         * _VR:   first of four target vector registers; _VR+0,
         *        _VR+1, _VR+2, _VR+3 are saved.
         *
         * _B0:   base address of memory area.
         * _B1:   should contain _B0+16 on entry
         * _B2:   should contain _B0+32 on entry
         * _B3:   should contain _B0+48 on entry
         *
         * _O1:   contains the offset where the four vectors are
         *        stored.
         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
         *        used to address the two cache-lines past the
         *        current memory area.
         *
         * MODIFIES: _O2; contains _O1 + 64 after execution of this
         *        code.
         *
         * NOTES: a different set of four vectors can be addressed
         *        simply by changing the one offset register _O1.
         *
         *        Saving more than 4 registers can simply be
         *        achieved by expanding this macro multiple
         *        times with _O1 and _O2 swapped (new _O1
         *        becomes _O2 = old _O1 + 64) thus stepping
         *        through the memory area.
         *
         */
        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        dcbz  \_B0, \_O2
        dcbz  \_B2, \_O2
        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save eight vector registers by expanding S4VEC_P twice.
         * See notes for S4VEC_P above.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         *
         * MODIFIES: After execution,
         *           _O2 contains original _O1 +  64,
         *           _O1 contains original _O1 + 128
         *
         * NOTES:    Expanding this macro multiple times lets you save
         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
         */
        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        /* Note that the roles of _O1 and _O2 are swapped here */
        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
        .endm
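        /*
         * Worked example (offsets only, assuming _O1 = 0 on entry):
         *   1st S4VEC_P: _O2 = 64;  zeroes the lines at +64/+96,
         *                stores _VR+0.._VR+3 at offsets 0..48
         *   2nd S4VEC_P: _O1 = _O2 + 64 = 128; zeroes the lines at
         *                +128/+160, stores _VR+4.._VR+7 at offsets 64..112
         * On exit _O2 = 64 and _O1 = 128, as stated above.
         */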

        /*
         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 256
         *           _O2 contains original _O1 + 256 - 64
         */
        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P, S_V0TOV19).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 128
         *           _O2 contains original _O1 + 128 - 64
         */
        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
        .endm

        /*
         * Save all registers to memory area
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 384
         *           _O2 contains original _O1 + 448
         */
        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
        .endm
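        /*
         * Worked example (offsets only, assuming _O1 = 0 on entry):
         *   S8VEC_P v0 : v0..v7   at   0..127, then _O1 = 128, _O2 =  64
         *   S8VEC_P v8 : v8..v15  at 128..255, then _O1 = 256, _O2 = 192
         *   S8VEC_P v16: v16..v23 at 256..383, then _O1 = 384, _O2 = 320
         *   S4VEC_P v24: v24..v27 at 384..447, then _O2 = 448
         *   LDST4   v28: v28..v31 at 448..511 (addressed via _O2)
         * i.e., 32 vectors in 512 bytes; this is why the final LDST4
         * uses _O2 rather than _O1.
         */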


        /*
         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
         * We can pass either of them as arguments to another macro which
         * allows us to decide if the main macro uses dcbt or not when
         * we expand it...
         */
        .macro DO_DCBT _RA, _RB
        dcbt \_RA, \_RB
        .endm

        .macro NO_DCBT _RA, _RB
        .endm

        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on an mpc7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) obviously makes no sense because after
         * 1) the registers area is most likely in the cache already.
         *
         * Starting preload after 2) doesn't make much sense either.
         * If the ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch then we shall use DST (and no dcbt).
         */

        /*
         * Symmetric to S4VEC_P above but addresses loading four
         * vector registers from memory.
         *
         * Touches two cache lines past the current memory area
         * and loads four vectors from the current area.
         *
         * Optionally, the DCBT operation may be omitted
         * (when expanding with _DCBT=NO_DCBT).
         * This is useful if the cache was already preloaded
         * by another means (dst instruction).
         *
         * NOTE: We always use the 'LRU' form of lvx: lvxl,
         *       because we deem it unlikely that the context
         *       that was just loaded has to be saved again
         *       to memory in the immediate future.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O2 contains original _O1 + 64.
         *           _VR.._VR+3 loaded from memory.
         */
        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        /* preload/touch 2 lines at offset 64 from _B0 */
        \_DCBT   \_B0, \_O2
        \_DCBT   \_B2, \_O2
        /* load four vectors at offset 0 from _B0      */
        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Symmetric to S8VEC_P; loads 8 vector registers
         * from memory -- see comments above...
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 +  64.
         *           _VR.._VR+7 loaded from memory.
         */
        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
        .endm

        /*
         * Load volatile vector registers v0..v19 employing
         * the DCBT to preload the cache. The rationale for
         * using DCBT here but not when restoring non-volatile
         * registers is explained above, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 256.
         *           _O2 contains original _O1 + 256 - 64.
         *           VR0..VR19 loaded from memory.
         */
        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load non-volatile vector registers v20..v31.
         * Note that no DCBT is performed since we use
         * DST for preloading the cache during a context
         * switch, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 + 128 - 64.
         *           VR20..VR31 loaded from memory.
         */
        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load all registers from memory area.
         */
        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
        .endm

        /*
         * Compute
         *     _B1 = _B0 + 16
         *     _B2 = _B0 + 32
         *     _B3 = _B0 + 48
         * and load
         *     _RO = 0
         *
         * Convenience macro to be expanded before
         * any of the load/store macros that use
         * four base addresses etc.
         *
         * INPUT: _B0 = cache-aligned start of memory area
         *
         * MODIFIES: _B1, _B2, _B3, _RO as described above.
         */
        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
        addi       \_B1, \_B0, 1*VECSIZE
        addi       \_B2, \_B0, 2*VECSIZE
        addi       \_B3, \_B0, 3*VECSIZE
        li         \_RO, 0
        .endm
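        /*
         * Example (illustrative only): in _CPU_load_altivec_volatile
         * below, 'CMP_BASES r3, r4, r8, r6, r10' leaves
         *     r4 = r3 + 16, r8 = r3 + 32, r6 = r3 + 48, r10 = 0
         * so that a subsequent LDST4 / L4VEC_A / S4VEC_P expansion
         * addresses four consecutive 16-byte vector slots starting
         * at r3 + r10.
         */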

        /*
         * Prepare for saving general vector registers.
         *
         * If not built with #define IGNORE_VRSAVE then
         *
         *  1) copy vrsave to CRC
         *
         * endif
         *
         *  2) copy vrsave to _VRSAVE_REG
         *  3) preload/zero cache line where vrsave and vscr are stored.
         *  4) compute base addresses from _B0
         *  5) preload/zero first two cache lines (remember that the
         *     first S8VEC_P starts preloading/zeroing at offset 64).
         *
         * INPUT:    'vrsave' register, _B0 (base address of memory area)
         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
         *           _B0 = original _B0 + 32
         *           _B1 = original _B0 + 32 + 16,
         *           _B2 = original _B0 + 32 + 32,
         *           _B3 = original _B0 + 32 + 48,
         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
         */
        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
        mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
        mtcr       \_VRSAVE_REG
#endif
        dcbz       0, \_B0
        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
        dcbz       0, \_B0
        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
        dcbz       0, \_B2
        .endm
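        /*
         * Worked example: with _B0 = A (cache aligned) on entry,
         *   dcbz 0, _B0   zeroes the line at A      (vrsave/vscr line)
         *   _B0 = A + 32
         *   dcbz 0, _B0   zeroes the line at A + 32 (slots for the first
         *                                            two vectors)
         *   CMP_BASES     _B1 = A + 48, _B2 = A + 64, _B3 = A + 80, _RO = 0
         *   dcbz 0, _B2   zeroes the line at A + 64 (next two slots)
         */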

        /*
         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
         * must have been loaded from 'vrsave' and 'vscr', respectively,
         * prior to expanding this macro.
         *
         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
         *           _VSCR_VREG  VR  holding 'vscr'   contents
         *           _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG
         */
        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
        .endm
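        /*
         * Resulting layout of the vrsave/vscr cache line (offsets are
         * relative to the cache-aligned start of the whole save area;
         * here _B0 points one cache line, i.e. 32 bytes, past that start):
         *
         *   + 0 .. +15         : not otherwise used by this code
         *   +16 (VRSAVE_OFF)   : saved 'vrsave' (32-bit word)
         *   +28 (VSCR_OFF)     : saved 'vscr'   (32-bit word)
         *   +32, +48, ...      : v0, v1, ... (one 16-byte slot each)
         *
         * VSCR_OFF = 16 + 12 selects word element 3 of the scratch
         * vector for stvewx/lvewx, which is where mfvscr deposits the
         * 32-bit VSCR.
         */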

        /*
         * Load 'vrsave' and 'vscr' from memory.
         *
         * INPUTS:   _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
         *           'vscr', 'vrsave'.
         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
         *           with IGNORE_VRSAVE undefined).
         */
        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
        mtcr      \_SCRATCH_REG
#endif
        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
        mtvscr    \_SCRATCH_VREG
        .endm

        /*
         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
         *
         * INPUT:    _B0
         * MODIFIES: _B0 (as stated above)
         */
        .macro CACHE_DOWNALGN _B0
        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
        .endm
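        /*
         * Example: with LD_PPC_CACHE_ALIGNMENT = 5 this expands to
         * 'rlwinm _B0, _B0, 0, 0, 26', which keeps big-endian bits
         * 0..26 and clears the low 5 bits (_B0 &= ~31).  The entry
         * points below pair it with 'addi _B0, _B0, PPC_CACHE_ALIGNMENT - 1'
         * to round an arbitrary address *up* to the next cache-line
         * boundary, e.g. 0x1004 -> 0x1023 -> 0x1020.
         */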

        .text

        .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r9
#endif

        PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
        /* r0 now contains VRSAVE, r3 still the aligned memory area
         * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero
         */
        S_V0TOV19     _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r9
#endif
        blr

        .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r9
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r8, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r8
        L_V0TOV19 r3, r4, r8, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r9
#endif
        blr

        .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

        /* fetch offset of altivec area in context                   */
        CMPOFF    r8
        /* down-align 'to' area to cache-line boundary               */
        add       r4, r4, r8
        CACHE_DOWNALGN r4

        /* Check for PSIM                                            */
        lis       r6, _CPU_altivec_psim_cpu@ha
        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
        cmpli     0, r6, 0
        bne       1f
        /* Skip data-stream instructions on PSIM (not implemented)   */
        dssall
        /* Pre-load new context into cache                           */
        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
        ori       r6, r6, BSTRIDE
        dstt      r4, r6, ds0
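        /* The control word built in r6 above is
         *   (BSIZE << 24) | (BCNT << 16) | BSTRIDE = (2 << 24) | (7 << 16) | 32,
         * i.e. 7 blocks of 2 vectors (32 bytes) each at a 32-byte stride:
         * 224 bytes in total, covering the vrsave/vscr line plus the 12
         * non-volatile vectors v20..v31 that are reloaded below.
         */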
1:

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r9
#endif

        /* Is 'from' context == NULL ? (then we just do a 'restore') */
        cmpli     0, r3, 0
        beq       1f           /* yes: skip saving 'from' context    */

        /* SAVE NON-VOLATILE REGISTERS                               */

        /* Compute aligned destination pointer (r8 still holds offset
         * to 'altivec' area in context)
         */
        add       r3, r3, r8
        CACHE_DOWNALGN r3

        PREP_FOR_SAVE r0, r3, r8, r6, r7, r10
        /* The manual says reading vscr can take some time, so read
         * it here (into a volatile vector register) while we wait
         * for the cache blocks to be allocated.
         */
        mfvscr    v0
        S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11
        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
        S_VSCR_VRSAVE r0, v0, r3, r8

1:

        /* LOAD NON-VOLATILE REGISTERS                               */

        /* Advance past vrsave/vscr area                             */
        addi      r4, r4, PPC_CACHE_ALIGNMENT
        L_VSCR_VRSAVE r4, r0, v0
        CMP_BASES r4, r8, r6, r7, r10
        L_V20TOV31 r4, r8, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r9
#endif
        blr

        .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
        CMPOFF    r8
        add       r3, r3, r8
        CACHE_DOWNALGN r3
        lis       r8, _CPU_altivec_vrsave_initval@ha
        lwz       r8, _CPU_altivec_vrsave_initval@l(r8)
        stw       r8, VRSAVE_OFF(r3)
        lis       r6, _CPU_altivec_vscr_initval@ha
        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
        stw       r6, VSCR_OFF(r3)
        blr

        /*
         * Change the initial value of VRSAVE.
         * Can be used by initialization code if
         * it is determined that code was compiled
         * with -mvrsave=no. In this case, VRSAVE
         * must be set to all-ones, which causes this
         * support code to save/restore *all* registers
         * (this only has an effect if IGNORE_VRSAVE is
         * not defined -- otherwise all registers are
         * saved/restored anyway).
         */
        .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
        lis       r8, _CPU_altivec_vrsave_initval@ha
        stw       r3, _CPU_altivec_vrsave_initval@l(r8)
        mtvrsave  r3
        blr

#ifdef ALTIVEC_TESTING
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr
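        /* Note: MSR bit 6 (big-endian numbering) is the vector-unit
         * available bit.  1 << (31-6-16) = 0x0200 in the upper halfword,
         * so the 'oris' above sets MSR |= 0x02000000, and the lis/andc
         * pair in msr_VE_off clears the same bit.
         */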


        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr

        /* Load all vector registers from memory area.
         * NOTE: This routine is not strictly ABI compliant --
         *       it guarantees that volatile vector registers
         *       have certain values on exit!
         */
        .global _CPU_altivec_load_all
_CPU_altivec_load_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r9
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r8, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r8
        L_V0TOV31 r3, r4, r8, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r9
#endif
        blr

        .global _CPU_altivec_save_all
_CPU_altivec_save_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r9
#endif

        PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
        /* r0 now contains VRSAVE, r3 still the aligned memory area
         * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero
         */
        S_V0TOV31     _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r9
#endif
        blr


#if 0
        .gnu_attribute 4,1
        .gnu_attribute 8,1
#endif

#endif
#endif