#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management. */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *     Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *     under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software. Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

    .set v0,   0
    .set v8,   8
    .set v16, 16
    .set v20, 20
    .set v24, 24
    .set v28, 28

    .set r0,   0
    .set r3,   3
    .set r4,   4
    .set r5,   5
    .set r6,   6
    .set r7,   7

    .set r10, 10
    .set r11, 11
    .set r12, 12

    .set cr5,  5

    .set VECSIZE, 16

    .set VRSAVE_INIT_VAL, 0
    .set VSCR_INIT_VAL,   0

    .set VRSAVE_OFF, 16
    .set VSCR_OFF,   16+12
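
/*
 * Resulting layout of the first cache line of the save area (a sketch
 * derived from PREP_FOR_SAVE / S_VSCR_VRSAVE below; bytes not listed
 * are apparently just padding):
 *
 *    bytes 16..19 : saved 'vrsave' (VRSAVE_OFF = 16)
 *    bytes 28..31 : saved 'vscr'   (VSCR_OFF   = 28)
 *
 * The vector registers themselves start at the next cache line,
 * i.e., at offset PPC_CACHE_ALIGNMENT from the aligned base.
 */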

    .set ds0,  0

/* Block size for dst -- in units of 16 bytes */
    .set BSIZE,   2         /* = 32 bytes */
    .set BCNT,    12/2+1    /* 12 non-volatile registers + area for vscr/vrsave */
    .set BSTRIDE, 32        /* bytes */
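
/*
 * For reference: 'dst' (and 'dstt') takes its parameters from a GPR
 * laid out as
 *
 *    (BSIZE << 24) | (BCNT << 16) | BSTRIDE
 *
 * (block size in 16-byte units, block count, byte stride). The
 * context-switch code below builds exactly this word:
 *
 *    lis  r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
 *    ori  r6, r6, BSTRIDE
 *    dstt r4, r6, ds0
 *
 * i.e., prefetch BCNT = 7 blocks of 32 bytes, 32 bytes apart, starting
 * at the cache-aligned context area in r4.
 */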

    .data

    .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
    .long 0

    .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
    .long 0

    .text

    .extern _CPU_altivec_psim_cpu
    .extern _CPU_altivec_ctxt_off

    .macro CMPOFF _B0
    lis \_B0, _CPU_altivec_ctxt_off@ha
    lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0)
    .endm

/* Conditionally load or store a vector _VR to
 * EA(_R1|0 + _R2).
 * If the bit corresponding to _VR is set in CRC
 * then the load/store is performed; otherwise
 * it is skipped.
 * If compiled with IGNORE_VRSAVE defined then
 * the load/store is done unconditionally.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _R1    : base register (NOTE: _R1=r0 uses an
 *          implicit ZERO constant, not the contents
 *          of r0) for address computation.
 * _R2    : 'offset' register for address computation.
 *
 * MODIFIES:     _VR on output if a load operation is performed.
 * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE
 *               defined).
 */
    .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
    bc 4, \_VR, 111f
#endif
    \_OPCODE \_VR, \_R1, \_R2
111:
    .endm
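
/*
 * For example (assuming IGNORE_VRSAVE is not defined, and using v0 = 0),
 *
 *    LDST _OPCODE=stvx _VR=v0 _R1=r3 _R2=r10
 *
 * expands to
 *
 *    bc   4, 0, 111f     (skip if CR bit 0 -- 'v0 in use' -- is clear)
 *    stvx 0, r3, r10     (store v0 to r3 + r10)
 * 111:
 *
 * 'bc 4, \_VR, 111f' branches when CR bit \_VR is CLEAR; this works
 * because CRC was previously loaded from 'vrsave' (bit i set <=> vi
 * is in use).
 */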

/*
 * Load or store four 'adjacent' vector registers.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : first target vector register
 * _B0    : base register 0 (NOTE: r0 as a base uses an
 *          implicit ZERO constant, not the contents of r0,
 *          for address computation)
 * _B1    : base register 1
 * _B2    : base register 2
 * _B3    : base register 3
 * _RO    : offset register
 *
 * Memory addresses for _VR, _VR+1, _VR+2, _VR+3
 * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
 *
 * MODIFIES:     _VR, _VR+1, _VR+2, _VR+3 if a load
 *               operation is performed.
 * IMPLICIT USE: see LDST
 */
    .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
    .endm

/*
 * Preload/zero two cache lines and save 4 vector registers
 * to memory.
 * Note that the cache operation targets memory *past* the
 * current storage area, which should hopefully hit when
 * this same code is executed on the next two cache lines...
 *
 * This code effectively does
 *    dcbz (_B0 + 64)
 *    dcbz (_B0 + 64 + 32)
 *    stvx _VR+0, (_B0+ 0)
 *    stvx _VR+1, (_B0+16)
 *    stvx _VR+2, (_B0+32)
 *    stvx _VR+3, (_B0+48)
 *
 * _LRU: may be 'l' or empty. The former variant should be
 *       used when it is conceivable that the memory area is
 *       unlikely to be used in the near future, thus making
 *       it a candidate for early eviction from the caches.
 *
 *       If it is likely that the memory area is reused soon
 *       (e.g., save/restore across ISR execution) then the
 *       'stvx' opcode (w/o 'l' suffix) should be used.
 *
 * _VR:  first of four target vector registers; _VR+0,
 *       _VR+1, _VR+2, _VR+3 are saved.
 *
 * _B0:  base address of memory area.
 * _B1:  should contain _B0+16 on entry
 * _B2:  should contain _B0+32 on entry
 * _B3:  should contain _B0+48 on entry
 *
 * _O1:  contains the offset where the four vectors are
 *       stored.
 *         _VR   -> (_B0 + _O1) = (_B0 + _O1 +  0 )
 *         _VR+1 -> (_B1 + _O1) = (_B0 + _O1 + 16 )
 *         _VR+2 -> (_B2 + _O1) = (_B0 + _O1 + 32 )
 *         _VR+3 -> (_B3 + _O1) = (_B0 + _O1 + 48 )
 * _O2:  is set to _O1 + 64 by this macro. Hence _O2 is
 *       used to address the two cache lines past the
 *       current memory area.
 *
 * MODIFIES: _O2; contains _O1 + 64 after execution of this
 *       code.
 *
 * NOTES: a different set of four vectors can be addressed
 *       simply by changing the one offset register _O1.
 *
 *       Saving more than 4 registers can simply be
 *       achieved by expanding this macro multiple
 *       times with _O1 and _O2 swapped (the new _O1
 *       becomes _O2 = old _O1 + 64), thus stepping
 *       through the memory area.
 */
    .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    dcbz \_B0, \_O2
    dcbz \_B2, \_O2
    LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
    .endm

/*
 * Save eight vector registers by expanding S4VEC_P twice.
 * See notes for S4VEC_P above.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 *
 * MODIFIES: After execution,
 *           _O2 contains original _O1 +  64,
 *           _O1 contains original _O1 + 128
 *
 * NOTES:    Expanding this macro multiple times lets you save
 *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
 */
    .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    /* Note that the roles of _O1 and _O2 are swapped here */
    S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
    .endm
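
/*
 * To illustrate the stepping (a sketch assuming _O1 = 0 on entry;
 * offsets are relative to _B0):
 *
 *  1st S8VEC_P: stores _VR.._VR+7 at offsets 0..127 and dcbz's
 *               lines 64..191 ahead of the stores;
 *               leaves _O1 = 128, _O2 = 64.
 *  2nd S8VEC_P: stores the next eight VRs at offsets 128..255 and
 *               dcbz's lines 192..319; leaves _O1 = 256, _O2 = 192.
 */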

/*
 * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
 *
 * See notes above (for S4VEC_P).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 256
 *           _O2 contains original _O1 + 256 - 64
 */
    .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
    .endm

/*
 * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
 *
 * See notes above (for S4VEC_P, S_V0TOV19).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 128
 *           _O2 contains original _O1 + 128 - 64
 */
    .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
    .endm

/*
 * Save all registers to memory area
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 512 - 128
 *           _O2 contains original _O1 + 512 - 64
 */
    .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P l v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P l v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2
    .endm


/*
 * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
 * We can pass either of them as arguments to another macro, which
 * allows us to decide whether the main macro uses dcbt when
 * we expand it...
 */
    .macro DO_DCBT _RA, _RB
    dcbt \_RA, \_RB
    .endm

    .macro NO_DCBT _RA, _RB
    .endm

/*
 * NOTE REGARDING dcbt VS dst
 *
 * Preloading the cache with memory areas that we soon need
 * can be done either with 'dcbt' or 'dst' instructions
 * "ahead of time".
 * When experimenting (on an mpc7457) I found that the 'dst'
 * stream instruction was very efficient if there is enough
 * time to read ahead. It works well when we do a context
 * switch:
 *
 *   1) start DST on new context to be loaded
 *   2) save old context to memory
 *   3) load new context from memory
 *
 * Because of the interleaved step 2), dst works nicely and
 * 3) finds what it needs in the cache.
 *
 * However, there are situations when there is not much time
 * to start the DST, e.g., because we want to restore
 * a context out of the blue (e.g., after returning
 * from an ISR):
 *
 *   1) save volatile registers to memory/stack
 *   2) execute ISR
 *   3) might do a task context switch
 *   4) when returned to old task context then
 *      reload volatile registers from memory/stack.
 *
 * In this situation, preloading the target memory before
 * or after step 1) obviously makes no sense because after
 * 1) the register area is most likely in the cache already.
 *
 * Starting the preload after 2) doesn't make much sense either.
 * If the ISR doesn't lead to a context switch then it is quite
 * likely that the register area is still in the cache.
 * OTOH, if a context switch happens then the preload after 2)
 * might be useless.
 *
 * This leaves us at step 4) where we want to load immediately.
 * In this case, I found that 'dcbt' works more efficiently,
 * so that's what we use when restoring volatile registers.
 *
 * When restoring the non-volatile VRs during a 'normal'
 * context switch we use DST (and no dcbt).
 */

/*
 * Symmetric to S4VEC_P above but addresses loading four
 * vector registers from memory.
 *
 * Touches two cache lines past the current memory area
 * and loads four vectors from the current area.
 *
 * Optionally, the DCBT operation may be omitted
 * (when expanding with _DCBT=NO_DCBT).
 * This is useful if the cache was already preloaded
 * by another means (dst instruction).
 *
 * NOTE: We always use the 'LRU' form of lvx: lvxl,
 *       because we deem it unlikely that the context
 *       that was just loaded has to be saved again
 *       to memory in the immediate future.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O2 contains original _O1 + 64.
 *           _VR.._VR+3 loaded from memory.
 */
    .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    /* preload/touch 2 lines at offset 64 from _B0 */
    \_DCBT \_B0, \_O2
    \_DCBT \_B2, \_O2
    /* load four vectors at offset 0 from _B0 */
    LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Symmetric to S8VEC_P; loads 8 vector registers
 * from memory -- see comments above...
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 64.
 *           _VR.._VR+7 loaded from memory.
 */
    .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
    .endm

/*
 * Load volatile vector registers v0..v19 employing
 * the DCBT to preload the cache. The rationale for
 * using DCBT here but not when restoring non-volatile
 * registers is explained above, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 256.
 *           _O2 contains original _O1 + 256 - 64.
 *           VR0..VR19 loaded from memory.
 */
    .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Load non-volatile vector registers v20..v31.
 * Note that no DCBT is performed since we use
 * DST for preloading the cache during a context
 * switch, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 128 - 64.
 *           VR20..VR31 loaded from memory.
 */
    .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Load all registers from memory area.
 */
    .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2
    .endm

/*
 * Compute
 *    _B1 = _B0 + 16
 *    _B2 = _B0 + 32
 *    _B3 = _B0 + 48
 * and load
 *    _RO = 0
 *
 * Convenience macro to be expanded before
 * any of the load/store macros that use
 * four base addresses etc.
 *
 * INPUT:    _B0 = cache-aligned start of memory area
 *
 * MODIFIES: _B1, _B2, _B3, _RO as described above.
 */
    .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
    addi \_B1, \_B0, 1*VECSIZE
    addi \_B2, \_B0, 2*VECSIZE
    addi \_B3, \_B0, 3*VECSIZE
    li   \_RO, 0
    .endm

/*
 * Prepare for saving general vector registers.
 *
 * If not built with IGNORE_VRSAVE defined then
 *
 *    1) copy 'vrsave' to CRC
 *
 * endif
 *
 *    2) copy 'vrsave' to _VRSAVE_REG
 *    3) preload/zero cache line where 'vrsave' and 'vscr' are stored.
 *    4) compute base addresses from _B0
 *    5) preload/zero first two cache lines (remember that the
 *       first S8VEC_P starts preloading/zeroing at offset 64).
 *
 * INPUT:    'vrsave' register, _B0 (base address of memory area)
 * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
 *           _B0 = original _B0 + 32
 *           _B1 = original _B0 + 32 + 16,
 *           _B2 = original _B0 + 32 + 32,
 *           _B3 = original _B0 + 32 + 48,
 *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
 */
    .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
    mfvrsave \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
    mtcr \_VRSAVE_REG
#endif
    dcbz 0, \_B0
    addi \_B0, \_B0, PPC_CACHE_ALIGNMENT
    dcbz 0, \_B0
    CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
    dcbz 0, \_B2
    .endm
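
/*
 * Net effect, relative to the original cache-aligned _B0 (a sketch):
 * line 0 (vrsave/vscr) and lines 1..2 (the first four vectors) are
 * zeroed/allocated here, and _B0 is advanced past line 0; the first
 * S8VEC_P expansion then continues zeroing at line 3 (offset 64 from
 * the advanced _B0).
 */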

/*
 * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
 * must have been loaded from 'vrsave' and 'vscr', respectively,
 * prior to expanding this macro.
 *
 * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
 *           _VSCR_VREG  VR  holding 'vscr' contents
 *           _B0 cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG
 */
    .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
    stw    \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    li     \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG
    .endm
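
/*
 * Why stvewx works here (a sketch of the arithmetic): mfvscr leaves
 * 'vscr' in the low-order word (word element 3) of the vector
 * register. With _B0 cache-aligned (and already advanced by 32), the
 * EA computed above is _B0 - 32 + 28, i.e., 0xC modulo 16, so stvewx
 * selects word element (EA >> 2) & 3 = 3 -- exactly the element
 * holding 'vscr'.
 */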

/*
 * Load 'vrsave' and 'vscr' from memory.
 *
 * INPUTS:   _B0 cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
 *           'vscr', 'vrsave'.
 *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
 *           with IGNORE_VRSAVE undefined).
 */
    .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
    lwz      \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    mtvrsave \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
    mtcr     \_SCRATCH_REG
#endif
    li       \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    lvewx    \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
    mtvscr   \_SCRATCH_VREG
    .endm

/*
 * _B0 &= ~(PPC_CACHE_ALIGNMENT - 1)
 *
 * INPUT:    _B0
 * MODIFIES: _B0 (as stated above)
 */
    .macro CACHE_DOWNALGN _B0
    rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
    .endm
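
/*
 * For example, with PPC_CACHE_ALIGNMENT = 32 (LD_PPC_CACHE_ALIGNMENT = 5)
 * the rlwinm keeps bits 0..26 (big-endian numbering), i.e., it clears
 * the five least-significant bits:
 *
 *    _B0 = 0x1234567f  ->  _B0 = 0x12345660
 */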

    .text

    .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero.
     */
    S_V0TOV19 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r12
#endif
    blr

    .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r5, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r5
    L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_Context_restore_altivec
_CPU_Context_restore_altivec:
    /* Restore is like 'switch' but we don't have
     * to save an old context.
     * Move the argument to the second argument slot,
     * load a NULL pointer into the first one, then
     * jump to the 'switch' routine.
     */
    mr r4, r3
    li r3, 0
    b  _CPU_Context_switch_altivec

    .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

    /* fetch offset of altivec area in context */
    CMPOFF r5
    /* down-align 'to' area to cache-line boundary */
    add r4, r4, r5
    CACHE_DOWNALGN r4

    /* Check for PSIM */
    lis   r6, _CPU_altivec_psim_cpu@ha
    lwz   r6, _CPU_altivec_psim_cpu@l(r6)
    cmpli 0, r6, 0
    bne   1f
    /* Skip data-stream instructions on PSIM (not implemented) */
    dssall
    /* Pre-load new context into cache */
    lis  r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
    ori  r6, r6, BSTRIDE
    dstt r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Is the 'from' context == NULL? (then we just do a 'restore') */
    cmpli 0, r3, 0
    beq   1f            /* yes: skip saving the 'from' context */

    /* SAVE NON-VOLATILE REGISTERS */

    /* Compute aligned destination pointer (r5 still holds the offset
     * to the 'altivec' area in the context)
     */
    add r3, r3, r5
    CACHE_DOWNALGN r3

    PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
    /* The manual says reading vscr can take some time, so
     * read it here (into a volatile vector register) while
     * we wait for cache blocks to be allocated.
     */
    mfvscr v0
    S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
    /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
    S_VSCR_VRSAVE r0, v0, r3, r5

1:

    /* LOAD NON-VOLATILE REGISTERS */

    /* Advance past the vrsave/vscr area */
    addi r4, r4, PPC_CACHE_ALIGNMENT
    L_VSCR_VRSAVE r4, r0, v0
    CMP_BASES r4, r5, r6, r7, r10
    L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
    CMPOFF r5
    add r3, r3, r5
    CACHE_DOWNALGN r3
    lis r5, _CPU_altivec_vrsave_initval@ha
    lwz r5, _CPU_altivec_vrsave_initval@l(r5)
    stw r5, VRSAVE_OFF(r3)
    lis r6, _CPU_altivec_vscr_initval@ha
    lwz r6, _CPU_altivec_vscr_initval@l(r6)
    stw r6, VSCR_OFF(r3)
    blr

/*
 * Change the initial value of VRSAVE.
 * Can be used by initialization code if
 * it is determined that code was compiled
 * with -mvrsave=no. In this case, VRSAVE
 * must be set to all-ones, which causes this
 * support code to save/restore *all* registers
 * (this only has an effect if IGNORE_VRSAVE is
 * not defined -- otherwise all registers are
 * saved/restored anyway).
 */
    .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
    lis r5, _CPU_altivec_vrsave_initval@ha
    stw r3, _CPU_altivec_vrsave_initval@l(r5)
    mtvrsave r3
    blr
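
/*
 * E.g., from C (a hypothetical call site; the new initial value is
 * taken from r3, i.e., the first integer argument):
 *
 *    extern void _CPU_altivec_set_vrsave_initval(uint32_t);
 *
 *    _CPU_altivec_set_vrsave_initval( 0xffffffff ); -- mark all VRs in use
 */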

#ifdef ALTIVEC_TESTING
    .global msr_VE_on
msr_VE_on:
    mfmsr r3
    oris  r3, r3, 1<<(31-6-16)
    mtmsr r3
    blr

    .global msr_VE_off
msr_VE_off:
    mfmsr r3
    lis   r4, 1<<(31-6-16)
    andc  r3, r3, r4
    mtmsr r3
    blr


    .global mfvrsave
mfvrsave:
    mfvrsave r3
    blr

    .global mtvrsave
mtvrsave:
    mtvrsave r3
    blr

/* Load all vector registers from memory area.
 * NOTE: This routine is not strictly ABI compliant --
 *       it guarantees that volatile vector registers
 *       have certain values on exit!
 */
    .global _CPU_altivec_load_all
_CPU_altivec_load_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r5, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r5
    L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_altivec_save_all
_CPU_altivec_save_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero.
     */
    S_V0TOV31 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r12
#endif
    blr


#if 0
    .gnu_attribute 4,1
    .gnu_attribute 8,1
#endif

#endif /* ALTIVEC_TESTING */
#endif /* __ALTIVEC__ */