source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ 5826a1b

Last change on this file since 5826a1b was 1869bb7, checked in by Sebastian Huber <sebastian.huber@…>, on 05/18/12 at 13:47:23

powerpc: Simplify context switch

PowerPC cores with the SPE (Signal Processing Extension) have 64-bit
general-purpose registers. The SPE context switch code has been merged
with the standard context switch code. The context switch may use cache
operations to increase performance. The context is ensured to be
32-byte aligned (PPC_DEFAULT_CACHE_LINE_SIZE). This
increases the overall memory size of the context area in the thread
control block slightly. The general-purpose registers GPR2 and GPR13
are no longer part of the context. The BSP must initialize these
registers during startup (usually initialized by the eabi() function).

The new BSP option BSP_USE_DATA_CACHE_BLOCK_TOUCH can be used to enable
the dcbt instruction in the context switch.

The new BSP option BSP_USE_SYNC_IN_CONTEXT_SWITCH can be used to enable
sync and isync instructions in the context switch. This should not be
necessary in most cases.

  • Property mode set to 100644
File size: 24.5 KB
#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management.  */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *         Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *         under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

        .set   v0,   0
        .set   v8,   8
        .set   v16, 16
        .set   v20, 20
        .set   v24, 24
        .set   v28, 28

        .set   r0,   0
        .set   r3,   3
        .set   r4,   4
        .set   r5,   5
        .set   r6,   6
        .set   r7,   7

        .set   r10, 10
        .set   r11, 11
        .set   r12, 12

        .set   cr5,  5

        .set   VECSIZE,    16

        .set   VRSAVE_INIT_VAL, 0
        .set   VSCR_INIT_VAL,   0

        .set   VRSAVE_OFF, 16
        .set   VSCR_OFF,   16+12

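        /*
         * Resulting layout of the cache-aligned save area used below:
         * 'vrsave' is stored at offset VRSAVE_OFF (16) and the VSCR word
         * at offset VSCR_OFF (28) within the first 32-byte cache line;
         * the vector registers themselves follow from offset 32 onwards,
         * 16 bytes (VECSIZE) each.
         */
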
        .set   ds0,  0

        /* Block size for dst -- in units of 16-bytes */
        .set   BSIZE,   2       /* = 32 bytes */
        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
        .set   BSTRIDE, 32      /*      bytes */

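        /*
         * These three constants are combined into the control word for
         * the 'dstt' instruction in _CPU_Context_switch_altivec below,
         * i.e., (BSIZE<<24) | (BCNT<<16) | BSTRIDE: a stream of 7 blocks
         * of 32 bytes with a 32-byte stride, covering the vrsave/vscr
         * line plus the 12 non-volatile vector registers (32 + 12*16
         * = 224 bytes).
         */
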
        .data

        .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
        .long   0

        .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
        .long   0

        .text

        .extern _CPU_altivec_psim_cpu
        .extern _CPU_altivec_ctxt_off

        .macro  CMPOFF _B0
        lis     \_B0, _CPU_altivec_ctxt_off@ha
        lwz     \_B0, _CPU_altivec_ctxt_off@l(\_B0)
        .endm

        /* Conditionally load or store a vector _VR to
         *  EA(_R1|0 + _R2)
         * If the bit corresponding to _VR is set in CRC
         * then the load/store is performed, otherwise
         * it is skipped.
         * If compiled with IGNORE_VRSAVE defined then
         * the load/store is done unconditionally.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _R2    : 'offset' register for address computation.
         *
         * MODIFIES:      _VR on output if a load operation is performed.
         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
         *                defined).
         */
        .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
        bc       4, \_VR, 111f
#endif
        \_OPCODE \_VR, \_R1, \_R2
111:
        .endm

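        /*
         * Example (with IGNORE_VRSAVE undefined):
         *     LDST _OPCODE=lvx _VR=v0 _R1=r3 _R2=r10
         * expands to
         *     bc   4, 0, 111f    (skip unless CR bit 0, the vrsave bit of v0, is set)
         *     lvx  0, 3, 10      (v0 is loaded from address r3 + r10)
         * 111:
         */
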
        /*
         * Load or store four 'adjacent' vector registers.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : first target vector register
         * _B0    : base register 0 (NOTE: passing r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0, for address computation)
         * _B1    : base register 1
         * _B2    : base register 2
         * _B3    : base register 3
         * _RO    : offset register
         *
         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
         *
         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
         *                operation is performed.
         * IMPLICIT USE:  see LDST
         */
        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
        .endm

        /*
         * Preload/zero two cache lines and save 4 vector registers
         * to memory.
         * Note that the cache operation targets memory *past* the
         * current storage area, which should hopefully hit when
         * this same code is executed on the next two cache lines...
         *
         * This code effectively does
         *   dcbz (_B0 + 64)
         *   dcbz (_B0 + 64 + 32)
         *   stvx _VR+0, (_B0+ 0)
         *   stvx _VR+1, (_B0+16)
         *   stvx _VR+2, (_B0+32)
         *   stvx _VR+3, (_B0+48)
         *
         * _LRU:  may be 'l' or empty. The former variant should be
         *        used when it is conceivable that the memory area is
         *        unlikely to be used in the near future thus making
         *        it a candidate for early eviction from the caches.
         *
         *        If it is likely that the memory area is reused soon
         *        (e.g., save/restore across ISR execution) then the
         *        'stvx' opcode (w/o 'l' suffix) should be used.
         *
         * _VR:   first of four target vector registers; _VR+0,
         *        _VR+1, _VR+2, _VR+3 are saved.
         *
         * _B0:   base address of memory area.
         * _B1:   should contain _B0+16 on entry
         * _B2:   should contain _B0+32 on entry
         * _B3:   should contain _B0+48 on entry
         *
         * _O1:   contains the offset where the four vectors are
         *        stored.
         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
         *        used to address the two cache-lines past the
         *        current memory area.
         *
         * MODIFIES: _O2; contains _O1 + 64 after execution of this
         *        code.
         *
         * NOTES: a different set of four vectors can be addressed
         *        simply by changing the one offset register _O1.
         *
         *        Saving more than 4 registers can simply be
         *        achieved by expanding this macro multiple
         *        times with _O1 and _O2 swapped (new _O1
         *        becomes _O2 = old _O1 + 64) thus stepping
         *        through the memory area.
         *
         */
        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        dcbz  \_B0, \_O2
        dcbz  \_B2, \_O2
        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save eight vector registers by expanding S4VEC_P twice.
         * See notes for S4VEC_P above.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         *
         * MODIFIES: After execution,
         *           _O2 contains original _O1 +  64,
         *           _O1 contains original _O1 + 128
         *
         * NOTES:    Expanding this macro multiple times lets you save
         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
         */
        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        /* Note that the roles of _O1 and _O2 are swapped here */
        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
        .endm

        /*
         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 256
         *           _O2 contains original _O1 + 256 - 64
         */
        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P, S_V0TOV19).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 128
         *           _O2 contains original _O1 + 128 - 64
         */
        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
        .endm

        /*
         * Save all registers to memory area
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 384
         *           _O2 contains original _O1 + 512 - 64
         */
        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
        .endm


        /*
         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
         * We can pass either of them as arguments to another macro which
         * allows us to decide if the main macro uses dcbt or not when
         * we expand it...
         */
        .macro DO_DCBT _RA, _RB
        dcbt \_RA, \_RB
        .endm

        .macro NO_DCBT _RA, _RB
        .endm

        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on a mpc7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) makes obviously no sense because after
         * 1) the registers area is most likely in the cache already.
         *
         * Starting preload after 2) doesn't make much sense either.
         * If the ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch then we shall use DST (and no dcbt).
         */

        /*
         * Symmetric to S4VEC_P above but addresses loading four
         * vector registers from memory.
         *
         * Touches two cache lines past the current memory area
         * and loads four vectors from the current area.
         *
         * Optionally, the DCBT operation may be omitted
         * (when expanding with _DCBT=NO_DCBT).
         * This is useful if the cache was already preloaded
         * by another means (dst instruction).
         *
         * NOTE: We always use the 'LRU' form of lvx: lvxl,
         *       because we deem it unlikely that the context
         *       that was just loaded has to be saved again
         *       to memory in the immediate future.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O2 contains original _O1 + 64.
         *           _VR.._VR+3 loaded from memory.
         */
        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        /* preload/touch 2 lines at offset 64 from _B0 */
        \_DCBT   \_B0, \_O2
        \_DCBT   \_B2, \_O2
        /* load four vectors at offset _O1 from _B0    */
        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Symmetric to S8VEC_P; loads 8 vector registers
         * from memory -- see comments above...
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 +  64.
         *           _VR.._VR+7 loaded from memory.
         */
        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
        .endm

        /*
         * Load volatile vector registers v0..v19 employing
         * the DCBT to preload the cache. The rationale for
         * using DCBT here but not when restoring non-volatile
         * registers is explained above, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 256.
         *           _O2 contains original _O1 + 256 - 64.
         *           VR0..VR19 loaded from memory.
         */
        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load non-volatile vector registers v20..v31.
         * Note that no DCBT is performed since we use
         * DST for preloading the cache during a context
         * switch, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 + 128 - 64.
         *           VR20..VR31 loaded from memory.
         */
        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load all registers from memory area.
         */
        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
        .endm

        /*
         * Compute
         *     _B1 = _B0 + 16
         *     _B2 = _B0 + 32
         *     _B3 = _B0 + 48
         * and load
         *     _RO = 0
         *
         * convenience macro to be expanded before
         * any of the load/store macros that use
         * four base addresses etc.
         *
         * INPUT: _B0 = cache-aligned start of memory area
         *
         * MODIFIES: _B1, _B2, _B3, _RO as described above.
         */
        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
        addi       \_B1, \_B0, 1*VECSIZE
        addi       \_B2, \_B0, 2*VECSIZE
        addi       \_B3, \_B0, 3*VECSIZE
        li         \_RO, 0
        .endm

        /*
         * Prepare for saving general vector registers.
         *
         * If not built with #define IGNORE_VRSAVE then
         *
         *  1) copy vrsave to CRC
         *
         * endif
         *
         *  2) copy vrsave to _VRSAVE_REG
         *  3) preload/zero cache line where vrsave and vscr are stored.
         *  4) compute base addresses from _B0
         *  5) preload/zero first two cache lines (remember that the
         *     first S8VEC_P starts preloading/zeroing at offset 64).
         *
         * INPUT:    'vrsave' register, _B0 (base address of memory area)
         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
         *           _B0 = original _B0 + 32
         *           _B1 = original _B0 + 32 + 16,
         *           _B2 = original _B0 + 32 + 32,
         *           _B3 = original _B0 + 32 + 48,
         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
         */
        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
        mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
        mtcr       \_VRSAVE_REG
#endif
        dcbz       0, \_B0
        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
        dcbz       0, \_B0
        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
        dcbz       0, \_B2
        .endm

        /*
         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
         * must have been loaded from 'vrsave' and 'vscr', respectively,
         * prior to expanding this macro.
         *
         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
         *           _VSCR_VREG  VR  holding 'vscr'   contents
         *           _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG
         */
        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
        .endm

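        /*
         * Note on the VSCR_OFF value (16+12): 'mfvscr' places the 32-bit
         * VSCR in the last word element of the target vector register
         * (element 3 on these big-endian cores), and 'stvewx' stores the
         * word element selected by the low-order bits of the effective
         * address; an offset of 12 modulo 16 therefore picks exactly
         * that word.
         */
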
        /*
         * Load 'vrsave' and 'vscr' from memory.
         *
         * INPUTS:   _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
         *           'vscr', 'vrsave'.
         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
         *           with IGNORE_VRSAVE undefined).
         */
        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
        mtcr      \_SCRATCH_REG
#endif
        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
        mtvscr    \_SCRATCH_VREG
        .endm

        /*
         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
         *
         * INPUT:    _B0
         * MODIFIES: _B0 (as stated above)
         */
        .macro CACHE_DOWNALGN _B0
        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
        .endm

        .text

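        /*
         * C-level view of the two routines below (assumed prototypes;
         * r3 carries the single argument):
         *
         *   void _CPU_save_altivec_volatile(void *buf);
         *   void _CPU_load_altivec_volatile(void *buf);
         *
         * 'buf' is rounded up to the next cache-line boundary, so the
         * caller must provide PPC_CACHE_ALIGNMENT - 1 bytes of slack in
         * addition to the area actually used: one 32-byte line for
         * vrsave/vscr plus 20 * 16 bytes for v0..v19.
         */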
        .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE, r3 still the aligned memory area
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero
         */
        S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr

        .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

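        /*
         * Context-switch entry point: r3 = address of the 'from' thread
         * context (may be NULL, in which case only the restore of the
         * 'to' context is performed), r4 = address of the 'to' context.
         * The offset of the AltiVec area within the context structure
         * is fetched from _CPU_altivec_ctxt_off via CMPOFF.
         */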
        .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

        /* fetch offset of altivec area in context                   */
        CMPOFF    r5
        /* down-align 'to' area to cache-line boundary               */
        add       r4, r4, r5
        CACHE_DOWNALGN r4

        /* Check for PSIM                                            */
        lis       r6, _CPU_altivec_psim_cpu@ha
        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
        cmpli     0, r6, 0
        bne       1f
        /* Skip data-stream instructions on PSIM (not implemented)   */
        dssall
        /* Pre-load new context into cache                           */
        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
        ori       r6, r6, BSTRIDE
        dstt      r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Is 'from' context == NULL ? (then we just do a 'restore') */
        cmpli     0, r3, 0
        beq       1f           /* yes: skip saving 'from' context    */

        /* SAVE NON-VOLATILE REGISTERS                               */

        /* Compute aligned destination pointer (r5 still holds offset
         * to 'altivec' area in context)
         */
        add       r3, r3, r5
        CACHE_DOWNALGN r3

        PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
        /* The manual says reading vscr can take some time -- so read
         * it here (into a volatile vector register) while we wait for
         * cache blocks to be allocated
         */
        mfvscr    v0
        S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
        S_VSCR_VRSAVE r0, v0, r3, r5

1:

        /* LOAD NON-VOLATILE REGISTERS                               */

        /* Advance past vrsave/vscr area                             */
        addi      r4, r4, PPC_CACHE_ALIGNMENT
        L_VSCR_VRSAVE r4, r0, v0
        CMP_BASES r4, r5, r6, r7, r10
        L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
        CMPOFF    r5
        add       r3, r3, r5
        CACHE_DOWNALGN r3
        lis       r5, _CPU_altivec_vrsave_initval@ha
        lwz       r5, _CPU_altivec_vrsave_initval@l(r5)
        stw       r5, VRSAVE_OFF(r3)
        lis       r6, _CPU_altivec_vscr_initval@ha
        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
        stw       r6, VSCR_OFF(r3)
        blr

        /*
         * Change the initial value of VRSAVE.
         * Can be used by initialization code if
         * it is determined that code was compiled
         * with -mvrsave=no. In this case, VRSAVE
         * must be set to all-ones which causes this
         * support code to save/restore *all* registers
         * (only has an effect if IGNORE_VRSAVE is
         * not defined -- otherwise all registers are
         * saved/restored anyway).
         */
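        /*
         * From C this would be called roughly as (assumed prototype)
         *     _CPU_altivec_set_vrsave_initval( -1 );
         * i.e., with the desired initial VRSAVE value in the first
         * argument (r3); the routine also loads VRSAVE itself with
         * that value.
         */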
        .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
        lis       r5, _CPU_altivec_vrsave_initval@ha
        stw       r3, _CPU_altivec_vrsave_initval@l(r5)
        mtvrsave  r3
        blr

#ifdef ALTIVEC_TESTING
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr


        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr

        /* Load all vector registers from memory area.
         * NOTE: This routine is not strictly ABI compliant --
         *       it guarantees that volatile vector registers
         *       have certain values on exit!
         */
        .global _CPU_altivec_load_all
_CPU_altivec_load_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_altivec_save_all
_CPU_altivec_save_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE, r3 still the aligned memory area
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero
         */
        S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr


#if 0
        .gnu_attribute 4,1
        .gnu_attribute 8,1
#endif

#endif
#endif