source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ fbee4ff

Last change on this file since fbee4ff was fbee4ff, checked in by Till Straumann <strauman@…>, on 12/02/09 at 01:33:51

2009-12-01 Till Straumann <strauman@…>

  • Makefile.am, mpc6xx/altivec: new directory implementing support for AltiVec context saving/restoring.
#ifdef __ALTIVEC__

#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

        .set   v0,   0
        .set   v8,   8
        .set   v16, 16
        .set   v20, 20
        .set   v24, 24
        .set   v28, 28

        .set   r0,   0
        .set   r3,   3
        .set   r4,   4
        .set   r5,   5
        .set   r6,   6
        .set   r7,   7

        .set   r10, 10
        .set   r11, 11
        .set   r12, 12

        .set   cr5,  5

        .set   VECSIZE,    16

        .set   VRSAVE_INIT_VAL, 0
        .set   VSCR_INIT_VAL,   0

        .set   VRSAVE_OFF, 16
        .set   VSCR_OFF,   16+12

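
        /*
         * For illustration, the first cache line of a save area is
         * laid out as follows (see S_VSCR_VRSAVE / L_VSCR_VRSAVE below):
         *   bytes  0..15 : unused
         *   bytes 16..19 : 'vrsave'           (VRSAVE_OFF)
         *   bytes 28..31 : 'vscr' (low word)  (VSCR_OFF = 16 + 12)
         * The vector registers themselves start on the next cache line.
         */
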
        .set   ds0,  0

        /* Block size for dst -- in units of 16 bytes */
        .set   BSIZE,   2       /* = 32 bytes */
        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
        .set   BSTRIDE, 32      /*      bytes */
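
        /*
         * For reference: the three values above are combined into the
         * control word of the 'dstt' instruction issued in
         * _CPU_Context_switch_altivec below. Per the AltiVec
         * architecture that control operand encodes
         *   bits  3.. 7 : block size (in units of 16-byte vectors)
         *   bits  8..15 : block count
         *   bits 16..31 : signed block stride (in bytes)
         * which is why the code builds (BSIZE<<24) | (BCNT<<16) | BSTRIDE
         * with an lis/ori pair.
         */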

        .data

        .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
        .long   0

        .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
        .long   0

        .text

        .extern _CPU_altivec_psim_cpu
        .extern _CPU_altivec_ctxt_off

        .macro  CMPOFF _B0
        lis     \_B0, _CPU_altivec_ctxt_off@ha
        lwz     \_B0, _CPU_altivec_ctxt_off@l(\_B0)
        .endm

        /* Conditionally load or store a vector _VR to
         * EA(_R1|0 + _R2).
         * If the CRC bit corresponding to _VR is set
         * then the load/store is performed; otherwise
         * it is skipped.
         * If compiled with IGNORE_VRSAVE defined then
         * the load/store is done unconditionally.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _R2    : 'offset' register for address computation.
         *
         * MODIFIES:      _VR on output if a load operation is performed.
         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
         *                defined).
         */
        .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
        bc       4, \_VR, 111f
#endif
        \_OPCODE \_VR, \_R1, \_R2
111:
        .endm
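
        /*
         * For illustration, 'LDST _OPCODE=stvxl _VR=v0 _R1=r3 _R2=r10'
         * expands (without IGNORE_VRSAVE) to roughly
         *
         *      bc    4, 0, 111f   # skip if CR bit 0 (vrsave bit of v0) is clear
         *      stvxl 0, 3, 10
         * 111:
         *
         * i.e. only vectors whose vrsave bit is set are actually transferred.
         */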

        /*
         * Load or store four 'adjacent' vector registers.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : first target vector register
         * _B0    : base register 0 (NOTE: passing r0 as a base
         *          register uses an implicit ZERO constant, not
         *          the contents of r0, for address computation)
         * _B1    : base register 1
         * _B2    : base register 2
         * _B3    : base register 3
         * _RO    : offset register
         *
         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
         *
         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
         *                operation is performed.
         * IMPLICIT USE:  see LDST
         */
        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
        .endm

        /*
         * Preload/zero two cache lines and save 4 vector registers
         * to memory.
         * Note that the cache operation targets memory *past* the
         * current storage area, which should hopefully hit when
         * this same code is executed on the next two cache lines...
         *
         * This code effectively does
         *   dcbz (_B0 + 64)
         *   dcbz (_B0 + 64 + 32)
         *   stvx _VR+0, (_B0+ 0)
         *   stvx _VR+1, (_B0+16)
         *   stvx _VR+2, (_B0+32)
         *   stvx _VR+3, (_B0+48)
         *
         * _LRU:  may be 'l' or empty. The former variant should be
         *        used when it is conceivable that the memory area is
         *        unlikely to be used in the near future thus making
         *        it a candidate for early eviction from the caches.
         *
         *        If it is likely that the memory area is reused soon
         *        (e.g., save/restore across ISR execution) then the
         *        'stvx' opcode (w/o 'l' suffix) should be used.
         *
         * _VR:   first of four target vector registers; _VR+0,
         *        _VR+1, _VR+2, _VR+3 are saved.
         *
         * _B0:   base address of memory area.
         * _B1:   should contain _B0+16 on entry
         * _B2:   should contain _B0+32 on entry
         * _B3:   should contain _B0+48 on entry
         *
         * _O1:   contains the offset where the four vectors are
         *        stored.
         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
         *        used to address the two cache lines past the
         *        current memory area.
         *
         * MODIFIES: _O2; contains _O1 + 64 after execution of this
         *        code.
         *
         * NOTES: a different set of four vectors can be addressed
         *        simply by changing the one offset register _O1.
         *
         *        Saving more than 4 registers can simply be
         *        achieved by expanding this macro multiple
         *        times with _O1 and _O2 swapped (new _O1
         *        becomes _O2 = old _O1 + 64) thus stepping
         *        through the memory area.
         *
         */
        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        dcbz  \_B0, \_O2
        dcbz  \_B2, \_O2
        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save eight vector registers by expanding S4VEC_P twice.
         * See notes for S4VEC_P above.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         *
         * MODIFIES: After execution,
         *           _O2 contains original _O1 +  64,
         *           _O1 contains original _O1 + 128
         *
         * NOTES:    Expanding this macro multiple times lets you save
         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
         */
        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        /* Note that the roles of _O1 and _O2 are swapped here */
        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
        .endm

        /*
         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 256
         *           _O2 contains original _O1 + 256 - 64
         */
        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P, S_V0TOV19).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 128
         *           _O2 contains original _O1 + 128 - 64
         */
        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
        .endm

        /*
         * Save all vector registers to memory area
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 512 - 128
         *           _O2 contains original _O1 + 512 - 64
         */
        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
        .endm
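
        /*
         * For illustration, the save area built by the macros above
         * (as used by _CPU_altivec_save_all / _CPU_altivec_load_all)
         * looks like this; offsets are relative to the cache-aligned
         * start of the area:
         *
         *   0x000          vrsave/vscr cache line (see VRSAVE_OFF / VSCR_OFF)
         *   0x020 + n*16   v<n>, n = 0..31  (v0 at 0x020, v31 at 0x210)
         *
         * The per-task context area handled by _CPU_Context_switch_altivec
         * has the same header line but is followed directly by v20..v31,
         * i.e. it spans the 7 cache lines described by the
         * BSIZE/BCNT/BSTRIDE stream parameters above.
         */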


        /*
         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
         * We can pass either of them as arguments to another macro which
         * allows us to decide if the main macro uses dcbt or not when
         * we expand it...
         */
        .macro DO_DCBT _RA, _RB
        dcbt \_RA, \_RB
        .endm

        .macro NO_DCBT _RA, _RB
        .endm

        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on an MPC7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) obviously makes no sense because after
         * 1) the register area is most likely in the cache already.
         *
         * Starting the preload after 2) doesn't make much sense either.
         * If the ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch we use DST (and no dcbt).
         */

        /*
         * Symmetric to S4VEC_P above but addresses loading four
         * vector registers from memory.
         *
         * Touches two cache lines past the current memory area
         * and loads four vectors from the current area.
         *
         * Optionally, the DCBT operation may be omitted
         * (when expanding with _DCBT=NO_DCBT).
         * This is useful if the cache was already preloaded
         * by another means (dst instruction).
         *
         * NOTE: We always use the 'LRU' form of lvx: lvxl,
         *       because we deem it unlikely that the context
         *       that was just loaded has to be saved again
         *       to memory in the immediate future.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O2 contains original _O1 + 64.
         *           _VR.._VR+3 loaded from memory.
         */
        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        /* preload/touch 2 lines at offset 64 from _B0 */
        \_DCBT   \_B0, \_O2
        \_DCBT   \_B2, \_O2
        /* load four vectors at offset 0 from _B0      */
        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Symmetric to S8VEC_P; loads 8 vector registers
         * from memory -- see comments above...
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 +  64.
         *           _VR.._VR+7 loaded from memory.
         */
        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
        .endm

        /*
         * Load volatile vector registers v0..v19 employing
         * the DCBT to preload the cache. The rationale for
         * using DCBT here but not when restoring non-volatile
         * registers is explained above, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 256.
         *           _O2 contains original _O1 + 256 - 64.
         *           VR0..VR19 loaded from memory.
         */
        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load non-volatile vector registers v20..v31.
         * Note that no DCBT is performed since we use
         * DST for preloading the cache during a context
         * switch, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 + 128 - 64.
         *           VR20..VR31 loaded from memory.
         */
        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load all registers from memory area.
         */
        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
        .endm

        /*
         * Compute
         *     _B1 = _B0 + 16
         *     _B2 = _B0 + 32
         *     _B3 = _B0 + 48
         * and load
         *     _RO = 0
         *
         * convenience macro to be expanded before
         * any of the load/store macros that use
         * four base addresses etc.
         *
         * INPUT: _B0 = cache-aligned start of memory area
         *
         * MODIFIES: _B1, _B2, _B3, _RO as described above.
         */
        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
        addi       \_B1, \_B0, 1*VECSIZE
        addi       \_B2, \_B0, 2*VECSIZE
        addi       \_B3, \_B0, 3*VECSIZE
        li         \_RO, 0
        .endm
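
        /* For example, 'CMP_BASES r3, r4, r5, r6, r10' leaves
         * r4 = r3+16, r5 = r3+32, r6 = r3+48 and r10 = 0, which is
         * how the entry points below set up their base registers.
         */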

        /*
         * Prepare for saving general vector registers.
         *
         * If not built with #define IGNORE_VRSAVE then
         *
         *  1) copy vrsave to CRC
         *
         * endif
         *
         *  2) copy vrsave to _VRSAVE_REG
         *  3) preload/zero cache line where vrsave and vscr are stored.
         *  4) compute base addresses from _B0
         *  5) preload/zero first two cache lines (remember that the
         *     first S8VEC_P starts preloading/zeroing at offset 64).
         *
         * INPUT:    'vrsave' register, _B0 (base address of memory area)
         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
         *           _B0 = original _B0 + 32
         *           _B1 = original _B0 + 32 + 16,
         *           _B2 = original _B0 + 32 + 32,
         *           _B3 = original _B0 + 32 + 48,
         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
         */
        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
        mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
        mtcr       \_VRSAVE_REG
#endif
        dcbz       0, \_B0
        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
        dcbz       0, \_B0
        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
        dcbz       0, \_B2
        .endm
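
        /* For illustration: with _B0 initially pointing at the
         * cache-aligned start of the area, the three dcbz above zero
         * the lines at area offsets 0 (vrsave/vscr), 32 and 64 (the
         * first two vector lines); each subsequent S4VEC_P expansion
         * then stores to two cache lines and zeroes the two lines
         * immediately following them.
         */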

        /*
         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
         * must have been loaded from 'vrsave' and 'vscr', respectively,
         * prior to expanding this macro.
         *
         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
         *           _VSCR_VREG  VR  holding 'vscr'   contents
         *           _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG
         */
        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
        .endm
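
        /* Worked example: when expanded with _B0 already advanced past
         * the vrsave/vscr line (as PREP_FOR_SAVE leaves it), the stw
         * above lands at header offset VRSAVE_OFF (16) and the stvewx
         * at header offset VSCR_OFF (28). stvewx stores the word
         * element of _VSCR_VREG selected by the effective address --
         * element 3 here, which is where mfvscr deposits VSCR.
         */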

        /*
         * Load 'vrsave' and 'vscr' from memory.
         *
         * INPUTS:   _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
         *           'vscr', 'vrsave'.
         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
         *           with IGNORE_VRSAVE undefined).
         */
        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
        mtcr      \_SCRATCH_REG
#endif
        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
        mtvscr    \_SCRATCH_VREG
        .endm

        /*
         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
         *
         * INPUT:    _B0
         * MODIFIES: _B0 (as stated above)
         */
        .macro CACHE_DOWNALGN _B0
        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
        .endm
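
        /* For example, with LD_PPC_CACHE_ALIGNMENT = 5 this expands to
         *     rlwinm _B0, _B0, 0, 0, 26
         * which keeps bits 0..26 (big-endian bit numbering), clearing
         * the five least-significant bits, i.e. _B0 &= ~31.
         */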

        .text

        .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 has been advanced past the
         * vrsave/vscr cache line to the start of the vector save area,
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero.
         */
        S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr

        .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_restore_altivec
_CPU_Context_restore_altivec:
        /* Restore is like 'switch' but we don't have
         * to save an old context.
         * Move the argument to the second arg, load a NULL pointer
         * into the first one, then jump to the 'switch' routine.
         */
        mr        r4, r3
        li        r3, 0
        b _CPU_Context_switch_altivec

        .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

        /* fetch offset of altivec area in context                   */
        CMPOFF    r5
        /* down-align 'to' area to cache-line boundary               */
        add       r4, r4, r5
        CACHE_DOWNALGN r4

        /* Check for PSIM: the data-stream instructions below are
         * not implemented there, so skip them on PSIM.
         */
        lis       r6, _CPU_altivec_psim_cpu@ha
        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
        cmpli     0, r6, 0
        bne       1f
        dssall
        /* Pre-load new context into cache                           */
        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
        ori       r6, r6, BSTRIDE
        dstt      r4, r6, ds0
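        /* The stream programmed above (BSIZE = 2 vectors, BCNT = 7
         * blocks, BSTRIDE = 32 bytes) touches 7 consecutive cache
         * lines starting at r4: the vrsave/vscr line plus the 12
         * non-volatile vectors v20..v31 that are loaded below.
         */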
1:

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Is the 'from' context NULL? (then we just do a 'restore')  */
        cmpli     0, r3, 0
        beq       1f           /* yes: skip saving 'from' context    */

        /* SAVE NON-VOLATILE REGISTERS                               */

        /* Compute aligned destination pointer (r5 still holds offset
         * to 'altivec' area in context)
         */
        add       r3, r3, r5
        CACHE_DOWNALGN r3

        PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
        /* The manual says reading vscr can take some time -- do
         * read it here (into a volatile vector register) while
         * we wait for cache blocks to be allocated.
         */
        mfvscr    v0
        S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
        S_VSCR_VRSAVE r0, v0, r3, r5

1:

        /* LOAD NON-VOLATILE REGISTERS                               */

        /* Advance past the vrsave/vscr area                         */
        addi      r4, r4, PPC_CACHE_ALIGNMENT
        L_VSCR_VRSAVE r4, r0, v0
        CMP_BASES r4, r5, r6, r7, r10
        L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
        CMPOFF    r5
        add       r3, r3, r5
        CACHE_DOWNALGN r3
        lis       r5, _CPU_altivec_vrsave_initval@ha
        lwz       r5, _CPU_altivec_vrsave_initval@l(r5)
        stw       r5, VRSAVE_OFF(r3)
        lis       r6, _CPU_altivec_vscr_initval@ha
        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
        stw       r6, VSCR_OFF(r3)
        blr

        /*
         * Change the initial value of VRSAVE.
         * Can be used by initialization code if
         * it is determined that code was compiled
         * with -mvrsave=no. In this case, VRSAVE
         * must be set to all-ones, which causes this
         * support code to save/restore *all* registers
         * (this only has an effect if IGNORE_VRSAVE is
         * not defined -- otherwise all registers are
         * saved/restored anyway).
         */
        .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
        lis       r5, _CPU_altivec_vrsave_initval@ha
        stw       r3, _CPU_altivec_vrsave_initval@l(r5)
        mtvrsave  r3
        blr

#ifdef ALTIVEC_TESTING
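        /* The msr_VE_on/msr_VE_off helpers below toggle MSR[VE], the
         * AltiVec-enable bit (mask 0x02000000, i.e. MSR bit 6 in
         * big-endian bit numbering). Since lis/oris operate on the
         * upper halfword, the mask is written as 1<<(31-6-16).
         */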
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr


        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr

        /* Load all vector registers from memory area.
         * NOTE: This routine is not strictly ABI compliant --
         *       it guarantees that volatile vector registers
         *       have certain values on exit!
         */
        .global _CPU_altivec_load_all
_CPU_altivec_load_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_altivec_save_all
_CPU_altivec_save_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 has been advanced past the
         * vrsave/vscr cache line to the start of the vector save area,
         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
         * respectively. r10 holds zero.
         */
        S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr


#if 0
        .gnu_attribute 4,1
        .gnu_attribute 8,1
#endif

#endif
#endif