source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ a11e1ff5

Last change on this file since a11e1ff5 was a11e1ff5, checked in by Sebastian Huber <sebastian.huber@…>, on Mar 7, 2017 at 6:58:11 AM

powerpc: Optimize AltiVec context switch

Use r8 instead of r5 to slightly optimize _CPU_Context_switch(). It is
not a big deal, however, we already assume r12 is used by
_CPU_Context_switch(). Treat r5 in the same way.

  • Property mode set to 100644
File size: 24.6 KB
Line 
1#ifdef __ALTIVEC__
2
3/* Altivec support for RTEMS; vector register context management.  */
4
5/*
6 * Authorship
7 * ----------
8 * This software was created by
9 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
10 *         Stanford Linear Accelerator Center, Stanford University.
11 *
12 * Acknowledgement of sponsorship
13 * ------------------------------
14 * This software was produced by
15 *     the Stanford Linear Accelerator Center, Stanford University,
16 *         under Contract DE-AC03-76SFO0515 with the Department of Energy.
17 *
18 * Government disclaimer of liability
19 * ----------------------------------
20 * Neither the United States nor the United States Department of Energy,
21 * nor any of their employees, makes any warranty, express or implied, or
22 * assumes any legal liability or responsibility for the accuracy,
23 * completeness, or usefulness of any data, apparatus, product, or process
24 * disclosed, or represents that its use would not infringe privately owned
25 * rights.
26 *
27 * Stanford disclaimer of liability
28 * --------------------------------
29 * Stanford University makes no representations or warranties, express or
30 * implied, nor assumes any liability for the use of this software.
31 *
32 * Stanford disclaimer of copyright
33 * --------------------------------
34 * Stanford University, owner of the copyright, hereby disclaims its
35 * copyright and all other rights in this software.  Hence, anyone may
36 * freely use it for any purpose without restriction.
37 *
38 * Maintenance of notices
39 * ----------------------
40 * In the interest of clarity regarding the origin and status of this
41 * SLAC software, this and all the preceding Stanford University notices
42 * are to remain affixed to any copy or derivative of this software made
43 * or distributed by the recipient and are to be affixed to any copy of
44 * software made or distributed by the recipient that contains a copy or
45 * derivative of this software.
46 *
47 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
48 */
49
50
51#include <rtems/powerpc/powerpc.h>
52
53#ifndef PPC_CACHE_ALIGNMENT
54#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
55#endif
56
57#define ALTIVEC_TESTING
58
59#if PPC_CACHE_ALIGNMENT != 32
60#error "Altivec support assumes cache-line size is 32 bytes!"
61#else
62#undef  LD_PPC_CACHE_ALIGNMENT
63#define LD_PPC_CACHE_ALIGNMENT 5
64#endif
65
66        .set   v0,   0
67        .set   v8,   8
68        .set   v16, 16
69        .set   v20, 20
70        .set   v24, 24
71        .set   v28, 28
72
73        .set   r0,   0
74        .set   r3,   3
75        .set   r4,   4
76        /* Do not use r5, since this is used by _CPU_Context_switch() */
77        .set   r6,   6
78        .set   r7,   7
79        .set   r8,   8
80        .set   r9,   9
81        .set   r10, 10
82        .set   r11, 11
83        /* Do not use r12, since this is used by _CPU_Context_switch() */
84
85        .set   cr5,  5
86
87        .set   VECSIZE,    16
88
89        .set   VRSAVE_INIT_VAL, 0
90        .set   VSCR_INIT_VAL,   0
91
92        .set   VRSAVE_OFF, 16
93        .set   VSCR_OFF,   16+12
94
95        .set   ds0,  0
96
97        /* Block size for dst -- in units of 16-bytes */
98        .set   BSIZE,   2       /* = 32 bytes */
99        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
100        .set   BSTRIDE, 32      /*      bytes */
101
102        .data
103
104        .global _CPU_altivec_vrsave_initval
105_CPU_altivec_vrsave_initval:
106        .long   0
107
108        .global _CPU_altivec_vscr_initval
109_CPU_altivec_vscr_initval:
110        .long   0
111
112        .text
113
114        .extern _CPU_altivec_psim_cpu
115        .extern _CPU_altivec_ctxt_off
116
117        .macro  CMPOFF _B0
118        lis     \_B0, _CPU_altivec_ctxt_off@ha
119        lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0)
120        .endm
121
122        /* Conditionally load or store a vector _VR to
123     *  EA(_R1|0 + _R2)
124         * If bit _VR (corresponding to _VR) is set in CRC
125         * then the load/store is performed but otherwise
126         * it is skipped.
127         * If compiled with IGNORE_VRSAVE defined then
128         * the load/store is done unconditionally.
129         *
130         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
131         * _VR    : target vector register
132         * _R1    : base register (NOTE: _R1=r0 uses a
133         *          implicit ZERO constant, not the contents
134         *          of r0) for address computation.
135         * _R2    : 'offset' register for address computation.
136         *
137         * MODIFIES:      _VR on output if a load operation is performed.
138         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
139         *                defined.
140         */
141        .macro LDST _OPCODE, _VR, _R1, _R2
142#ifndef IGNORE_VRSAVE
143        bc       4, \_VR, 111f
144#endif
145        \_OPCODE \_VR, \_R1, \_R2
146111:
147        .endm
148
149        /*
150         * Load or store four 'adjacent' vector registers.
151         *
152         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
153         * _VR    : target vector register
154         * _R1    : base register (NOTE: _R1=r0 uses a
155         *          implicit ZERO constant, not the contents
156         *          of r0) for address computation.
157         * _B0    : base register 0
158         * _B1    : base register 1
159         * _B2    : base register 2
160         * _B3    : base register 3
161         * _RO    : offset register
162         *
163         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
164         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
165         *
166         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
167         *                operation is performed.
168         * IMPLICIT USE:  see LDST
169         */
170        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
171        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
172        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
173        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
174        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
175        .endm
176
177        /*
178         * Preload/zero two cache lines and save 4 vector registers
179         * to memory.
180         * Note that the cache operation targets memory *past* the
181         * current storage area which should hopefully hit when
182         * This same code is executed on the next two cache lines...
183         *
184         * This code effectively does
185         *   dcbz (_B0 + 64)
186         *   dcbz (_B0 + 64 + 32)
187         *   stvx _VF+0, (_B0+ 0)
188         *   stvx _VF+1, (_B0+16)
189         *   stvx _VF+2, (_B0+32)
190         *   stvx _VF+3, (_B0+48)
191         *
192         * _LRU:  may be 'l' or empty. The former variant should be
193         *        used when it is conceivable that the memory area is
194         *        unlikely to be used in the near future thus making
195         *        it a candidate for early eviction from the caches.
196         *
197         *        If it is likely that the memory area is reused soon
198         *        (e.g., save/restore across ISR execution) then the
199         *        'stvx' opcode (w/o 'l' suffix) should be used.
200         *
201         * _VR:   first of four target vector registers; _VR+0,
202         *        _VR+1, _VR+2, _VR+3 are saved.
203         *
204         * _BO:   base address of memory area.
205         * _B1:   should contain _B0+16 on entry
206         * _B2:   should contain _B0+32 on entry
207         * _B3:   should contain _B0+48 on entry
208         *
209         * _O1:   contains the offset where the four vectors are
210         *        stored.
211         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
212         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
213         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
214         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
215         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
216     *        used to address the two cache-lines past the
217         *        current memory area.
218         *
219         * MODIFIES: _O2; contains _O1 + 64 after execution of this
220         *        code.
221         *
222         * NOTES: a different set of four vectors can be addressed
223         *        simply by changing the one offset register _O1.
224         *
225         *        Saving more than 4 registers can simply be
226         *        achieved by expanding this macro multiple
227         *        times with _O1 and _O2 swapped (new _O1
228         *        becomes _O2 = old _O1 + 64) thus stepping
229         *        through the memory area.
230         *
231         */
232        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
233        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
234        dcbz  \_B0, \_O2
235        dcbz  \_B2, \_O2
236        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
237        .endm
238
239        /*
240         * Save eight vector registers by expanding S4VEC_P twice.
241         * See notes for S4VEC_P above.
242         *
243         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
244         *
245         * MODIFIES: After execution,
246         *           _O2 contains original _O1 +  64,
247         *           _O1 contains original _O1 + 128
248         *
249         * NOTES:    Expanding this macro multiple times lets you save
250         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
251         */
252        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
253        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
254        /* Note that the roles of _O1 and _O2 are swapped here */
255        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
256        .endm
257
258        /*
259         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
260         *
261         * See notes above (for S4VEC_P).
262         *
263         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
264         * MODIFIES: _O1 contains original _O1 + 256
265         *           _O2 contains original _O1 + 256 - 64
266         */
267        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
268        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
269        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
270        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
271        .endm
272
273        /*
274         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
275         *
276         * See notes above (for S4VEC_P, S_V0TOV19).
277         *
278         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
279         * MODIFIES: _O1 contains original _O1 + 128
280         *           _O2 contains original _O1 + 128 - 64
281         */
282        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
283        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
284        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
285        .endm
286
287        /*
288         * Save all registers to memory area
289         *
290         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
291         * MODIFIES: _O1 contains original _O1 + 512
292         *           _O2 contains original _O1 + 512 - 64
293         */
294        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
295        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
296        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
297        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
298        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
299        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
300        .endm
301
302
303        /*
304         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
305         * We can pass either of them as arguments to another macro which
306         * allows us to decide if the main macro uses dcbt or not when
307         * we expand it...
308         */
309        .macro DO_DCBT _RA, _RB
310        dcbt \_RA, \_RB
311        .endm
312
313        .macro NO_DCBT _RA, _RB
314        .endm
315
316        /*
317         * NOTE REGARDING dcbt VS dst
318         *
319         * Preloading the cache with memory areas that we soon need
320         * can be done either using 'dcbt' or 'dst' instructions
321         * "ahead of time".
322         * When experimenting (on a mpc7457) I found that the 'dst'
323         * stream instruction was very efficient if there is enough
324         * time to read ahead. It works well when we do a context
325         * switch:
326         *
327         *   1) start DST on new context to be loaded
328         *   2) save old context to memory
329         *   3) load new context from memory
330         *
331         * Because of the interleaved step 2) dst works nicely and
332         * 3) finds what it needs in the cache.
333         *
334         * However, in a situation when there is not much time
335         * to start the DST, e.g., because we want to restore
336         * a context out of the blue (e.g., after returning
337         * from and ISR):
338         *
339         *   1) save volatile registers to memory/stack
340         *   2) execute ISR
341         *   3) might do a task context switch
342         *   4) when returned to old task context then
343         *      reload volatile registers from memory/stack.
344         *
345         * In this situation, preloading the target memory before
346         * or after step 1) makes obviously no sense because after
347         * 1) the registers area is most likely in the cache already.
348         *
349         * Starting preload after 2) doesn't make much sense either.
350         * If ISR doesn't lead to a context switch then it is quite
351         * likely that the register area is still in the cache.
352         * OTOTH, if a context switch happens then the preload after 2)
353         * might be useless.
354         *
355         * This leaves us at step 4) where we want to load immediately.
356         * In this case, I found that 'dcbt' works more efficiently
357         * so that's what we use when restoring volatile registers.
358         *
359         * When restoring the non-volatile VRs during a 'normal'
360         * context switch then we shall use DST (and no dcbt).
361         */
362
363        /*
364         * Symmetric to S4VEC_P above but addresses loading four
365         * vector registers from memory.
366         *
367         * Touches two cache lines past the current memory area
368         * and loads four vectors from the current area.
369         *
370         * Optionally, the DCBT operation may be omitted
371         * (when expanding with _DCBT=NO_DCBT).
372         * This is useful if the cache was already preloaded
373         * by another means (dst instruction).
374         *
375         * NOTE: We always use the 'LRU' form of lvx: lvxl,
376         *       because we deem it unlikely that the context
377         *       that was just loaded has to be saved again
378         *       to memory in the immediate future.
379         *
380         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
381         *           as explained above.
382         *
383         * MODIFIES: _O2 contains original _O1 + 64.
384         *           _VR.._VR+3 loaded from memory.
385         */
386        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
387        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
388        /* preload/touch 2 lines at offset 64 from _B0 */
389        \_DCBT   \_B0, \_O2
390        \_DCBT   \_B2, \_O2
391        /* load four vectors at off set 0 from _B0     */
392        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
393        .endm
394
395        /*
396         * Symmetric to S8VEC_P; loads 8 vector registers
397         * from memory -- see comments above...
398         *
399         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
400         *           as explained above.
401         *
402         * MODIFIES: _O1 contains original _O1 + 128.
403         *           _O2 contains original _O1 +  64.
404         *           _VR.._VR+7 loaded from memory.
405         */
406        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
407        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
408        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
409        .endm
410       
411        /*
412         * Load volatile vector registers v0..v19 employing
413         * the DCBT to preload the cache. The rationale for
414         * using DCBT here but not when restoring non-volatile
415         * registers is explained above, see
416         *
417         *    "NOTE REGARDING dcbt VS dst"
418         *
419         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
420         *           as explained above.
421         *
422         * MODIFIES: _O1 contains original _O1 + 256.
423         *           _O2 contains original _O1 + 256 - 64.
424         *           VR0..VR19 loaded from memory.
425         */
426        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
427        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
428        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
429        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
430        .endm
431
432        /*
433         * Load non-volatile vector registers v20..v31.
434         * Note that no DCBT is performed since we use
435         * DST for preloading the cache during a context
436         * switch, see
437         *
438         *    "NOTE REGARDING dcbt VS dst"
439         *
440         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
441         *           as explained above.
442         *
443         * MODIFIES: _O1 contains original _O1 + 128.
444         *           _O2 contains original _O1 + 128 - 64.
445         *           VR20..VR31 loaded from memory.
446         */
447        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
448        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
449        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
450        .endm
451
452        /*
453         * Load all registers from memory area.
454         */
455        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
456        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
457        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
458        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
459        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
460        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
461        .endm
462
463        /*
464         * Compute
465         *     _B1 = _B0 + 16
466         *     _B2 = _B0 + 32
467         *     _B3 = _B0 + 48
468         * and load
469         *     _RO = 0
470         *
471         * convenience macro to be expanded before
472         * any of the load/store macros that use
473         * four base addresses etc.
474         *
475         * INPUT: _B0 = cache-aligned start of memory area
476         *
477         * MODIFIES: _B1, _B2, _B3, _RO as described above.
478         */
479        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
480        addi       \_B1, \_B0, 1*VECSIZE
481        addi       \_B2, \_B0, 2*VECSIZE
482        addi       \_B3, \_B0, 3*VECSIZE
483        li         \_RO, 0
484        .endm
485
486        /*
487         * Prepare for saving general vector registers.
488         *
489         * If not built with #define IGNORE_VRSAVE then
490         *
491         *  1) copy vrsave to CRC
492         *
493         * endif
494         *
495         *  2) copy vrsave to _VRSAVE_REG
496         *  3) preload/zero cache line where vrsave and vscr are stored.
497         *  4) compute base adresses from _B0
498         *  5) preload/zero first two cache lines (remember that the
499         *     first S8VEC_P starts preloading/zeroing at offset 64).
500         *
501         * INPUT:    'vrsave' register, _B0 (base address of memory area)
502         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
503         *           _B0 = original _BO + 32
504         *           _B1 = original _B0 + 32 + 16,
505         *           _B2 = original _B0 + 32 + 32,
506         *           _B3 = original _B0 + 32 + 48,
507         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
508         */
509        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
510        mfvrsave   \_VRSAVE_REG
511#ifndef IGNORE_VRSAVE
512        mtcr       \_VRSAVE_REG
513#endif
514        dcbz       0, \_B0
515        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
516        dcbz       0, \_B0
517        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
518        dcbz       0, \_B2
519        .endm
520
521        /*
522         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
523         * must have been loaded from 'vrsave' and 'vscr', respectively,
524         * prior to expanding this macro.
525         *
526         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
527         *           _VSCR_VREG  VR  holding 'vscr'   contents
528         *           _B0 cache-aligned (base) address of memory area.
529         * MODIFIES: _SCRATCH_REG
530         */
531        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
532        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
533        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
534        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
535        .endm
536
537        /*
538         * Load 'vrsave' and 'vscr' from memory.
539         *
540         * INPUTS:   _B0 cache-aligned (base) address of memory area.
541         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
542         *           'vscr', 'vrsave'.
543         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
544         *           with IGNORE_VRSAVE undefined).
545         */
546        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
547        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
548        mtvrsave  \_SCRATCH_REG
549#ifndef IGNORE_VRSAVE
550        mtcr      \_SCRATCH_REG
551#endif
552        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
553        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
554        mtvscr    \_SCRATCH_VREG
555        .endm
556
557        /*
558         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
559         *
560         * INPUT:    _B0
561         * MODIFIES: _B0 (as stated above)
562         */
563        .macro CACHE_DOWNALGN _B0
564        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
565        .endm
566
567        .text
568
569        .global _CPU_save_altivec_volatile
570_CPU_save_altivec_volatile:
571        /* Align address up to next cache-line boundary */
572        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
573        CACHE_DOWNALGN r3
574
575#ifndef IGNORE_VRSAVE
576        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
577         * when testing if we really should do the load/store operation.
578         */
579        mfcr      r9
580#endif
581
582        PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
583        /* r0 now contains VRSAVE, r3 still the aligned memory area
584         * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
585         * respectively. r10 holds zero
586         */
587        S_V0TOV19     _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
588        mfvscr        v0
589        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
590        S_VSCR_VRSAVE r0, v0, r3, r11
591
592#ifndef IGNORE_VRSAVE
593        /* Restore CRC */
594        mtcr      r9
595#endif
596        blr
597
598        .global _CPU_load_altivec_volatile
599_CPU_load_altivec_volatile:
600        /* Align address up to next cache-line boundary */
601        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
602        CACHE_DOWNALGN r3
603#ifndef IGNORE_VRSAVE
604        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
605         * when testing if we really should do the load/store operation.
606         */
607        mfcr      r9
608#endif
609
610        /* Try to preload 1st line (where vscr and vrsave are stored) */
611        dcbt      0, r3
612        /* Point to start of general vector-register area             */
613        addi      r3, r3, PPC_CACHE_ALIGNMENT
614        /* Start preloading 2nd line (where first two vectors are)    */
615        dcbt      0, r3
616        L_VSCR_VRSAVE r3, r0, v0
617        CMP_BASES     r3, r4, r8, r6, r10
618        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
619        dcbt      0, r8
620        L_V0TOV19 r3, r4, r8, r6, r10, r11
621
622#ifndef IGNORE_VRSAVE
623        mtcr      r9
624#endif
625        blr
626
627        .global _CPU_Context_switch_altivec
628_CPU_Context_switch_altivec:
629
630        /* fetch offset of altivec area in context                   */
631        CMPOFF    r8
632        /* down-align 'to' area to cache-line boundary               */
633        add       r4, r4, r8
634        CACHE_DOWNALGN r4
635
636        /* Check for PSIM                                            */
637        lis       r6, _CPU_altivec_psim_cpu@ha
638        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
639        cmpli     0, r6, 0
640        bne       1f
641        /* Skip data-stream instructions on PSIM (not implemented)   */
642        dssall
643        /* Pre-load new context into cache                           */
644        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
645        ori       r6, r6, BSTRIDE
646        dstt      r4, r6, ds0
6471:
648
649#ifndef IGNORE_VRSAVE
650        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
651         * when testing if we really should do the load/store operation.
652         */
653        mfcr      r9
654#endif
655
656        /* Is 'from' context == NULL ? (then we just do a 'restore') */
657        cmpli     0, r3, 0
658        beq       1f           /* yes: skip saving 'from' context    */
659
660        /* SAVE NON-VOLATILE REGISTERS                               */
661
662        /* Compute aligned destination pointer (r8 still holds offset
663         * to 'altivec' area in context)
664         */
665        add       r3, r3, r8
666        CACHE_DOWNALGN r3
667
668        PREP_FOR_SAVE r0, r3, r8, r6, r7, r10
669        /* The manual says reading vscr can take some time - do
670         * read it here (into a volatile vector register) while
671         * we wait for cache blocks to be allocated
672         */
673        mfvscr    v0
674        S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11
675        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
676        S_VSCR_VRSAVE r0, v0, r3, r8
677
6781:
679
680        /* LOAD NON-VOLATILE REGISTERS                               */
681
682        /* Advance past vrsave/vscr area                             */
683        addi      r4, r4, PPC_CACHE_ALIGNMENT
684        L_VSCR_VRSAVE r4, r0, v0
685        CMP_BASES r4, r8, r6, r7, r10
686        L_V20TOV31 r4, r8, r6, r7, r10, r11
687
688#ifndef IGNORE_VRSAVE
689        mtcr      r9
690#endif
691        blr
692
693        .global _CPU_Context_initialize_altivec
694_CPU_Context_initialize_altivec:
695        CMPOFF    r8
696        add       r3, r3, r8
697        CACHE_DOWNALGN r3
698        lis       r8, _CPU_altivec_vrsave_initval@ha
699        lwz       r8, _CPU_altivec_vrsave_initval@l(r8)
700        stw       r8, VRSAVE_OFF(r3)
701        lis       r6, _CPU_altivec_vscr_initval@ha
702        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
703        stw       r6, VSCR_OFF(r3)
704        blr
705
706        /*
707         * Change the initial value of VRSAVE.
708         * Can be used by initialization code if
709         * it is determined that code was compiled
710         * with -mvrsave=no. In this case, VRSAVE
711         * must be set to all-ones which causes this
712         * support code to save/restore *all* registers
713         * (only has an effect if IGNORE_VRSAVE is
714         * not defined -- otherwise all registers are
715         * saved/restored anyways).
716         */
717        .global _CPU_altivec_set_vrsave_initval
718_CPU_altivec_set_vrsave_initval:
719        lis       r8, _CPU_altivec_vrsave_initval@ha
720        stw       r3, _CPU_altivec_vrsave_initval@l(r8)
721        mtvrsave  r3
722        blr
723
724#ifdef ALTIVEC_TESTING
725        .global msr_VE_on
726msr_VE_on:
727        mfmsr r3
728        oris  r3, r3, 1<<(31-6-16)
729        mtmsr r3
730        blr
731
732        .global msr_VE_off
733msr_VE_off:
734        mfmsr r3
735        lis   r4,  1<<(31-6-16)
736        andc  r3, r3, r4
737        mtmsr r3
738        blr
739
740
741        .global mfvrsave
742mfvrsave:
743        mfvrsave r3
744        blr
745
746        .global mtvrsave
747mtvrsave:
748        mtvrsave r3
749        blr
750
751        /* Load all vector registers from memory area.
752         * NOTE: This routine is not strictly ABI compliant --
753         *       it guarantees that volatile vector registers
754         *       have certain values on exit!
755         */
756        .global _CPU_altivec_load_all
757_CPU_altivec_load_all:
758        /* Align address up to next cache-line boundary */
759        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
760        CACHE_DOWNALGN r3
761#ifndef IGNORE_VRSAVE
762        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
763         * when testing if we really should do the load/store operation.
764         */
765        mfcr      r9
766#endif
767
768        /* Try to preload 1st line (where vscr and vrsave are stored) */
769        dcbt      0, r3
770        /* Point to start of general vector-register area             */
771        addi      r3, r3, PPC_CACHE_ALIGNMENT
772        /* Start preloading 2nd line (where first two vectors are)    */
773        dcbt      0, r3
774        L_VSCR_VRSAVE r3, r0, v0
775        CMP_BASES     r3, r4, r8, r6, r10
776        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
777        dcbt      0, r8
778        L_V0TOV31 r3, r4, r8, r6, r10, r11
779
780#ifndef IGNORE_VRSAVE
781        mtcr      r9
782#endif
783        blr
784
785        .global _CPU_altivec_save_all
786_CPU_altivec_save_all:
787        /* Align address up to next cache-line boundary */
788        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
789        CACHE_DOWNALGN r3
790
791#ifndef IGNORE_VRSAVE
792        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
793         * when testing if we really should do the load/store operation.
794         */
795        mfcr      r9
796#endif
797
798        PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
799        /* r0 now contains VRSAVE, r3 still the aligned memory area
800         * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
801         * respectively. r10 holds zero
802         */
803        S_V0TOV31     _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
804        mfvscr        v0
805        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
806        S_VSCR_VRSAVE r0, v0, r3, r11
807
808#ifndef IGNORE_VRSAVE
809        /* Restore CRC */
810        mtcr      r9
811#endif
812        blr
813
814
815#if 0
816        .gnu_attribute 4,1
817        .gnu_attribute 8,1
818#endif
819
820#endif
821#endif
Note: See TracBrowser for help on using the repository browser.