source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ c6f76392

Last change on this file since c6f76392 was c6f76392, checked in by Sebastian Huber <sebastian.huber@…>, on Mar 7, 2017 at 6:50:12 AM

powerpc: Fix AltiVec context switch

Update #2751.

  • Property mode set to 100644
File size: 24.5 KB
Line 
1#ifdef __ALTIVEC__
2
3/* Altivec support for RTEMS; vector register context management.  */
4
5/*
6 * Authorship
7 * ----------
8 * This software was created by
9 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
10 *         Stanford Linear Accelerator Center, Stanford University.
11 *
12 * Acknowledgement of sponsorship
13 * ------------------------------
14 * This software was produced by
15 *     the Stanford Linear Accelerator Center, Stanford University,
16 *         under Contract DE-AC03-76SFO0515 with the Department of Energy.
17 *
18 * Government disclaimer of liability
19 * ----------------------------------
20 * Neither the United States nor the United States Department of Energy,
21 * nor any of their employees, makes any warranty, express or implied, or
22 * assumes any legal liability or responsibility for the accuracy,
23 * completeness, or usefulness of any data, apparatus, product, or process
24 * disclosed, or represents that its use would not infringe privately owned
25 * rights.
26 *
27 * Stanford disclaimer of liability
28 * --------------------------------
29 * Stanford University makes no representations or warranties, express or
30 * implied, nor assumes any liability for the use of this software.
31 *
32 * Stanford disclaimer of copyright
33 * --------------------------------
34 * Stanford University, owner of the copyright, hereby disclaims its
35 * copyright and all other rights in this software.  Hence, anyone may
36 * freely use it for any purpose without restriction.
37 *
38 * Maintenance of notices
39 * ----------------------
40 * In the interest of clarity regarding the origin and status of this
41 * SLAC software, this and all the preceding Stanford University notices
42 * are to remain affixed to any copy or derivative of this software made
43 * or distributed by the recipient and are to be affixed to any copy of
44 * software made or distributed by the recipient that contains a copy or
45 * derivative of this software.
46 *
47 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
48 */
49
50
#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

/* Build the self-test helpers (msr_VE_on etc.) at the bottom of this file */
#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
/* log2(PPC_CACHE_ALIGNMENT); used by CACHE_DOWNALGN (rlwinm mask) below */
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif
65
66        .set   v0,   0
67        .set   v8,   8
68        .set   v16, 16
69        .set   v20, 20
70        .set   v24, 24
71        .set   v28, 28
72
73        .set   r0,   0
74        .set   r3,   3
75        .set   r4,   4
76        .set   r5,   5
77        .set   r6,   6
78        .set   r7,   7
79        .set   r9,   9
80        .set   r10, 10
81        .set   r11, 11
82        /* Do not use r12, since this is used by _CPU_Context_switch() */
83
84        .set   cr5,  5
85
86        .set   VECSIZE,    16
87
88        .set   VRSAVE_INIT_VAL, 0
89        .set   VSCR_INIT_VAL,   0
90
91        .set   VRSAVE_OFF, 16
92        .set   VSCR_OFF,   16+12
93
94        .set   ds0,  0
95
96        /* Block size for dst -- in units of 16-bytes */
97        .set   BSIZE,   2       /* = 32 bytes */
98        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
99        .set   BSTRIDE, 32      /*      bytes */
100
101        .data
102
103        .global _CPU_altivec_vrsave_initval
104_CPU_altivec_vrsave_initval:
105        .long   0
106
107        .global _CPU_altivec_vscr_initval
108_CPU_altivec_vscr_initval:
109        .long   0
110
111        .text
112
113        .extern _CPU_altivec_psim_cpu
114        .extern _CPU_altivec_ctxt_off
115
116        .macro  CMPOFF _B0
117        lis     \_B0, _CPU_altivec_ctxt_off@ha
118        lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0)
119        .endm
120
121        /* Conditionally load or store a vector _VR to
122     *  EA(_R1|0 + _R2)
123         * If bit _VR (corresponding to _VR) is set in CRC
124         * then the load/store is performed but otherwise
125         * it is skipped.
126         * If compiled with IGNORE_VRSAVE defined then
127         * the load/store is done unconditionally.
128         *
129         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
130         * _VR    : target vector register
131         * _R1    : base register (NOTE: _R1=r0 uses a
132         *          implicit ZERO constant, not the contents
133         *          of r0) for address computation.
134         * _R2    : 'offset' register for address computation.
135         *
136         * MODIFIES:      _VR on output if a load operation is performed.
137         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
138         *                defined.
139         */
140        .macro LDST _OPCODE, _VR, _R1, _R2
141#ifndef IGNORE_VRSAVE
142        bc       4, \_VR, 111f
143#endif
144        \_OPCODE \_VR, \_R1, \_R2
145111:
146        .endm
147
148        /*
149         * Load or store four 'adjacent' vector registers.
150         *
151         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
152         * _VR    : target vector register
153         * _R1    : base register (NOTE: _R1=r0 uses a
154         *          implicit ZERO constant, not the contents
155         *          of r0) for address computation.
156         * _B0    : base register 0
157         * _B1    : base register 1
158         * _B2    : base register 2
159         * _B3    : base register 3
160         * _RO    : offset register
161         *
162         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
163         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
164         *
165         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
166         *                operation is performed.
167         * IMPLICIT USE:  see LDST
168         */
169        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
170        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
171        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
172        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
173        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
174        .endm
175
176        /*
177         * Preload/zero two cache lines and save 4 vector registers
178         * to memory.
179         * Note that the cache operation targets memory *past* the
180         * current storage area which should hopefully hit when
181         * This same code is executed on the next two cache lines...
182         *
183         * This code effectively does
184         *   dcbz (_B0 + 64)
185         *   dcbz (_B0 + 64 + 32)
186         *   stvx _VF+0, (_B0+ 0)
187         *   stvx _VF+1, (_B0+16)
188         *   stvx _VF+2, (_B0+32)
189         *   stvx _VF+3, (_B0+48)
190         *
191         * _LRU:  may be 'l' or empty. The former variant should be
192         *        used when it is conceivable that the memory area is
193         *        unlikely to be used in the near future thus making
194         *        it a candidate for early eviction from the caches.
195         *
196         *        If it is likely that the memory area is reused soon
197         *        (e.g., save/restore across ISR execution) then the
198         *        'stvx' opcode (w/o 'l' suffix) should be used.
199         *
200         * _VR:   first of four target vector registers; _VR+0,
201         *        _VR+1, _VR+2, _VR+3 are saved.
202         *
203         * _BO:   base address of memory area.
204         * _B1:   should contain _B0+16 on entry
205         * _B2:   should contain _B0+32 on entry
206         * _B3:   should contain _B0+48 on entry
207         *
208         * _O1:   contains the offset where the four vectors are
209         *        stored.
210         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
211         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
212         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
213         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
214         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
215     *        used to address the two cache-lines past the
216         *        current memory area.
217         *
218         * MODIFIES: _O2; contains _O1 + 64 after execution of this
219         *        code.
220         *
221         * NOTES: a different set of four vectors can be addressed
222         *        simply by changing the one offset register _O1.
223         *
224         *        Saving more than 4 registers can simply be
225         *        achieved by expanding this macro multiple
226         *        times with _O1 and _O2 swapped (new _O1
227         *        becomes _O2 = old _O1 + 64) thus stepping
228         *        through the memory area.
229         *
230         */
231        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
232        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
233        dcbz  \_B0, \_O2
234        dcbz  \_B2, \_O2
235        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
236        .endm
237
238        /*
239         * Save eight vector registers by expanding S4VEC_P twice.
240         * See notes for S4VEC_P above.
241         *
242         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
243         *
244         * MODIFIES: After execution,
245         *           _O2 contains original _O1 +  64,
246         *           _O1 contains original _O1 + 128
247         *
248         * NOTES:    Expanding this macro multiple times lets you save
249         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
250         */
251        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
252        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
253        /* Note that the roles of _O1 and _O2 are swapped here */
254        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
255        .endm
256
257        /*
258         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
259         *
260         * See notes above (for S4VEC_P).
261         *
262         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
263         * MODIFIES: _O1 contains original _O1 + 256
264         *           _O2 contains original _O1 + 256 - 64
265         */
266        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
267        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
268        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
269        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
270        .endm
271
272        /*
273         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
274         *
275         * See notes above (for S4VEC_P, S_V0TOV19).
276         *
277         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
278         * MODIFIES: _O1 contains original _O1 + 128
279         *           _O2 contains original _O1 + 128 - 64
280         */
281        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
282        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
283        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
284        .endm
285
286        /*
287         * Save all registers to memory area
288         *
289         * INPUTS:   _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
290         * MODIFIES: _O1 contains original _O1 + 512
291         *           _O2 contains original _O1 + 512 - 64
292         */
293        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
294        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
295        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
296        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
297        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
298        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
299        .endm
300
301
302        /*
303         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
304         * We can pass either of them as arguments to another macro which
305         * allows us to decide if the main macro uses dcbt or not when
306         * we expand it...
307         */
308        .macro DO_DCBT _RA, _RB
309        dcbt \_RA, \_RB
310        .endm
311
312        .macro NO_DCBT _RA, _RB
313        .endm
314
        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on a mpc7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) makes obviously no sense because after
         * 1) the registers area is most likely in the cache already.
         *
         * Starting preload after 2) doesn't make much sense either.
         * If ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch then we shall use DST (and no dcbt).
         */
361
362        /*
363         * Symmetric to S4VEC_P above but addresses loading four
364         * vector registers from memory.
365         *
366         * Touches two cache lines past the current memory area
367         * and loads four vectors from the current area.
368         *
369         * Optionally, the DCBT operation may be omitted
370         * (when expanding with _DCBT=NO_DCBT).
371         * This is useful if the cache was already preloaded
372         * by another means (dst instruction).
373         *
374         * NOTE: We always use the 'LRU' form of lvx: lvxl,
375         *       because we deem it unlikely that the context
376         *       that was just loaded has to be saved again
377         *       to memory in the immediate future.
378         *
379         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
380         *           as explained above.
381         *
382         * MODIFIES: _O2 contains original _O1 + 64.
383         *           _VR.._VR+3 loaded from memory.
384         */
385        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
386        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
387        /* preload/touch 2 lines at offset 64 from _B0 */
388        \_DCBT   \_B0, \_O2
389        \_DCBT   \_B2, \_O2
390        /* load four vectors at off set 0 from _B0     */
391        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
392        .endm
393
394        /*
395         * Symmetric to S8VEC_P; loads 8 vector registers
396         * from memory -- see comments above...
397         *
398         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
399         *           as explained above.
400         *
401         * MODIFIES: _O1 contains original _O1 + 128.
402         *           _O2 contains original _O1 +  64.
403         *           _VR.._VR+7 loaded from memory.
404         */
405        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
406        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
407        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
408        .endm
409       
410        /*
411         * Load volatile vector registers v0..v19 employing
412         * the DCBT to preload the cache. The rationale for
413         * using DCBT here but not when restoring non-volatile
414         * registers is explained above, see
415         *
416         *    "NOTE REGARDING dcbt VS dst"
417         *
418         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
419         *           as explained above.
420         *
421         * MODIFIES: _O1 contains original _O1 + 256.
422         *           _O2 contains original _O1 + 256 - 64.
423         *           VR0..VR19 loaded from memory.
424         */
425        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
426        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
427        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
428        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
429        .endm
430
431        /*
432         * Load non-volatile vector registers v20..v31.
433         * Note that no DCBT is performed since we use
434         * DST for preloading the cache during a context
435         * switch, see
436         *
437         *    "NOTE REGARDING dcbt VS dst"
438         *
439         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
440         *           as explained above.
441         *
442         * MODIFIES: _O1 contains original _O1 + 128.
443         *           _O2 contains original _O1 + 128 - 64.
444         *           VR20..VR31 loaded from memory.
445         */
446        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
447        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
448        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
449        .endm
450
451        /*
452         * Load all registers from memory area.
453         */
454        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
455        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
456        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
457        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
458        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
459        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
460        .endm
461
462        /*
463         * Compute
464         *     _B1 = _B0 + 16
465         *     _B2 = _B0 + 32
466         *     _B3 = _B0 + 48
467         * and load
468         *     _RO = 0
469         *
470         * convenience macro to be expanded before
471         * any of the load/store macros that use
472         * four base addresses etc.
473         *
474         * INPUT: _B0 = cache-aligned start of memory area
475         *
476         * MODIFIES: _B1, _B2, _B3, _RO as described above.
477         */
478        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
479        addi       \_B1, \_B0, 1*VECSIZE
480        addi       \_B2, \_B0, 2*VECSIZE
481        addi       \_B3, \_B0, 3*VECSIZE
482        li         \_RO, 0
483        .endm
484
485        /*
486         * Prepare for saving general vector registers.
487         *
488         * If not built with #define IGNORE_VRSAVE then
489         *
490         *  1) copy vrsave to CRC
491         *
492         * endif
493         *
494         *  2) copy vrsave to _VRSAVE_REG
495         *  3) preload/zero cache line where vrsave and vscr are stored.
496         *  4) compute base adresses from _B0
497         *  5) preload/zero first two cache lines (remember that the
498         *     first S8VEC_P starts preloading/zeroing at offset 64).
499         *
500         * INPUT:    'vrsave' register, _B0 (base address of memory area)
501         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
502         *           _B0 = original _BO + 32
503         *           _B1 = original _B0 + 32 + 16,
504         *           _B2 = original _B0 + 32 + 32,
505         *           _B3 = original _B0 + 32 + 48,
506         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
507         */
508        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
509        mfvrsave   \_VRSAVE_REG
510#ifndef IGNORE_VRSAVE
511        mtcr       \_VRSAVE_REG
512#endif
513        dcbz       0, \_B0
514        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
515        dcbz       0, \_B0
516        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
517        dcbz       0, \_B2
518        .endm
519
520        /*
521         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
522         * must have been loaded from 'vrsave' and 'vscr', respectively,
523         * prior to expanding this macro.
524         *
525         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
526         *           _VSCR_VREG  VR  holding 'vscr'   contents
527         *           _B0 cache-aligned (base) address of memory area.
528         * MODIFIES: _SCRATCH_REG
529         */
530        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
531        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
532        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
533        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
534        .endm
535
536        /*
537         * Load 'vrsave' and 'vscr' from memory.
538         *
539         * INPUTS:   _B0 cache-aligned (base) address of memory area.
540         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
541         *           'vscr', 'vrsave'.
542         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
543         *           with IGNORE_VRSAVE undefined).
544         */
545        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
546        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
547        mtvrsave  \_SCRATCH_REG
548#ifndef IGNORE_VRSAVE
549        mtcr      \_SCRATCH_REG
550#endif
551        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
552        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
553        mtvscr    \_SCRATCH_VREG
554        .endm
555
556        /*
557         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
558         *
559         * INPUT:    _B0
560         * MODIFIES: _B0 (as stated above)
561         */
562        .macro CACHE_DOWNALGN _B0
563        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
564        .endm
565
566        .text
567
568        .global _CPU_save_altivec_volatile
569_CPU_save_altivec_volatile:
570        /* Align address up to next cache-line boundary */
571        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
572        CACHE_DOWNALGN r3
573
574#ifndef IGNORE_VRSAVE
575        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
576         * when testing if we really should do the load/store operation.
577         */
578        mfcr      r9
579#endif
580
581        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
582        /* r0 now contains VRSAVE, r3 still the aligned memory area
583         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
584         * respectively. r10 holds zero
585         */
586        S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
587        mfvscr        v0
588        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
589        S_VSCR_VRSAVE r0, v0, r3, r11
590
591#ifndef IGNORE_VRSAVE
592        /* Restore CRC */
593        mtcr      r9
594#endif
595        blr
596
597        .global _CPU_load_altivec_volatile
598_CPU_load_altivec_volatile:
599        /* Align address up to next cache-line boundary */
600        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
601        CACHE_DOWNALGN r3
602#ifndef IGNORE_VRSAVE
603        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
604         * when testing if we really should do the load/store operation.
605         */
606        mfcr      r9
607#endif
608
609        /* Try to preload 1st line (where vscr and vrsave are stored) */
610        dcbt      0, r3
611        /* Point to start of general vector-register area             */
612        addi      r3, r3, PPC_CACHE_ALIGNMENT
613        /* Start preloading 2nd line (where first two vectors are)    */
614        dcbt      0, r3
615        L_VSCR_VRSAVE r3, r0, v0
616        CMP_BASES     r3, r4, r5, r6, r10
617        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
618        dcbt      0, r5
619        L_V0TOV19 r3, r4, r5, r6, r10, r11
620
621#ifndef IGNORE_VRSAVE
622        mtcr      r9
623#endif
624        blr
625
626        .global _CPU_Context_switch_altivec
627_CPU_Context_switch_altivec:
628
629        /* fetch offset of altivec area in context                   */
630        CMPOFF    r5
631        /* down-align 'to' area to cache-line boundary               */
632        add       r4, r4, r5
633        CACHE_DOWNALGN r4
634
635        /* Check for PSIM                                            */
636        lis       r6, _CPU_altivec_psim_cpu@ha
637        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
638        cmpli     0, r6, 0
639        bne       1f
640        /* Skip data-stream instructions on PSIM (not implemented)   */
641        dssall
642        /* Pre-load new context into cache                           */
643        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
644        ori       r6, r6, BSTRIDE
645        dstt      r4, r6, ds0
6461:
647
648#ifndef IGNORE_VRSAVE
649        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
650         * when testing if we really should do the load/store operation.
651         */
652        mfcr      r9
653#endif
654
655        /* Is 'from' context == NULL ? (then we just do a 'restore') */
656        cmpli     0, r3, 0
657        beq       1f           /* yes: skip saving 'from' context    */
658
659        /* SAVE NON-VOLATILE REGISTERS                               */
660
661        /* Compute aligned destination pointer (r5 still holds offset
662         * to 'altivec' area in context)
663         */
664        add       r3, r3, r5
665        CACHE_DOWNALGN r3
666
667        PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
668        /* The manual says reading vscr can take some time - do
669         * read it here (into a volatile vector register) while
670         * we wait for cache blocks to be allocated
671         */
672        mfvscr    v0
673        S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
674        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
675        S_VSCR_VRSAVE r0, v0, r3, r5
676
6771:
678
679        /* LOAD NON-VOLATILE REGISTERS                               */
680
681        /* Advance past vrsave/vscr area                             */
682        addi      r4, r4, PPC_CACHE_ALIGNMENT
683        L_VSCR_VRSAVE r4, r0, v0
684        CMP_BASES r4, r5, r6, r7, r10
685        L_V20TOV31 r4, r5, r6, r7, r10, r11
686
687#ifndef IGNORE_VRSAVE
688        mtcr      r9
689#endif
690        blr
691
692        .global _CPU_Context_initialize_altivec
693_CPU_Context_initialize_altivec:
694        CMPOFF    r5
695        add       r3, r3, r5
696        CACHE_DOWNALGN r3
697        lis       r5, _CPU_altivec_vrsave_initval@ha
698        lwz       r5, _CPU_altivec_vrsave_initval@l(r5)
699        stw       r5, VRSAVE_OFF(r3)
700        lis       r6, _CPU_altivec_vscr_initval@ha
701        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
702        stw       r6, VSCR_OFF(r3)
703        blr
704
705        /*
706         * Change the initial value of VRSAVE.
707         * Can be used by initialization code if
708         * it is determined that code was compiled
709         * with -mvrsave=no. In this case, VRSAVE
710         * must be set to all-ones which causes this
711         * support code to save/restore *all* registers
712         * (only has an effect if IGNORE_VRSAVE is
713         * not defined -- otherwise all registers are
714         * saved/restored anyways).
715         */
716        .global _CPU_altivec_set_vrsave_initval
717_CPU_altivec_set_vrsave_initval:
718        lis       r5, _CPU_altivec_vrsave_initval@ha
719        stw       r3, _CPU_altivec_vrsave_initval@l(r5)
720        mtvrsave  r3
721        blr
722
#ifdef ALTIVEC_TESTING
        /* Testing helpers: toggle MSR[VE] (vector-unit enable, MSR bit 6
         * in big-endian numbering => mask 1 << (31-6) = 0x02000000) and
         * read/write 'vrsave' from C code.
         */
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr


        /* unsigned mfvrsave(void): return current 'vrsave' */
        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        /* void mtvrsave(unsigned val /* r3 *&#47;): set 'vrsave' */
        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr
749
750        /* Load all vector registers from memory area.
751         * NOTE: This routine is not strictly ABI compliant --
752         *       it guarantees that volatile vector registers
753         *       have certain values on exit!
754         */
755        .global _CPU_altivec_load_all
756_CPU_altivec_load_all:
757        /* Align address up to next cache-line boundary */
758        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
759        CACHE_DOWNALGN r3
760#ifndef IGNORE_VRSAVE
761        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
762         * when testing if we really should do the load/store operation.
763         */
764        mfcr      r9
765#endif
766
767        /* Try to preload 1st line (where vscr and vrsave are stored) */
768        dcbt      0, r3
769        /* Point to start of general vector-register area             */
770        addi      r3, r3, PPC_CACHE_ALIGNMENT
771        /* Start preloading 2nd line (where first two vectors are)    */
772        dcbt      0, r3
773        L_VSCR_VRSAVE r3, r0, v0
774        CMP_BASES     r3, r4, r5, r6, r10
775        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
776        dcbt      0, r5
777        L_V0TOV31 r3, r4, r5, r6, r10, r11
778
779#ifndef IGNORE_VRSAVE
780        mtcr      r9
781#endif
782        blr
783
784        .global _CPU_altivec_save_all
785_CPU_altivec_save_all:
786        /* Align address up to next cache-line boundary */
787        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
788        CACHE_DOWNALGN r3
789
790#ifndef IGNORE_VRSAVE
791        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
792         * when testing if we really should do the load/store operation.
793         */
794        mfcr      r9
795#endif
796
797        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
798        /* r0 now contains VRSAVE, r3 still the aligned memory area
799         * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
800         * respectively. r10 holds zero
801         */
802        S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
803        mfvscr        v0
804        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
805        S_VSCR_VRSAVE r0, v0, r3, r11
806
807#ifndef IGNORE_VRSAVE
808        /* Restore CRC */
809        mtcr      r9
810#endif
811        blr
812
813
814#if 0
815        .gnu_attribute 4,1
816        .gnu_attribute 8,1
817#endif
818
819#endif
820#endif
Note: See TracBrowser for help on using the repository browser.