source: rtems/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @ 8f511f05

Last change on this file since 8f511f05 was 8f511f05, checked in by Till Straumann <strauman@…>, on 12/02/09 at 01:57:45
  • added SLAC copyright disclaimer.
  • Property mode set to 100644
File size: 24.8 KB
/* $Id$ */
#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management.  */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *         Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *         under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

        .set   v0,   0
        .set   v8,   8
        .set   v16, 16
        .set   v20, 20
        .set   v24, 24
        .set   v28, 28

        .set   r0,   0
        .set   r3,   3
        .set   r4,   4
        .set   r5,   5
        .set   r6,   6
        .set   r7,   7

        .set   r10, 10
        .set   r11, 11
        .set   r12, 12

        .set   cr5,  5

        .set   VECSIZE,    16

        .set   VRSAVE_INIT_VAL, 0
        .set   VSCR_INIT_VAL,   0

        .set   VRSAVE_OFF, 16
        .set   VSCR_OFF,   16+12

        .set   ds0,  0

        /* Block size for dst -- in units of 16-bytes */
        .set   BSIZE,   2       /* = 32 bytes */
        .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
        .set   BSTRIDE, 32      /*      bytes */
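
        /*
         * Editor's note (illustrative, derived from the 'dstt' setup in
         * _CPU_Context_switch_altivec below): the three values above are
         * combined into the data-stream control word
         *
         *     (BSIZE << 24) | (BCNT << 16) | BSTRIDE
         *   = (2 << 24) | (7 << 16) | 32
         *
         * i.e., prefetch 7 blocks of 2 vectors (32 bytes) each, 32 bytes
         * apart: one 32-byte line for vrsave/vscr plus 12 * 16 bytes for
         * the non-volatile vector registers (224 bytes total).
         */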

        .data

        .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
        .long   0

        .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
        .long   0

        .text

        .extern _CPU_altivec_psim_cpu
        .extern _CPU_altivec_ctxt_off

        .macro  CMPOFF _B0
        lis     \_B0, _CPU_altivec_ctxt_off@ha
        lwz     \_B0, _CPU_altivec_ctxt_off@l(\_B0)
        .endm

        /* Conditionally load or store a vector _VR to
         * EA(_R1|0 + _R2).
         * If bit _VR (the bit corresponding to register _VR) is set in CRC
         * then the load/store is performed; otherwise it is skipped.
         * If compiled with IGNORE_VRSAVE defined then
         * the load/store is done unconditionally.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _R2    : 'offset' register for address computation.
         *
         * MODIFIES:      _VR on output if a load operation is performed.
         * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
         *                defined).
         */
        .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
        bc       4, \_VR, 111f
#endif
        \_OPCODE \_VR, \_R1, \_R2
111:
        .endm
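
        /*
         * Illustrative expansion (editor's sketch, not assembled): with
         * IGNORE_VRSAVE undefined,
         *
         *     LDST _OPCODE=stvx _VR=v2 _R1=r3 _R2=r10
         *
         * expands to
         *
         *     bc       4, 2, 111f    # skip if CR bit 2 (vrsave bit of v2) is clear
         *     stvx     2, 3, 10      # store v2 to EA = (r3|0) + r10
         * 111:
         */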

        /*
         * Load or store four 'adjacent' vector registers.
         *
         * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
         * _VR    : target vector register
         * _R1    : base register (NOTE: _R1=r0 uses an
         *          implicit ZERO constant, not the contents
         *          of r0) for address computation.
         * _B0    : base register 0
         * _B1    : base register 1
         * _B2    : base register 2
         * _B3    : base register 3
         * _RO    : offset register
         *
         * memory addresses for _VR, _VR+1, _VR+2, _VR+3
         * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
         *
         * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
         *                operation is performed.
         * IMPLICIT USE:  see LDST
         */
        .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
        LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
        .endm

        /*
         * Preload/zero two cache lines and save 4 vector registers
         * to memory.
         * Note that the cache operation targets memory *past* the
         * current storage area, which should hopefully hit when
         * this same code is executed on the next two cache lines...
         *
         * This code effectively does
         *   dcbz (_B0 + 64)
         *   dcbz (_B0 + 64 + 32)
         *   stvx _VR+0, (_B0+ 0)
         *   stvx _VR+1, (_B0+16)
         *   stvx _VR+2, (_B0+32)
         *   stvx _VR+3, (_B0+48)
         *
         * _LRU:  may be 'l' or empty. The former variant should be
         *        used when it is conceivable that the memory area is
         *        unlikely to be used in the near future thus making
         *        it a candidate for early eviction from the caches.
         *
         *        If it is likely that the memory area is reused soon
         *        (e.g., save/restore across ISR execution) then the
         *        'stvx' opcode (w/o 'l' suffix) should be used.
         *
         * _VR:   first of four target vector registers; _VR+0,
         *        _VR+1, _VR+2, _VR+3 are saved.
         *
         * _B0:   base address of memory area.
         * _B1:   should contain _B0+16 on entry
         * _B2:   should contain _B0+32 on entry
         * _B3:   should contain _B0+48 on entry
         *
         * _O1:   contains the offset where the four vectors are
         *        stored.
         *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
         *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
         *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
         *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
         * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
         *        used to address the two cache-lines past the
         *        current memory area.
         *
         * MODIFIES: _O2; contains _O1 + 64 after execution of this
         *        code.
         *
         * NOTES: a different set of four vectors can be addressed
         *        simply by changing the one offset register _O1.
         *
         *        Saving more than 4 registers can simply be
         *        achieved by expanding this macro multiple
         *        times with _O1 and _O2 swapped (new _O1
         *        becomes _O2 = old _O1 + 64) thus stepping
         *        through the memory area.
         *
         */
        .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        dcbz  \_B0, \_O2
        dcbz  \_B2, \_O2
        LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save eight vector registers by expanding S4VEC_P twice.
         * See notes for S4VEC_P above.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         *
         * MODIFIES: After execution,
         *           _O2 contains original _O1 +  64,
         *           _O1 contains original _O1 + 128
         *
         * NOTES:    Expanding this macro multiple times lets you save
         *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
         */
        .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        /* Note that the roles of _O1 and _O2 are swapped here */
        S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
        .endm
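
        /*
         * Editor's sketch of the net effect (illustrative, not assembled):
         * after a CMP_BASES/PREP_FOR_SAVE style setup with _O1 = 0 (r10 = 0,
         * r4 = r3+16, r5 = r3+32, r6 = r3+48),
         *
         *     S8VEC_P   l  v0  r3 r4 r5 r6 r10 r11
         *
         * stores v0..v3 (stvxl) at r3+0,16,32,48 and v4..v7 at
         * r3+64,80,96,112, zeroing the cache lines covering r3+64..r3+191
         * along the way; on exit r11 (_O2) = 64 and r10 (_O1) = 128, so a
         * following expansion continues seamlessly at r3+128.
         */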

        /*
         * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 256
         *           _O2 contains original _O1 + 256 - 64
         */
        .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
        .endm

        /*
         * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
         *
         * See notes above (for S4VEC_P, S_V0TOV19).
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 128
         *           _O2 contains original _O1 + 128 - 64
         */
        .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
        LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
        .endm

        /*
         * Save all registers to memory area
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
         * MODIFIES: _O1 contains original _O1 + 384
         *           _O2 contains original _O1 + 448 (= + 512 - 64)
         */
        .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
        LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
        .endm


        /*
         * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
         * We can pass either of them as arguments to another macro which
         * allows us to decide if the main macro uses dcbt or not when
         * we expand it...
         */
        .macro DO_DCBT _RA, _RB
        dcbt \_RA, \_RB
        .endm

        .macro NO_DCBT _RA, _RB
        .endm

        /*
         * NOTE REGARDING dcbt VS dst
         *
         * Preloading the cache with memory areas that we soon need
         * can be done either using 'dcbt' or 'dst' instructions
         * "ahead of time".
         * When experimenting (on a mpc7457) I found that the 'dst'
         * stream instruction was very efficient if there is enough
         * time to read ahead. It works well when we do a context
         * switch:
         *
         *   1) start DST on new context to be loaded
         *   2) save old context to memory
         *   3) load new context from memory
         *
         * Because of the interleaved step 2) dst works nicely and
         * 3) finds what it needs in the cache.
         *
         * However, in a situation when there is not much time
         * to start the DST, e.g., because we want to restore
         * a context out of the blue (e.g., after returning
         * from an ISR):
         *
         *   1) save volatile registers to memory/stack
         *   2) execute ISR
         *   3) might do a task context switch
         *   4) when returned to old task context then
         *      reload volatile registers from memory/stack.
         *
         * In this situation, preloading the target memory before
         * or after step 1) obviously makes no sense because after
         * 1) the register area is most likely in the cache already.
         *
         * Starting preload after 2) doesn't make much sense either.
         * If the ISR doesn't lead to a context switch then it is quite
         * likely that the register area is still in the cache.
         * OTOH, if a context switch happens then the preload after 2)
         * might be useless.
         *
         * This leaves us at step 4) where we want to load immediately.
         * In this case, I found that 'dcbt' works more efficiently
         * so that's what we use when restoring volatile registers.
         *
         * When restoring the non-volatile VRs during a 'normal'
         * context switch we use DST (and no dcbt).
         */
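
        /*
         * Editor's sketch (illustrative) contrasting the two preload
         * flavours discussed above:
         *
         *     dcbt  rA, rB        # touch the single cache line at EA = (rA|0) + rB
         *     dstt  rA, rCTL, 0   # start transient prefetch stream 0 at (rA);
         *                         # block size/count/stride taken from rCTL
         */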

        /*
         * Symmetric to S4VEC_P above but addresses loading four
         * vector registers from memory.
         *
         * Touches two cache lines past the current memory area
         * and loads four vectors from the current area.
         *
         * Optionally, the DCBT operation may be omitted
         * (when expanding with _DCBT=NO_DCBT).
         * This is useful if the cache was already preloaded
         * by another means (dst instruction).
         *
         * NOTE: We always use the 'LRU' form of lvx: lvxl,
         *       because we deem it unlikely that the context
         *       that was just loaded has to be saved again
         *       to memory in the immediate future.
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O2 contains original _O1 + 64.
         *           _VR.._VR+3 loaded from memory.
         */
        .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
        /* preload/touch 2 lines at offset 64 from _B0 */
        \_DCBT   \_B0, \_O2
        \_DCBT   \_B2, \_O2
        /* load four vectors at offset 0 from _B0      */
        LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Symmetric to S8VEC_P; loads 8 vector registers
         * from memory -- see comments above...
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 +  64.
         *           _VR.._VR+7 loaded from memory.
         */
        .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
        L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
        .endm

        /*
         * Load volatile vector registers v0..v19 employing
         * the DCBT to preload the cache. The rationale for
         * using DCBT here but not when restoring non-volatile
         * registers is explained above, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 256.
         *           _O2 contains original _O1 + 256 - 64.
         *           VR0..VR19 loaded from memory.
         */
        .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load non-volatile vector registers v20..v31.
         * Note that no DCBT is performed since we use
         * DST for preloading the cache during a context
         * switch, see
         *
         *    "NOTE REGARDING dcbt VS dst"
         *
         * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
         *           as explained above.
         *
         * MODIFIES: _O1 contains original _O1 + 128.
         *           _O2 contains original _O1 + 128 - 64.
         *           VR20..VR31 loaded from memory.
         */
        .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
        .endm

        /*
         * Load all registers from memory area.
         */
        .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
        L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
        LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
        .endm

        /*
         * Compute
         *     _B1 = _B0 + 16
         *     _B2 = _B0 + 32
         *     _B3 = _B0 + 48
         * and load
         *     _RO = 0
         *
         * Convenience macro to be expanded before
         * any of the load/store macros that use
         * four base addresses etc.
         *
         * INPUT: _B0 = cache-aligned start of memory area
         *
         * MODIFIES: _B1, _B2, _B3, _RO as described above.
         */
        .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
        addi       \_B1, \_B0, 1*VECSIZE
        addi       \_B2, \_B0, 2*VECSIZE
        addi       \_B3, \_B0, 3*VECSIZE
        li         \_RO, 0
        .endm

        /*
         * Prepare for saving general vector registers.
         *
         * If not built with #define IGNORE_VRSAVE then
         *
         *  1) copy vrsave to CRC
         *
         * endif
         *
         *  2) copy vrsave to _VRSAVE_REG
         *  3) preload/zero cache line where vrsave and vscr are stored.
         *  4) compute base addresses from _B0
         *  5) preload/zero first two cache lines (remember that the
         *     first S8VEC_P starts preloading/zeroing at offset 64).
         *
         * INPUT:    'vrsave' register, _B0 (base address of memory area)
         * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
         *           _B0 = original _B0 + 32
         *           _B1 = original _B0 + 32 + 16,
         *           _B2 = original _B0 + 32 + 32,
         *           _B3 = original _B0 + 32 + 48,
         *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
         */
        .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
        mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
        mtcr       \_VRSAVE_REG
#endif
        dcbz       0, \_B0
        addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
        dcbz       0, \_B0
        CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
        dcbz       0, \_B2
        .endm
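
        /*
         * Editor's note (layout implied by the offsets above; illustrative):
         * relative to the cache-aligned start of the save area, PREP_FOR_SAVE
         * and S_VSCR_VRSAVE (below) produce
         *
         *     +  0 .. + 15   unused (zeroed)
         *     + 16           VRSAVE_OFF: 'vrsave' (word)
         *     + 28           VSCR_OFF:   'vscr'   (word)
         *     + 32 ...       vector registers, 16 bytes each, at _O1 = 0, 16, ...
         */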

        /*
         * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
         * must have been loaded from 'vrsave' and 'vscr', respectively,
         * prior to expanding this macro.
         *
         * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
         *           _VSCR_VREG  VR  holding 'vscr'   contents
         *           _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG
         */
        .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
        stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
        stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
        .endm

        /*
         * Load 'vrsave' and 'vscr' from memory.
         *
         * INPUTS:   _B0 cache-aligned (base) address of memory area.
         * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
         *           'vscr', 'vrsave'.
         *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
         *           with IGNORE_VRSAVE undefined).
         */
        .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
        lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
        mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
        mtcr      \_SCRATCH_REG
#endif
        li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
        lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
        mtvscr    \_SCRATCH_VREG
        .endm

        /*
         * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
         *
         * INPUT:    _B0
         * MODIFIES: _B0 (as stated above)
         */
        .macro CACHE_DOWNALGN _B0
        rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
        .endm
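
        /*
         * Editor's example (illustrative): with PPC_CACHE_ALIGNMENT = 32,
         * CACHE_DOWNALGN clears the low 5 address bits, e.g.
         *
         *     0x0001234f  ->  0x00012340
         */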

        .text

        .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 (still cache-aligned) points past the
         * vrsave/vscr line to the register area, and r4, r5, r6 are offset
         * by 16, 32, and 48 bytes from r3, respectively. r10 holds zero.
         */
        S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr

        .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_restore_altivec
_CPU_Context_restore_altivec:
        /* Restore is like 'switch' but we don't have
         * to save an old context.
         * Move the argument to the second argument register, load a NULL
         * pointer into the first one, then jump to the 'switch' routine.
         */
        mr        r4, r3
        li        r3, 0
        b _CPU_Context_switch_altivec

        .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

        /* fetch offset of altivec area in context                   */
        CMPOFF    r5
        /* down-align 'to' area to cache-line boundary               */
        add       r4, r4, r5
        CACHE_DOWNALGN r4

        /* Check for PSIM                                            */
        lis       r6, _CPU_altivec_psim_cpu@ha
        lwz       r6, _CPU_altivec_psim_cpu@l(r6)
        cmpli     0, r6, 0
        /* Skip the data-stream instructions on PSIM (not implemented there) */
        bne       1f
        dssall
        /* Pre-load new context into cache                           */
        lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
        ori       r6, r6, BSTRIDE
        dstt      r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Is 'from' context == NULL ? (then we just do a 'restore') */
        cmpli     0, r3, 0
        beq       1f           /* yes: skip saving 'from' context    */

        /* SAVE NON-VOLATILE REGISTERS                               */

        /* Compute aligned destination pointer (r5 still holds offset
         * to 'altivec' area in context)
         */
        add       r3, r3, r5
        CACHE_DOWNALGN r3

        PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
        /* The manual says reading vscr can take some time, so
         * read it here (into a volatile vector register) while
         * we wait for cache blocks to be allocated.
         */
        mfvscr    v0
        S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
        /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
        S_VSCR_VRSAVE r0, v0, r3, r5

1:

        /* LOAD NON-VOLATILE REGISTERS                               */

        /* Advance past vrsave/vscr area                             */
        addi      r4, r4, PPC_CACHE_ALIGNMENT
        L_VSCR_VRSAVE r4, r0, v0
        CMP_BASES r4, r5, r6, r7, r10
        L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
        CMPOFF    r5
        add       r3, r3, r5
        CACHE_DOWNALGN r3
        lis       r5, _CPU_altivec_vrsave_initval@ha
        lwz       r5, _CPU_altivec_vrsave_initval@l(r5)
        stw       r5, VRSAVE_OFF(r3)
        lis       r6, _CPU_altivec_vscr_initval@ha
        lwz       r6, _CPU_altivec_vscr_initval@l(r6)
        stw       r6, VSCR_OFF(r3)
        blr
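
        /*
         * Editor's sketch of an assumed C-level view of the entry points in
         * this file (calling convention only; r3/r4 carry the arguments).
         * The pointer types are illustrative, not taken from a header:
         *
         *   void _CPU_save_altivec_volatile(void *buf);
         *   void _CPU_load_altivec_volatile(void *buf);
         *   void _CPU_Context_switch_altivec(void *from_ctxt, void *to_ctxt);
         *   void _CPU_Context_restore_altivec(void *to_ctxt);
         *   void _CPU_Context_initialize_altivec(void *ctxt);
         *   void _CPU_altivec_set_vrsave_initval(unsigned long val);
         *
         * The three context routines expect a task context pointer; the
         * altivec area is located _CPU_altivec_ctxt_off bytes into it.
         */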

        /*
         * Change the initial value of VRSAVE.
         * Can be used by initialization code if
         * it is determined that code was compiled
         * with -mvrsave=no. In this case, VRSAVE
         * must be set to all-ones which causes this
         * support code to save/restore *all* registers
         * (only has an effect if IGNORE_VRSAVE is
         * not defined -- otherwise all registers are
         * saved/restored anyway).
         */
        .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
        lis       r5, _CPU_altivec_vrsave_initval@ha
        stw       r3, _CPU_altivec_vrsave_initval@l(r5)
        mtvrsave  r3
        blr

#ifdef ALTIVEC_TESTING
        .global msr_VE_on
msr_VE_on:
        mfmsr r3
        oris  r3, r3, 1<<(31-6-16)
        mtmsr r3
        blr

        .global msr_VE_off
msr_VE_off:
        mfmsr r3
        lis   r4,  1<<(31-6-16)
        andc  r3, r3, r4
        mtmsr r3
        blr
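
        /*
         * Editor's note (illustrative): 1<<(31-6-16) = 0x0200, so the 'oris'
         * and 'andc' above set/clear MSR bit 6 (mask 0x02000000), i.e. the
         * AltiVec 'vector unit available' (VE) bit toggled by these helpers.
         */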


        .global mfvrsave
mfvrsave:
        mfvrsave r3
        blr

        .global mtvrsave
mtvrsave:
        mtvrsave r3
        blr

        /* Load all vector registers from memory area.
         * NOTE: This routine is not strictly ABI compliant --
         *       it guarantees that volatile vector registers
         *       have certain values on exit!
         */
        .global _CPU_altivec_load_all
_CPU_altivec_load_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        /* Try to preload 1st line (where vscr and vrsave are stored) */
        dcbt      0, r3
        /* Point to start of general vector-register area             */
        addi      r3, r3, PPC_CACHE_ALIGNMENT
        /* Start preloading 2nd line (where first two vectors are)    */
        dcbt      0, r3
        L_VSCR_VRSAVE r3, r0, v0
        CMP_BASES     r3, r4, r5, r6, r10
        /* Start preloading 3rd line (where vectors 3 and 4 are)      */
        dcbt      0, r5
        L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
        mtcr      r12
#endif
        blr

        .global _CPU_altivec_save_all
_CPU_altivec_save_all:
        /* Align address up to next cache-line boundary */
        addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
        CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
        /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
         * when testing if we really should do the load/store operation.
         */
        mfcr      r12
#endif

        PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
        /* r0 now contains VRSAVE; r3 (still cache-aligned) points past the
         * vrsave/vscr line to the register area, and r4, r5, r6 are offset
         * by 16, 32, and 48 bytes from r3, respectively. r10 holds zero.
         */
        S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
        mfvscr        v0
        /* Store vrsave (still in r0) and vscr (in v0) to memory area */
        S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
        /* Restore CRC */
        mtcr      r12
#endif
        blr


#if 0
        .gnu_attribute 4,1
        .gnu_attribute 8,1
#endif

#endif
#endif