#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management. */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann, 2009,
 *     Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *     under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */

#include <rtems/powerpc/powerpc.h>	/* provides PPC_CACHE_ALIGNMENT */

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

	.set	v0,   0
	.set	v8,   8
	.set	v16, 16
	.set	v20, 20
	.set	v24, 24
	.set	v28, 28

	.set	r0,   0
	.set	r3,   3
	.set	r4,   4
	.set	r5,   5
	.set	r6,   6
	.set	r7,   7

	.set	r9,   9
	.set	r10, 10
	.set	r11, 11
	/* Do not use r12, since this is used by _CPU_Context_switch() */

	.set	cr5,  5

	.set	VECSIZE,         16

	.set	VRSAVE_INIT_VAL,  0
	.set	VSCR_INIT_VAL,    0

	.set	VRSAVE_OFF,      16
	.set	VSCR_OFF,        16+12

	.set	ds0,              0

	/* Block size for dst -- in units of 16 bytes */
	.set	BSIZE,            2       /* = 32 bytes */
	.set	BCNT,             12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
	.set	BSTRIDE,         32       /* bytes */

	.data

	.global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
	.long	0

	.global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
	.long	0

	.text

	.extern _CPU_altivec_psim_cpu
	.extern _CPU_altivec_ctxt_off

	.macro	CMPOFF _B0
	lis	\_B0, _CPU_altivec_ctxt_off@ha
	lwz	\_B0, _CPU_altivec_ctxt_off@l(\_B0)
	.endm
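/*
 * Illustration (not part of the original comments; derived from the offsets
 * defined above): layout of the cache-aligned save area used by the routines
 * below.  Offsets are relative to the aligned base address.
 *
 *    +0  .. +15   unused (zeroed by dcbz)
 *    +16 .. +19   vrsave                     (VRSAVE_OFF)
 *    +20 .. +27   unused
 *    +28 .. +31   vscr, low word via stvewx  (VSCR_OFF)
 *    +32 .. +47   first vector register saved (v0 or v20, depending on routine)
 *    ...          further registers follow at 16-byte (VECSIZE) steps
 */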
/*
 * Conditionally load or store a vector _VR to
 * EA = (_R1|0) + _R2.
 * If the bit corresponding to _VR is set in CRC
 * then the load/store is performed; otherwise
 * it is skipped.
 * If compiled with IGNORE_VRSAVE defined then
 * the load/store is done unconditionally.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _R1    : base register (NOTE: _R1=r0 uses an
 *          implicit ZERO constant, not the contents
 *          of r0) for address computation.
 * _R2    : 'offset' register for address computation.
 *
 * MODIFIES:      _VR on output if a load operation is performed.
 * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
 *                defined).
 */
	.macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
	bc	4, \_VR, 111f
#endif
	\_OPCODE	\_VR, \_R1, \_R2
111:
	.endm

/*
 * Load or store four 'adjacent' vector registers.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _B0    : base register 0
 * _B1    : base register 1
 * _B2    : base register 2
 * _B3    : base register 3
 * _RO    : offset register
 *
 * Memory addresses for _VR, _VR+1, _VR+2, _VR+3
 * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
 *
 * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
 *                operation is performed.
 * IMPLICIT USE:  see LDST
 */
	.macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
	LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
	LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
	LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
	LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
	.endm
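/*
 * Illustrative expansion (not in the original comments; kept out of the
 * build with #if 0): with IGNORE_VRSAVE undefined,
 *
 *     LDST _OPCODE=stvx _VR=v0 _R1=r3 _R2=r10
 *
 * expands to the sequence below, i.e. the store is skipped when CR bit 0
 * (the vrsave bit for v0, copied into CR beforehand) is clear.
 */
#if 0
	bc	4, v0, 111f		/* branch past the store if CR bit v0 is 0 */
	stvx	v0, r3, r10		/* v0 -> EA = (r3|0) + r10                  */
111:
	/* With IGNORE_VRSAVE defined only the stvx remains. */
#endif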
/*
 * Preload/zero two cache lines and save 4 vector registers
 * to memory.
 * Note that the cache operation targets memory *past* the
 * current storage area, which should hopefully hit when
 * this same code is executed on the next two cache lines...
 *
 * This code effectively does
 *     dcbz (_B0 + 64)
 *     dcbz (_B0 + 64 + 32)
 *     stvx _VR+0, (_B0 +  0)
 *     stvx _VR+1, (_B0 + 16)
 *     stvx _VR+2, (_B0 + 32)
 *     stvx _VR+3, (_B0 + 48)
 *
 * _LRU:  may be 'l' or empty. The former variant should be
 *        used when it is conceivable that the memory area is
 *        unlikely to be used in the near future, thus making
 *        it a candidate for early eviction from the caches.
 *
 *        If it is likely that the memory area is reused soon
 *        (e.g., save/restore across ISR execution) then the
 *        'stvx' opcode (w/o 'l' suffix) should be used.
 *
 * _VR:   first of four target vector registers; _VR+0,
 *        _VR+1, _VR+2, _VR+3 are saved.
 *
 * _B0:   base address of memory area.
 * _B1:   should contain _B0+16 on entry
 * _B2:   should contain _B0+32 on entry
 * _B3:   should contain _B0+48 on entry
 *
 * _O1:   contains the offset where the four vectors are
 *        stored.
 *          _VR   -> (_B0 + _O1) = (_B0 + _O1 +  0)
 *          _VR+1 -> (_B1 + _O1) = (_B0 + _O1 + 16)
 *          _VR+2 -> (_B2 + _O1) = (_B0 + _O1 + 32)
 *          _VR+3 -> (_B3 + _O1) = (_B0 + _O1 + 48)
 * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
 *        used to address the two cache lines past the
 *        current memory area.
 *
 * MODIFIES: _O2; contains _O1 + 64 after execution of this
 *           code.
 *
 * NOTES: a different set of four vectors can be addressed
 *        simply by changing the one offset register _O1.
 *
 *        Saving more than 4 registers can simply be
 *        achieved by expanding this macro multiple
 *        times with _O1 and _O2 swapped (new _O1
 *        becomes _O2 = old _O1 + 64), thus stepping
 *        through the memory area.
 */
	.macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
	addi	\_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
	dcbz	\_B0, \_O2
	dcbz	\_B2, \_O2
	LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
	.endm

/*
 * Save eight vector registers by expanding S4VEC_P twice.
 * See notes for S4VEC_P above.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 *
 * MODIFIES: After execution,
 *               _O2 contains original _O1 +  64,
 *               _O1 contains original _O1 + 128
 *
 * NOTES:    Expanding this macro multiple times lets you save
 *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
 */
	.macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
	S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
	/* Note that the roles of _O1 and _O2 are swapped here */
	S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
	.endm

/*
 * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1).
 *
 * See notes above (for S4VEC_P).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 256
 *           _O2 contains original _O1 + 256 - 64
 */
	.macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
	S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
	S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
	LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
	.endm

/*
 * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1).
 *
 * See notes above (for S4VEC_P, S_V0TOV19).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 128
 *           _O2 contains original _O1 + 128 - 64
 */
	.macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
	S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
	LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
	.endm

/*
 * Save all vector registers to memory area.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 384
 *           _O2 contains original _O1 + 448
 *           (v28..v31 are stored at offsets _O1 + 448 .. _O1 + 511)
 */
	.macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
	S8VEC_P l v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
	S8VEC_P l v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
	S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
	S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
	LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2
	.endm
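/*
 * Illustrative expansion (not part of the original comments; kept out of the
 * build with #if 0): the first S4VEC_P of S_V20TOV31 as used by the
 * context-switch code further below, i.e.
 *
 *     S4VEC_P l _VR=v20 _B0=r3 _B1=r5 _B2=r6 _B3=r7 _O1=r10 _O2=r11
 *
 * with r10 = 0 on entry and r5/r6/r7 = r3+16/+32/+48.  The per-register
 * vrsave guards generated by LDST are omitted for brevity.
 */
#if 0
	addi	r11, r10, 2*PPC_CACHE_ALIGNMENT	/* r11 = r10 + 64               */
	dcbz	r3, r11				/* zero the line 64 bytes ahead */
	dcbz	r6, r11				/* zero the line 96 bytes ahead */
	stvxl	v20+0, r3, r10			/* v20 -> (r3 +  0 + r10)       */
	stvxl	v20+1, r5, r10			/* v21 -> (r3 + 16 + r10)       */
	stvxl	v20+2, r6, r10			/* v22 -> (r3 + 32 + r10)       */
	stvxl	v20+3, r7, r10			/* v23 -> (r3 + 48 + r10)       */
#endif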
/*
 * Macros that expand to 'dcbt _RA, _RB' or to nothing, respectively.
 * We can pass either of them as an argument to another macro, which
 * allows us to decide whether the main macro uses dcbt or not when
 * we expand it...
 */
	.macro DO_DCBT _RA, _RB
	dcbt	\_RA, \_RB
	.endm

	.macro NO_DCBT _RA, _RB
	.endm

/*
 * NOTE REGARDING dcbt VS dst
 *
 * Preloading the cache with memory areas that we soon need
 * can be done either using 'dcbt' or 'dst' instructions
 * "ahead of time".
 * When experimenting (on an mpc7457) I found that the 'dst'
 * stream instruction was very efficient if there is enough
 * time to read ahead. It works well when we do a context
 * switch:
 *
 *   1) start DST on new context to be loaded
 *   2) save old context to memory
 *   3) load new context from memory
 *
 * Because of the interleaved step 2), dst works nicely and
 * 3) finds what it needs in the cache.
 *
 * However, in a situation where there is not much time
 * to start the DST, e.g., because we want to restore
 * a context out of the blue (e.g., after returning
 * from an ISR):
 *
 *   1) save volatile registers to memory/stack
 *   2) execute ISR
 *   3) might do a task context switch
 *   4) when returned to old task context then
 *      reload volatile registers from memory/stack.
 *
 * In this situation, preloading the target memory before
 * or after step 1) obviously makes no sense because after
 * 1) the register area is most likely in the cache already.
 *
 * Starting preload after 2) doesn't make much sense either.
 * If the ISR doesn't lead to a context switch then it is quite
 * likely that the register area is still in the cache.
 * OTOH, if a context switch happens then the preload after 2)
 * might be useless.
 *
 * This leaves us at step 4) where we want to load immediately.
 * In this case, I found that 'dcbt' works more efficiently,
 * so that's what we use when restoring volatile registers.
 *
 * When restoring the non-volatile VRs during a 'normal'
 * context switch we shall use DST (and no dcbt).
 */

/*
 * Symmetric to S4VEC_P above but addresses loading four
 * vector registers from memory.
 *
 * Touches two cache lines past the current memory area
 * and loads four vectors from the current area.
 *
 * Optionally, the DCBT operation may be omitted
 * (when expanding with _DCBT=NO_DCBT).
 * This is useful if the cache was already preloaded
 * by another means (dst instruction).
 *
 * NOTE: We always use the 'LRU' form of lvx: lvxl,
 *       because we deem it unlikely that the context
 *       that was just loaded has to be saved again
 *       to memory in the immediate future.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O2 contains original _O1 + 64.
 *           _VR.._VR+3 loaded from memory.
 */
	.macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
	addi	\_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
	/* preload/touch 2 lines at offset 64 from _B0 */
	\_DCBT	\_B0, \_O2
	\_DCBT	\_B2, \_O2
	/* load four vectors at offset _O1 from _B0.._B3 */
	LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
	.endm

/*
 * Symmetric to S8VEC_P; loads 8 vector registers
 * from memory -- see comments above...
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 64.
 *           _VR.._VR+7 loaded from memory.
 */
	.macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
	L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
	.endm

/*
 * Load volatile vector registers v0..v19, employing
 * DCBT to preload the cache. The rationale for
 * using DCBT here but not when restoring non-volatile
 * registers is explained above, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 256.
 *           _O2 contains original _O1 + 256 - 64.
 *           VR0..VR19 loaded from memory.
 */
	.macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
	L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	LDST4   lvxl,   v16, \_B0, \_B1, \_B2, \_B3, \_O1
	.endm

/*
 * Load non-volatile vector registers v20..v31.
 * Note that no DCBT is performed since we use
 * DST for preloading the cache during a context
 * switch, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 128 - 64.
 *           VR20..VR31 loaded from memory.
 */
	.macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
	L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	LDST4   lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
	.endm

/*
 * Load all vector registers from memory area.
 */
	.macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
	L8VEC_A DO_DCBT, v0,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	L8VEC_A DO_DCBT, v8,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
	LDST4   lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
	.endm
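/*
 * Illustration (not part of the original comments; kept out of the build
 * with #if 0): the _DCBT argument selects between the two helper macros
 * defined above.  For example,
 *
 *     L4VEC_A DO_DCBT, v0, r3, r4, r5, r6, r10, r11
 *
 * expands (vrsave guards omitted) to the sequence below, whereas with
 * NO_DCBT the two dcbt lines are simply not emitted.
 */
#if 0
	addi	r11, r10, 2*PPC_CACHE_ALIGNMENT	/* r11 = r10 + 64                */
	dcbt	r3, r11				/* touch the line 64 bytes ahead */
	dcbt	r5, r11				/* touch the line 96 bytes ahead */
	lvxl	v0+0, r3, r10
	lvxl	v0+1, r4, r10
	lvxl	v0+2, r5, r10
	lvxl	v0+3, r6, r10
#endif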
/*
 * Compute
 *     _B1 = _B0 + 16
 *     _B2 = _B0 + 32
 *     _B3 = _B0 + 48
 * and load
 *     _RO = 0
 *
 * Convenience macro to be expanded before
 * any of the load/store macros that use
 * four base addresses etc.
 *
 * INPUT:    _B0 = cache-aligned start of memory area
 *
 * MODIFIES: _B1, _B2, _B3, _RO as described above.
 */
	.macro CMP_BASES _B0, _B1, _B2, _B3, _RO
	addi	\_B1, \_B0, 1*VECSIZE
	addi	\_B2, \_B0, 2*VECSIZE
	addi	\_B3, \_B0, 3*VECSIZE
	li	\_RO, 0
	.endm

/*
 * Prepare for saving general vector registers.
 *
 * If not built with #define IGNORE_VRSAVE then
 *
 *   1) copy 'vrsave' to CRC
 *
 * endif
 *
 *   2) copy 'vrsave' to _VRSAVE_REG
 *   3) preload/zero cache line where 'vrsave' and 'vscr' are stored.
 *   4) compute base addresses from _B0
 *   5) preload/zero first two cache lines (remember that the
 *      first S8VEC_P starts preloading/zeroing at offset 64).
 *
 * INPUT:    'vrsave' register, _B0 (base address of memory area)
 * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
 *           _B0 = original _B0 + 32
 *           _B1 = original _B0 + 32 + 16,
 *           _B2 = original _B0 + 32 + 32,
 *           _B3 = original _B0 + 32 + 48,
 *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
 */
	.macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
	mfvrsave \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
	mtcr	\_VRSAVE_REG
#endif
	dcbz	0, \_B0
	addi	\_B0, \_B0, PPC_CACHE_ALIGNMENT
	dcbz	0, \_B0
	CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
	dcbz	0, \_B2
	.endm

/*
 * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
 * must have been loaded from 'vrsave' and 'vscr', respectively,
 * prior to expanding this macro.
 *
 * INPUTS:   _VRSAVE_REG  GPR holding 'vrsave' contents
 *           _VSCR_VREG   VR holding 'vscr' contents
 *           _B0          cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG
 */
	.macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
	stw	\_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
	li	\_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
	stvewx	\_VSCR_VREG, \_B0, \_SCRATCH_REG
	.endm

/*
 * Load 'vrsave' and 'vscr' from memory.
 *
 * INPUTS:   _B0          cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
 *           'vscr', 'vrsave'.
 *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
 *           with IGNORE_VRSAVE undefined).
 */
	.macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
	lwz	\_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
	mtvrsave \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
	mtcr	\_SCRATCH_REG
#endif
	li	\_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
	lvewx	\_SCRATCH_VREG, \_B0, \_SCRATCH_REG
	mtvscr	\_SCRATCH_VREG
	.endm

/*
 * _B0 &= ~(PPC_CACHE_ALIGNMENT - 1)
 *
 * INPUT:    _B0
 * MODIFIES: _B0 (as stated above)
 */
	.macro CACHE_DOWNALGN _B0
	rlwinm	\_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
	.endm

	.text

	.global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
	/* Align address up to next cache-line boundary */
	addi	r3, r3, PPC_CACHE_ALIGNMENT - 1
	CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
	/* Save CRC -- it is used implicitly by all the LOAD/STORE macros
	 * when testing if we really should do the load/store operation.
	 */
	mfcr	r9
#endif

	PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
	/* r0 now contains VRSAVE, r3 still the aligned memory area
	 * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
	 * respectively. r10 holds zero.
	 */
	S_V0TOV19     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
	mfvscr        v0
	/* Store vrsave (still in r0) and vscr (in v0) to memory area */
	S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
	/* Restore CRC */
	mtcr	r9
#endif
	blr
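/*
 * Usage note (added; not part of the original comments): as implied by the
 * code above and below, _CPU_save_altivec_volatile and
 * _CPU_load_altivec_volatile take the start address of a save area in r3,
 * round it up to a cache-line boundary and then save/restore vrsave, vscr
 * and v0..v19.  The area must therefore provide one PPC_CACHE_ALIGNMENT
 * header line plus 20*16 bytes of register storage (352 bytes from the
 * aligned base; 383 bytes for a worst-case unaligned buffer).  A C caller
 * would presumably declare the routines along the lines of
 *
 *     void _CPU_save_altivec_volatile(void *buf);
 *     void _CPU_load_altivec_volatile(void *buf);
 *
 * but these prototypes are an assumption here -- the authoritative
 * declarations live in the accompanying C code, not in this file.
 */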
	.global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
	/* Align address up to next cache-line boundary */
	addi	r3, r3, PPC_CACHE_ALIGNMENT - 1
	CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
	/* Save CRC -- it is used implicitly by all the LOAD/STORE macros
	 * when testing if we really should do the load/store operation.
	 */
	mfcr	r9
#endif

	/* Try to preload 1st line (where vscr and vrsave are stored) */
	dcbt	0, r3
	/* Point to start of general vector-register area */
	addi	r3, r3, PPC_CACHE_ALIGNMENT
	/* Start preloading 2nd line (where first two vectors are) */
	dcbt	0, r3
	L_VSCR_VRSAVE r3, r0, v0
	CMP_BASES     r3, r4, r5, r6, r10
	/* Start preloading 3rd line (where vectors 3 and 4 are) */
	dcbt	0, r5
	L_V0TOV19     r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
	mtcr	r9
#endif
	blr

	.global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

	/* fetch offset of altivec area in context */
	CMPOFF	r5
	/* down-align 'to' area to cache-line boundary */
	add	r4, r4, r5
	CACHE_DOWNALGN r4

	/* Check for PSIM */
	lis	r6, _CPU_altivec_psim_cpu@ha
	lwz	r6, _CPU_altivec_psim_cpu@l(r6)
	cmpli	0, r6, 0
	bne	1f	/* Skip data-stream instructions on PSIM (not implemented) */
	dssall
	/* Pre-load new context into cache; the dst control word encodes
	 * BCNT blocks of BSIZE*16 bytes with a stride of BSTRIDE bytes,
	 * i.e. 7 x 32 bytes = the vrsave/vscr line plus v20..v31.
	 */
	lis	r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
	ori	r6, r6, BSTRIDE
	dstt	r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
	/* Save CRC -- it is used implicitly by all the LOAD/STORE macros
	 * when testing if we really should do the load/store operation.
	 */
	mfcr	r9
#endif

	/* Is 'from' context == NULL ? (then we just do a 'restore') */
	cmpli	0, r3, 0
	beq	1f	/* yes: skip saving 'from' context */

	/* SAVE NON-VOLATILE REGISTERS */

	/* Compute aligned destination pointer (r5 still holds offset
	 * to 'altivec' area in context)
	 */
	add	r3, r3, r5
	CACHE_DOWNALGN r3

	PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
	/* The manual says reading vscr can take some time -- do
	 * read it here (into a volatile vector register) while
	 * we wait for cache blocks to be allocated.
	 */
	mfvscr	v0
	S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
	/* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
	S_VSCR_VRSAVE r0, v0, r3, r5

1:
	/* LOAD NON-VOLATILE REGISTERS */

	/* Advance past vrsave/vscr area */
	addi	r4, r4, PPC_CACHE_ALIGNMENT
	L_VSCR_VRSAVE r4, r0, v0
	CMP_BASES     r4, r5, r6, r7, r10
	L_V20TOV31    r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
	mtcr	r9
#endif
	blr

	.global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
	CMPOFF	r5
	add	r3, r3, r5
	CACHE_DOWNALGN r3
	lis	r5, _CPU_altivec_vrsave_initval@ha
	lwz	r5, _CPU_altivec_vrsave_initval@l(r5)
	stw	r5, VRSAVE_OFF(r3)
	lis	r6, _CPU_altivec_vscr_initval@ha
	lwz	r6, _CPU_altivec_vscr_initval@l(r6)
	stw	r6, VSCR_OFF(r3)
	blr

/*
 * Change the initial value of VRSAVE.
 * Can be used by initialization code if
 * it is determined that code was compiled
 * with -mvrsave=no. In this case, VRSAVE
 * must be set to all-ones, which causes this
 * support code to save/restore *all* registers
 * (only has an effect if IGNORE_VRSAVE is
 * not defined -- otherwise all registers are
 * saved/restored anyway).
 */
	.global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
	lis	r5, _CPU_altivec_vrsave_initval@ha
	stw	r3, _CPU_altivec_vrsave_initval@l(r5)
	mtvrsave r3
	blr

#ifdef ALTIVEC_TESTING
	.global msr_VE_on
msr_VE_on:
	mfmsr	r3
	oris	r3, r3, 1<<(31-6-16)
	mtmsr	r3
	blr

	.global msr_VE_off
msr_VE_off:
	mfmsr	r3
	lis	r4, 1<<(31-6-16)
	andc	r3, r3, r4
	mtmsr	r3
	blr
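/*
 * Added note on the immediates above (arithmetic only, for clarity): the
 * vector-enable bit is MSR bit 6 in big-endian bit numbering, i.e. the mask
 * 1<<(31-6) = 0x02000000.  Since lis/oris operate on the upper halfword,
 * the constant used is 1<<(31-6-16) = 1<<9 = 0x0200, which lands on
 * 0x02000000 after the implicit 16-bit shift.
 */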
	.global mfvrsave
mfvrsave:
	mfvrsave r3
	blr

	.global mtvrsave
mtvrsave:
	mtvrsave r3
	blr

/* Load all vector registers from memory area.
 * NOTE: This routine is not strictly ABI compliant --
 *       it guarantees that volatile vector registers
 *       have certain values on exit!
 */
	.global _CPU_altivec_load_all
_CPU_altivec_load_all:
	/* Align address up to next cache-line boundary */
	addi	r3, r3, PPC_CACHE_ALIGNMENT - 1
	CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
	/* Save CRC -- it is used implicitly by all the LOAD/STORE macros
	 * when testing if we really should do the load/store operation.
	 */
	mfcr	r9
#endif

	/* Try to preload 1st line (where vscr and vrsave are stored) */
	dcbt	0, r3
	/* Point to start of general vector-register area */
	addi	r3, r3, PPC_CACHE_ALIGNMENT
	/* Start preloading 2nd line (where first two vectors are) */
	dcbt	0, r3
	L_VSCR_VRSAVE r3, r0, v0
	CMP_BASES     r3, r4, r5, r6, r10
	/* Start preloading 3rd line (where vectors 3 and 4 are) */
	dcbt	0, r5
	L_V0TOV31     r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
	mtcr	r9
#endif
	blr

	.global _CPU_altivec_save_all
_CPU_altivec_save_all:
	/* Align address up to next cache-line boundary */
	addi	r3, r3, PPC_CACHE_ALIGNMENT - 1
	CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
	/* Save CRC -- it is used implicitly by all the LOAD/STORE macros
	 * when testing if we really should do the load/store operation.
	 */
	mfcr	r9
#endif

	PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
	/* r0 now contains VRSAVE, r3 still the aligned memory area
	 * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
	 * respectively. r10 holds zero.
	 */
	S_V0TOV31     _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
	mfvscr        v0
	/* Store vrsave (still in r0) and vscr (in v0) to memory area */
	S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
	/* Restore CRC */
	mtcr	r9
#endif
	blr

#if 0
	.gnu_attribute 4, 1
	.gnu_attribute 8, 1
#endif

#endif /* ALTIVEC_TESTING */
#endif /* __ALTIVEC__ */