#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management. */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *     Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *     under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software. Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

    .set v0,   0
    .set v8,   8
    .set v16, 16
    .set v20, 20
    .set v24, 24
    .set v28, 28

    .set r0,   0
    .set r3,   3
    .set r4,   4
    .set r5,   5
    .set r6,   6
    .set r7,   7

    .set r10, 10
    .set r11, 11
    .set r12, 12

    .set cr5,  5

    .set VECSIZE, 16

    .set VRSAVE_INIT_VAL, 0
    .set VSCR_INIT_VAL,   0

    .set VRSAVE_OFF, 16
    .set VSCR_OFF,   16+12
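
/*
 * Resulting layout of the first cache line of the save area (a sketch
 * derived from PREP_FOR_SAVE / S_VSCR_VRSAVE below; bytes not listed
 * are apparently just padding):
 *
 *    bytes 16..19 : saved 'vrsave' (VRSAVE_OFF = 16)
 *    bytes 28..31 : saved 'vscr'   (VSCR_OFF   = 28)
 *
 * The vector registers themselves start at the next cache line,
 * i.e., at offset PPC_CACHE_ALIGNMENT from the aligned base.
 */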

    .set ds0,  0

/* Block size for dst -- in units of 16 bytes */
    .set BSIZE,   2         /* = 32 bytes */
    .set BCNT,    12/2+1    /* 12 non-volatile registers + area for vscr/vrsave */
    .set BSTRIDE, 32        /* bytes */
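
/*
 * For reference: 'dst' (and 'dstt') takes its parameters from a GPR
 * laid out as
 *
 *    (BSIZE << 24) | (BCNT << 16) | BSTRIDE
 *
 * (block size in 16-byte units, block count, byte stride). The
 * context-switch code below builds exactly this word:
 *
 *    lis  r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
 *    ori  r6, r6, BSTRIDE
 *    dstt r4, r6, ds0
 *
 * i.e., prefetch BCNT = 7 blocks of 32 bytes, 32 bytes apart, starting
 * at the cache-aligned context area in r4.
 */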

    .data

    .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
    .long 0

    .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
    .long 0

    .text

    .extern _CPU_altivec_psim_cpu
    .extern _CPU_altivec_ctxt_off

    .macro CMPOFF _B0
    lis \_B0, _CPU_altivec_ctxt_off@ha
    lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0)
    .endm

/* Conditionally load or store a vector _VR to
 * EA(_R1|0 + _R2).
 * If the bit corresponding to _VR is set in CRC
 * then the load/store is performed; otherwise
 * it is skipped.
 * If compiled with IGNORE_VRSAVE defined then
 * the load/store is done unconditionally.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _R1    : base register (NOTE: _R1=r0 uses an
 *          implicit ZERO constant, not the contents
 *          of r0) for address computation.
 * _R2    : 'offset' register for address computation.
 *
 * MODIFIES:     _VR on output if a load operation is performed.
 * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE
 *               defined).
 */
    .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
    bc 4, \_VR, 111f
#endif
    \_OPCODE \_VR, \_R1, \_R2
111:
    .endm
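
/*
 * For example (assuming IGNORE_VRSAVE is not defined, and using v0 = 0),
 *
 *    LDST _OPCODE=stvx _VR=v0 _R1=r3 _R2=r10
 *
 * expands to
 *
 *    bc   4, 0, 111f     (skip if CR bit 0 -- 'v0 in use' -- is clear)
 *    stvx 0, r3, r10     (store v0 to r3 + r10)
 * 111:
 *
 * 'bc 4, \_VR, 111f' branches when CR bit \_VR is CLEAR; this works
 * because CRC was previously loaded from 'vrsave' (bit i set <=> vi
 * is in use).
 */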

/*
 * Load or store four 'adjacent' vector registers.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : first target vector register
 * _B0    : base register 0 (NOTE: r0 as a base uses an
 *          implicit ZERO constant, not the contents of r0,
 *          for address computation)
 * _B1    : base register 1
 * _B2    : base register 2
 * _B3    : base register 3
 * _RO    : offset register
 *
 * Memory addresses for _VR, _VR+1, _VR+2, _VR+3
 * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
 *
 * MODIFIES:     _VR, _VR+1, _VR+2, _VR+3 if a load
 *               operation is performed.
 * IMPLICIT USE: see LDST
 */
    .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
    .endm

/*
 * Preload/zero two cache lines and save 4 vector registers
 * to memory.
 * Note that the cache operation targets memory *past* the
 * current storage area, which should hopefully hit when
 * this same code is executed on the next two cache lines...
 *
 * This code effectively does
 *    dcbz (_B0 + 64)
 *    dcbz (_B0 + 64 + 32)
 *    stvx _VR+0, (_B0+ 0)
 *    stvx _VR+1, (_B0+16)
 *    stvx _VR+2, (_B0+32)
 *    stvx _VR+3, (_B0+48)
 *
 * _LRU: may be 'l' or empty. The former variant should be
 *       used when it is conceivable that the memory area is
 *       unlikely to be used in the near future, thus making
 *       it a candidate for early eviction from the caches.
 *
 *       If it is likely that the memory area is reused soon
 *       (e.g., save/restore across ISR execution) then the
 *       'stvx' opcode (w/o 'l' suffix) should be used.
 *
 * _VR:  first of four target vector registers; _VR+0,
 *       _VR+1, _VR+2, _VR+3 are saved.
 *
 * _B0:  base address of memory area.
 * _B1:  should contain _B0+16 on entry
 * _B2:  should contain _B0+32 on entry
 * _B3:  should contain _B0+48 on entry
 *
 * _O1:  contains the offset where the four vectors are
 *       stored.
 *         _VR   -> (_B0 + _O1) = (_B0 + _O1 +  0 )
 *         _VR+1 -> (_B1 + _O1) = (_B0 + _O1 + 16 )
 *         _VR+2 -> (_B2 + _O1) = (_B0 + _O1 + 32 )
 *         _VR+3 -> (_B3 + _O1) = (_B0 + _O1 + 48 )
 * _O2:  is set to _O1 + 64 by this macro. Hence _O2 is
 *       used to address the two cache lines past the
 *       current memory area.
 *
 * MODIFIES: _O2; contains _O1 + 64 after execution of this
 *       code.
 *
 * NOTES: a different set of four vectors can be addressed
 *       simply by changing the one offset register _O1.
 *
 *       Saving more than 4 registers can simply be
 *       achieved by expanding this macro multiple
 *       times with _O1 and _O2 swapped (the new _O1
 *       becomes _O2 = old _O1 + 64), thus stepping
 *       through the memory area.
 */
    .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    dcbz \_B0, \_O2
    dcbz \_B2, \_O2
    LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
    .endm

/*
 * Save eight vector registers by expanding S4VEC_P twice.
 * See notes for S4VEC_P above.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 *
 * MODIFIES: After execution,
 *           _O2 contains original _O1 +  64,
 *           _O1 contains original _O1 + 128
 *
 * NOTES:    Expanding this macro multiple times lets you save
 *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
 */
    .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    /* Note that the roles of _O1 and _O2 are swapped here */
    S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
    .endm
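
/*
 * To illustrate the stepping (a sketch assuming _O1 = 0 on entry;
 * offsets are relative to _B0):
 *
 *  1st S8VEC_P: stores _VR.._VR+7 at offsets 0..127 and dcbz's
 *               lines 64..191 ahead of the stores;
 *               leaves _O1 = 128, _O2 = 64.
 *  2nd S8VEC_P: stores the next eight VRs at offsets 128..255 and
 *               dcbz's lines 192..319; leaves _O1 = 256, _O2 = 192.
 */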

/*
 * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
 *
 * See notes above (for S4VEC_P).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 256
 *           _O2 contains original _O1 + 256 - 64
 */
    .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
    .endm

/*
 * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
 *
 * See notes above (for S4VEC_P, S_V0TOV19).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 128
 *           _O2 contains original _O1 + 128 - 64
 */
    .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
    .endm

/*
 * Save all registers to memory area
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 512 - 128
 *           _O2 contains original _O1 + 512 - 64
 */
    .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P l v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P l v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2
    .endm


/*
 * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
 * We can pass either of them as arguments to another macro, which
 * allows us to decide whether the main macro uses dcbt when
 * we expand it...
 */
    .macro DO_DCBT _RA, _RB
    dcbt \_RA, \_RB
    .endm

    .macro NO_DCBT _RA, _RB
    .endm

/*
 * NOTE REGARDING dcbt VS dst
 *
 * Preloading the cache with memory areas that we soon need
 * can be done either with 'dcbt' or 'dst' instructions
 * "ahead of time".
 * When experimenting (on an mpc7457) I found that the 'dst'
 * stream instruction was very efficient if there is enough
 * time to read ahead. It works well when we do a context
 * switch:
 *
 *   1) start DST on new context to be loaded
 *   2) save old context to memory
 *   3) load new context from memory
 *
 * Because of the interleaved step 2), dst works nicely and
 * 3) finds what it needs in the cache.
 *
 * However, there are situations when there is not much time
 * to start the DST, e.g., because we want to restore
 * a context out of the blue (e.g., after returning
 * from an ISR):
 *
 *   1) save volatile registers to memory/stack
 *   2) execute ISR
 *   3) might do a task context switch
 *   4) when returned to old task context then
 *      reload volatile registers from memory/stack.
 *
 * In this situation, preloading the target memory before
 * or after step 1) obviously makes no sense because after
 * 1) the register area is most likely in the cache already.
 *
 * Starting the preload after 2) doesn't make much sense either.
 * If the ISR doesn't lead to a context switch then it is quite
 * likely that the register area is still in the cache.
 * OTOH, if a context switch happens then the preload after 2)
 * might be useless.
 *
 * This leaves us at step 4) where we want to load immediately.
 * In this case, I found that 'dcbt' works more efficiently,
 * so that's what we use when restoring volatile registers.
 *
 * When restoring the non-volatile VRs during a 'normal'
 * context switch we use DST (and no dcbt).
 */

/*
 * Symmetric to S4VEC_P above but addresses loading four
 * vector registers from memory.
 *
 * Touches two cache lines past the current memory area
 * and loads four vectors from the current area.
 *
 * Optionally, the DCBT operation may be omitted
 * (when expanding with _DCBT=NO_DCBT).
 * This is useful if the cache was already preloaded
 * by another means (dst instruction).
 *
 * NOTE: We always use the 'LRU' form of lvx: lvxl,
 *       because we deem it unlikely that the context
 *       that was just loaded has to be saved again
 *       to memory in the immediate future.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O2 contains original _O1 + 64.
 *           _VR.._VR+3 loaded from memory.
 */
    .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    /* preload/touch 2 lines at offset 64 from _B0 */
    \_DCBT \_B0, \_O2
    \_DCBT \_B2, \_O2
    /* load four vectors at offset 0 from _B0 */
    LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Symmetric to S8VEC_P; loads 8 vector registers
 * from memory -- see comments above...
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 64.
 *           _VR.._VR+7 loaded from memory.
 */
    .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
    .endm

/*
 * Load volatile vector registers v0..v19 employing
 * the DCBT to preload the cache. The rationale for
 * using DCBT here but not when restoring non-volatile
 * registers is explained above, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 256.
 *           _O2 contains original _O1 + 256 - 64.
 *           VR0..VR19 loaded from memory.
 */
    .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Load non-volatile vector registers v20..v31.
 * Note that no DCBT is performed since we use
 * DST for preloading the cache during a context
 * switch, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 128 - 64.
 *           VR20..VR31 loaded from memory.
 */
    .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Load all registers from memory area.
 */
    .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2
    .endm

/*
 * Compute
 *    _B1 = _B0 + 16
 *    _B2 = _B0 + 32
 *    _B3 = _B0 + 48
 * and load
 *    _RO = 0
 *
 * Convenience macro to be expanded before
 * any of the load/store macros that use
 * four base addresses etc.
 *
 * INPUT:    _B0 = cache-aligned start of memory area
 *
 * MODIFIES: _B1, _B2, _B3, _RO as described above.
 */
    .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
    addi \_B1, \_B0, 1*VECSIZE
    addi \_B2, \_B0, 2*VECSIZE
    addi \_B3, \_B0, 3*VECSIZE
    li   \_RO, 0
    .endm

/*
 * Prepare for saving general vector registers.
 *
 * If not built with IGNORE_VRSAVE defined then
 *
 *    1) copy 'vrsave' to CRC
 *
 * endif
 *
 *    2) copy 'vrsave' to _VRSAVE_REG
 *    3) preload/zero cache line where 'vrsave' and 'vscr' are stored.
 *    4) compute base addresses from _B0
 *    5) preload/zero first two cache lines (remember that the
 *       first S8VEC_P starts preloading/zeroing at offset 64).
 *
 * INPUT:    'vrsave' register, _B0 (base address of memory area)
 * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
 *           _B0 = original _B0 + 32
 *           _B1 = original _B0 + 32 + 16,
 *           _B2 = original _B0 + 32 + 32,
 *           _B3 = original _B0 + 32 + 48,
 *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
 */
    .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
    mfvrsave \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
    mtcr \_VRSAVE_REG
#endif
    dcbz 0, \_B0
    addi \_B0, \_B0, PPC_CACHE_ALIGNMENT
    dcbz 0, \_B0
    CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
    dcbz 0, \_B2
    .endm
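
/*
 * Net effect, relative to the original cache-aligned _B0 (a sketch):
 * line 0 (vrsave/vscr) and lines 1..2 (the first four vectors) are
 * zeroed/allocated here, and _B0 is advanced past line 0; the first
 * S8VEC_P expansion then continues zeroing at line 3 (offset 64 from
 * the advanced _B0).
 */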

/*
 * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
 * must have been loaded from 'vrsave' and 'vscr', respectively,
 * prior to expanding this macro.
 *
 * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
 *           _VSCR_VREG  VR  holding 'vscr' contents
 *           _B0 cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG
 */
    .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
    stw    \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    li     \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG
    .endm
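
/*
 * Why stvewx works here (a sketch of the arithmetic): mfvscr leaves
 * 'vscr' in the low-order word (word element 3) of the vector
 * register. With _B0 cache-aligned (and already advanced by 32), the
 * EA computed above is _B0 - 32 + 28, i.e., 0xC modulo 16, so stvewx
 * selects word element (EA >> 2) & 3 = 3 -- exactly the element
 * holding 'vscr'.
 */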

/*
 * Load 'vrsave' and 'vscr' from memory.
 *
 * INPUTS:   _B0 cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
 *           'vscr', 'vrsave'.
 *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
 *           with IGNORE_VRSAVE undefined).
 */
    .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
    lwz      \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    mtvrsave \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
    mtcr     \_SCRATCH_REG
#endif
    li       \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    lvewx    \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
    mtvscr   \_SCRATCH_VREG
    .endm

/*
 * _B0 &= ~(PPC_CACHE_ALIGNMENT - 1)
 *
 * INPUT:    _B0
 * MODIFIES: _B0 (as stated above)
 */
    .macro CACHE_DOWNALGN _B0
    rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
    .endm
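
/*
 * For example, with PPC_CACHE_ALIGNMENT = 32 (LD_PPC_CACHE_ALIGNMENT = 5)
 * the rlwinm keeps bits 0..26 (big-endian numbering), i.e., it clears
 * the five least-significant bits:
 *
 *    _B0 = 0x1234567f  ->  _B0 = 0x12345660
 */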

    .text

    .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero.
     */
    S_V0TOV19 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r12
#endif
    blr

    .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r5, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r5
    L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_Context_restore_altivec
_CPU_Context_restore_altivec:
    /* Restore is like 'switch' but we don't have
     * to save an old context.
     * Move the argument to the second argument slot,
     * load a NULL pointer into the first one, then
     * jump to the 'switch' routine.
     */
    mr r4, r3
    li r3, 0
    b  _CPU_Context_switch_altivec

    .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

    /* fetch offset of altivec area in context */
    CMPOFF r5
    /* down-align 'to' area to cache-line boundary */
    add r4, r4, r5
    CACHE_DOWNALGN r4

    /* Check for PSIM */
    lis   r6, _CPU_altivec_psim_cpu@ha
    lwz   r6, _CPU_altivec_psim_cpu@l(r6)
    cmpli 0, r6, 0
    bne   1f
    /* Skip data-stream instructions on PSIM (not implemented) */
    dssall
    /* Pre-load new context into cache */
    lis  r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
    ori  r6, r6, BSTRIDE
    dstt r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Is the 'from' context == NULL? (then we just do a 'restore') */
    cmpli 0, r3, 0
    beq   1f            /* yes: skip saving the 'from' context */

    /* SAVE NON-VOLATILE REGISTERS */

    /* Compute aligned destination pointer (r5 still holds the offset
     * to the 'altivec' area in the context)
     */
    add r3, r3, r5
    CACHE_DOWNALGN r3

    PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
    /* The manual says reading vscr can take some time, so
     * read it here (into a volatile vector register) while
     * we wait for cache blocks to be allocated.
     */
    mfvscr v0
    S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
    /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
    S_VSCR_VRSAVE r0, v0, r3, r5

1:

    /* LOAD NON-VOLATILE REGISTERS */

    /* Advance past the vrsave/vscr area */
    addi r4, r4, PPC_CACHE_ALIGNMENT
    L_VSCR_VRSAVE r4, r0, v0
    CMP_BASES r4, r5, r6, r7, r10
    L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
    CMPOFF r5
    add r3, r3, r5
    CACHE_DOWNALGN r3
    lis r5, _CPU_altivec_vrsave_initval@ha
    lwz r5, _CPU_altivec_vrsave_initval@l(r5)
    stw r5, VRSAVE_OFF(r3)
    lis r6, _CPU_altivec_vscr_initval@ha
    lwz r6, _CPU_altivec_vscr_initval@l(r6)
    stw r6, VSCR_OFF(r3)
    blr

/*
 * Change the initial value of VRSAVE.
 * Can be used by initialization code if
 * it is determined that code was compiled
 * with -mvrsave=no. In this case, VRSAVE
 * must be set to all-ones, which causes this
 * support code to save/restore *all* registers
 * (this only has an effect if IGNORE_VRSAVE is
 * not defined -- otherwise all registers are
 * saved/restored anyway).
 */
    .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
    lis r5, _CPU_altivec_vrsave_initval@ha
    stw r3, _CPU_altivec_vrsave_initval@l(r5)
    mtvrsave r3
    blr
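
/*
 * E.g., from C (a hypothetical call site; the new initial value is
 * taken from r3, i.e., the first integer argument):
 *
 *    extern void _CPU_altivec_set_vrsave_initval(uint32_t);
 *
 *    _CPU_altivec_set_vrsave_initval( 0xffffffff ); -- mark all VRs in use
 */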

#ifdef ALTIVEC_TESTING
    .global msr_VE_on
msr_VE_on:
    mfmsr r3
    oris  r3, r3, 1<<(31-6-16)
    mtmsr r3
    blr

    .global msr_VE_off
msr_VE_off:
    mfmsr r3
    lis   r4, 1<<(31-6-16)
    andc  r3, r3, r4
    mtmsr r3
    blr


    .global mfvrsave
mfvrsave:
    mfvrsave r3
    blr

    .global mtvrsave
mtvrsave:
    mtvrsave r3
    blr

/* Load all vector registers from memory area.
 * NOTE: This routine is not strictly ABI compliant --
 *       it guarantees that volatile vector registers
 *       have certain values on exit!
 */
    .global _CPU_altivec_load_all
_CPU_altivec_load_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r5, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r5
    L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_altivec_save_all
_CPU_altivec_save_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero.
     */
    S_V0TOV31 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r12
#endif
    blr


#if 0
    .gnu_attribute 4,1
    .gnu_attribute 8,1
#endif

#endif /* ALTIVEC_TESTING */
#endif /* __ALTIVEC__ */