Context Navigation

exception.S @ 969de1f3

4.104.114.84.95

Last change on this file since 969de1f3 was acc25ee, checked in by Joel Sherrill <joel.sherrill@…>, on 12/02/99 at 14:31:19
Merged of mcp750 and mvme2307 BSP by Eric Valette <valette@…>. As part of this effort, the mpc750 libcpu code is now shared with the ppc6xx.
Property mode set to `100644`
File size: 18.9 KB

Line
1	/*
2	* exception.S -- Exception handlers for early boot.
3	*
4	* Copyright (C) 1998, 1999 Gabriel Paubert, paubert@iram.es
5	*
6	* Modified to compile in RTEMS development environment
7	* by Eric Valette
8	*
9	* Copyright (C) 1999 Eric Valette. valette@crf.canon.fr
10	*
11	* The license and distribution terms for this file may be
12	* found in the file LICENSE in this distribution or at
13	* http://www.OARcorp.com/rtems/license.html.
14	*
15	* $Id$
16	*/
17
18	/* This is an improved version of the TLB interrupt handling code from
19	* the 603e users manual (603eUM.pdf) downloaded from the WWW. All the
20	* visible bugs have been removed. Note that many have survived in the errata
21	* to the 603 user manual (603UMer.pdf).
22	*
23	* This code also pays particular attention to optimization, takes into
24	* account the differences between 603 and 603e, single/multiple processor
25	* systems and tries to order instructions for dual dispatch in many places.
26	*
27	* The optimization has been performed along two lines:
28	* 1) to minimize the number of instruction cache lines needed for the most
29	* common execution paths (the ones that do not result in an exception).
30	* 2) then to order the code to maximize the number of dual issue and
31	* completion opportunities without increasing the number of cache lines
32	* used in the same cases.
33	*
34	* The last goal of this code is to fit inside the address range
35	* assigned to the interrupt vectors: 192 instructions with fixed
36	* entry points every 64 instructions.
37	*
38	* Some typos have also been corrected and the Power l (lowercase L)
39	* instructions replaced by lwz without comment.
40	*
41	* I have attempted to describe the reasons of the order and of the choice
42	* of the instructions but the comments may be hard to understand without
43	* the processor manual.
44	*
45	* Note that the fact that the TLB are reloaded by software in theory
46	* allows tremendous flexibility, for example we could avoid setting the
47	* reference bit of the PTE which could actually not be accessed because
48	* of protection violation by changing a few lines of code. However,
49	* this would significantly slow down most TLB reload operations, and
50	* this is the reason for which we try never to make checks which would be
51	* redundant with hardware and usually indicate a bug in a program.
52	*
53	* There are some inconsistencies in the documentation concerning the
54	* settings of SRR1 bit 15. All recent documentations say now that it is set
55	* for stores and cleared for loads. Anyway this handler never uses this bit.
56	*
57	* A final remark, the rfi instruction seems to implicitly clear the
58	* MSR<14> (tgpr)bit. The documentation claims that this bit is restored
59	* from SRR1 by rfi, but the corresponding bit in SRR1 is the LRU way bit.
60	* Anyway, the only exception which can occur while TGPR is set is a machine
61	* check which would indicate an unrecoverable problem. Recent documentation
62	* now says in some place that rfi clears MSR<14>.
63	*
64	* TLB software load for 602/603/603e/603ev:
65	* Specific Instructions:
66	* tlbld - write the dtlb with the pte in rpa reg
67	* tlbli - write the itlb with the pte in rpa reg
68	* Specific SPRs:
69	* dmiss - address of dstream miss
70	* imiss - address of istream miss
71	* hash1 - returns the primary hash PTEG address
72	* hash2 - returns secondary hash PTEG address
73	* iCmp - returns the primary istream compare value
74	* dCmp - returns the primary dstream compare value
75	* rpa - the second word of pte used by tlblx
76	* Other specific resources:
77	* cr0 saved in 4 high order bits of SRR1,
78	* SRR1 bit 14 [WAY] selects TLB set to load from LRU algorithm
79	* gprs r0..r3 shadowed by the setting of MSR bit 14 [TGPR]
80	* other bits in SRR1 (unused by this handler but see earlier comments)
81	*
82	* There are three basic flows corresponding to three vectors:
83	* 0x1000: Instruction TLB miss,
84	* 0x1100: Data TLB miss on load,
85	* 0x1200: Data TLB miss on store or not dirty page
86	*/
87
88	/* define the following if code does not have to run on basic 603 */
89	/* #define USE_KEY_BIT */
90
91	/* define the following for safe multiprocessing */
92	/* #define MULTIPROCESSING */
93
94	/* define the following for mixed endian */
95	/* #define CHECK_MIXED_ENDIAN */
96
97	/* define the following if entries always have the reference bit set */
98	#define ASSUME_REF_SET
99
100	/* Some OS kernels may want to keep a single copy of the dirty bit in a per
101	* page table. In this case writable pages are always write-protected as long
102	* as they are clean, and the dirty bit set actually means that the page
103	* is writable.
104	*/
105	#define DIRTY_MEANS_WRITABLE
106
107	#include <libcpu/cpu.h>
108	#include "asm.h"
109	#include "bootldr.h"
110
111	/*
112	* Instruction TLB miss flow
113	* Entry at 0x1000 with the following:
114	* srr0 -> address of instruction that missed
115	* srr1 -> 0:3=cr0, 13=1 (instruction), 14=lru way, 16:31=saved MSR
116	* msr<tgpr> -> 1
117	* iMiss -> ea that missed
118	* iCmp -> the compare value for the va that missed
119	* hash1 -> pointer to first hash pteg
120	* hash2 -> pointer to second hash pteg
121	*
122	* Register usage:
123	* r0 is limit address during search / scratch after
124	* r1 is pte data / error code for ISI exception when search fails
125	* r2 is pointer to pte
126	* r3 is compare value during search / scratch after
127	*/
128	/* Binutils or assembler bug ? Declaring the section executable and writable
129	* generates an error message on the @fixup entries.
130	*/
131	.section .exception,"aw"
132	# .org 0x1000 # instruction TLB miss entry point
133	.globl tlb_handlers
134	tlb_handlers:
135	.type tlb_handlers,@function
/* ISIVec and DSIVec are the branch targets used when a TLB miss has to be
   turned into an ISI (0x400) or DSI (0x300) exception. They are expressed
   relative to tlb_handlers, and the arithmetic assumes that tlb_handlers
   itself ends up at address 0x1000. */
136	#define ISIVec tlb_handlers-0x1000+0x400
137	#define DSIVec tlb_handlers-0x1000+0x300
138	mfspr r2,HASH1 # r2 = pointer to first PTE of the primary PTEG
139	lwz r1,0(r2) # Start memory access as soon as possible
140	mfspr r3,ICMP # to load the cache.
141	0: la r0,48(r2) # Use explicit loop to avoid using ctr
142	1: cmpw r1,r3 # In theory the loop is somewhat slower
143	beq- 2f # than documentation example
144	cmpw r0,r2 # but we gain from starting cache load
145	lwzu r1,8(r2) # earlier and using slots between load
146	bne+ 1b # and comparison for other purposes.
147	cmpw r1,r3 # test the word at PTEG offset 56, the last one the loop fetches
148	bne- 4f # Secondary hash check
149	2: lwz r1,4(r2) # Found: load second word of PTE
150	mfspr r0,IMISS # get miss address during load delay
151	#ifdef ASSUME_REF_SET
152	andi. r3,r1,8 # check for guarded memory
153	bne- 5f # guarded: synthesize an ISI at 5:
154	mtspr RPA,r1 # RPA = second PTE word, consumed by tlbli
155	mfsrr1 r3 # r3 = SRR1, which carries the saved cr0 bits
156	tlbli r0 # write the ITLB entry for the miss address in r0
157	#else
158	/* This is basically the original code from the manual. */
159	# andi. r3,r1,8 # check for guarded memory
160	# bne- 5f
161	# andi. r3,r1,0x100 # check R bit ahead to help folding
162	/* However there is a better solution: these last three instructions can be
163	replaced by the following which should cause less pipeline stalls because
164	both tests are combined and there is a single CR rename buffer */
165	extlwi r3,r1,6,23 # Keep only RCWIMG in 6 most significant bits.
166	rlwinm. r3,r3,5,0,27 # Keep only G (in sign) and R and test.
167	blt- 5f # Negative means guarded, zero R not set.
168	mfsrr1 r3 # get saved cr0 bits now to dual issue
169	ori r1,r1,0x100 # set the R bit (0x100) in the copy handed to the TLB
170	mtspr RPA,r1 # RPA = second PTE word, consumed by tlbli
171	tlbli r0 # write the ITLB entry for the miss address in r0
172	/* Do not update PTE if R bit already set, this will save one cache line
173	writeback at a later time, and avoid even more bus traffic in
174	multiprocessing systems, when several processors access the same PTEGs.
175	We also hope that the reference bit will be already set. */
176	bne+ 3f # cr0 still holds the rlwinm. result: nonzero means R was set
177	#ifdef MULTIPROCESSING
178	srwi r1,r1,8 # get byte 7 of pte
179	stb r1,+6(r2) # update page table
180	#else
181	sth r1,+6(r2) # update page table
182	#endif
183	#endif
184	3: mtcrf 0x80,r3 # restore CR0
185	rfi # return to executing program
186
187	/* The preceding code is 20 to 25 instructions long, which occupies
188	3 or 4 cache lines. */
189	4: andi. r0,r3,0x0040 # see if we have done second hash
190	lis r1,0x4000 # set up error code in case next branch taken
191	bne- 6f # speculatively issue the following
192	mfspr r2,HASH2 # get the second pointer
193	ori r3,r3,0x0040 # change the compare value
194	lwz r1,0(r2) # load first entry
195	b 0b # and go back to main loop
196	/* We are now at 27 to 32 instructions, using 3 or 4 cache lines for all
197	cases in which the TLB is successfully loaded. */
198
199	/* Guarded memory protection violation: synthesize an ISI exception. */
200	5: lis r1,0x1000 # set srr1<3>=1 to flag guard violation
201	/* Entry Not Found branches here with r1 correctly set. */
202	6: mfsrr1 r3 # r3 = SRR1: saved cr0 in bits 0:3, saved MSR in bits 16:31
203	mfmsr r0 # r0 = current MSR, in which TGPR is still set
204	insrwi r1,r3,16,16 # build srr1 for ISI exception
205	mtsrr1 r1 # set srr1
206	/* It seems few people have realized rlwinm can be used to clear a bit or
207	a field of contiguous bits in a register by setting mask_begin>mask_end. */
208	rlwinm r0,r0,0,15,13 # clear the msr<tgpr> bit
209	mtcrf 0x80, r3 # restore CR0
210	mtmsr r0 # flip back to the native gprs
211	isync # Required from 602 doc!
212	b ISIVec # go to instruction access exception
213	/* Up to now there are 37 to 42 instructions so at least 20 could be
214	inserted for complex cases or for statistics recording. */
215
216
217	/*
218	Data TLB miss on load flow
219	Entry at 0x1100 with the following:
220	srr0 -> address of instruction that caused the miss
221	srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=0, 16:31=saved MSR
222	msr<tgpr> -> 1
223	dMiss -> ea that missed
224	dCmp -> the compare value for the va that missed
225	hash1 -> pointer to first hash pteg
226	hash2 -> pointer to second hash pteg
227
228	Register usage:
229	r0 is limit address during search / scratch after
230	r1 is pte data / error code for DSI exception when search fails
231	r2 is pointer to pte
232	r3 is compare value during search / scratch after
233	*/
234	.org tlb_handlers+0x100
235	mfspr r2,HASH1 # r2 = pointer to first PTE of the primary PTEG
236	lwz r1,0(r2) # Start memory access as soon as possible
237	mfspr r3,DCMP # to load the cache.
238	0: la r0,48(r2) # Use explicit loop to avoid using ctr
239	1: cmpw r1,r3 # In theory the loop is somewhat slower
240	beq- 2f # than documentation example
241	cmpw r0,r2 # but we gain from starting cache load
242	lwzu r1,8(r2) # earlier and using slots between load
243	bne+ 1b # and comparison for other purposes.
244	cmpw r1,r3 # test the word at PTEG offset 56, the last one the loop fetches
245	bne- 4f # Secondary hash check
246	2: lwz r1,4(r2) # Found: load second word of PTE
247	mfspr r0,DMISS # get miss address during load delay
248	#ifdef ASSUME_REF_SET
249	mtspr RPA,r1 # RPA = second PTE word, consumed by tlbld
250	mfsrr1 r3 # r3 = SRR1, which carries the saved cr0 bits
251	tlbld r0 # write the DTLB entry for the miss address in r0
252	#else
253	andi. r3,r1,0x100 # check R bit ahead to help folding
254	mfsrr1 r3 # get saved cr0 bits now to dual issue
255	ori r1,r1,0x100 # set the R bit (0x100) in the copy handed to the TLB
256	mtspr RPA,r1 # RPA = second PTE word, consumed by tlbld
257	tlbld r0 # write the DTLB entry for the miss address in r0
258	/* Do not update PTE if R bit already set, this will save one cache line
259	writeback at a later time, and avoid even more bus traffic in
260	multiprocessing systems, when several processors access the same PTEGs.
261	We also hope that the reference bit will be already set. */
262	bne+ 3f # cr0 still holds the andi. result: nonzero means R was set
263	#ifdef MULTIPROCESSING
264	srwi r1,r1,8 # get byte 7 of pte
265	stb r1,+6(r2) # update page table
266	#else
267	sth r1,+6(r2) # update page table
268	#endif
269	#endif
270	3: mtcrf 0x80,r3 # restore CR0
271	rfi # return to executing program
272
273	/* The preceding code is 18 to 23 instructions long, which occupies
274	3 cache lines. */
275	4: andi. r0,r3,0x0040 # see if we have done second hash
276	lis r1,0x4000 # set up error code in case next branch taken
/* Label 9 is not in this handler: it is the common DSI exit located at the
   end of the store miss handler that follows in this file. */
277	bne- 9f # speculatively issue the following
278	mfspr r2,HASH2 # get the second pointer
279	ori r3,r3,0x0040 # change the compare value
280	lwz r1,0(r2) # load first entry asap
281	b 0b # and go back to main loop
282	/* We are now at 25 to 30 instructions, using 3 or 4 cache lines for all
283	cases in which the TLB is successfully loaded. */
284
285
286	/*
287	Data TLB miss on store or not dirty page flow
288	Entry at 0x1200 with the following:
289	srr0 -> address of instruction that caused the miss
290	srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=1, 16:31=saved MSR
291	msr<tgpr> -> 1
292	dMiss -> ea that missed
293	dCmp -> the compare value for the va that missed
294	hash1 -> pointer to first hash pteg
295	hash2 -> pointer to second hash pteg
296
297	Register usage:
298	r0 is limit address during search / scratch after
299	r1 is pte data / error code for DSI exception when search fails
300	r2 is pointer to pte
301	r3 is compare value during search / scratch after
302	*/
303	.org tlb_handlers+0x200
304	mfspr r2,HASH1 # r2 = pointer to first PTE of the primary PTEG
305	lwz r1,0(r2) # Start memory access as soon as possible
306	mfspr r3,DCMP # to load the cache.
307	0: la r0,48(r2) # Use explicit loop to avoid using ctr
308	1: cmpw r1,r3 # In theory the loop is somewhat slower
309	beq- 2f # than documentation example
310	cmpw r0,r2 # but we gain from starting cache load
311	lwzu r1,8(r2) # earlier and using slots between load
312	bne+ 1b # and comparison for other purposes.
313	cmpw r1,r3 # test the word at PTEG offset 56, the last one the loop fetches
314	bne- 4f # Secondary hash check
315	2: lwz r1,4(r2) # Found: load second word of PTE
316	mfspr r0,DMISS # get miss address during load delay
317	/* We could simply set the C bit and then rely on hardware to flag protection
318	violations. This raises the problem that a page which actually has not been
319	modified may be marked as dirty and violates the OEA model for guaranteed
320	bit settings (table 5-8 of 603eUM.pdf). This can have harmful consequences
321	on operating system memory management routines, and play havoc with copy on
322	write schemes. So the protection check is ABSOLUTELY necessary. */
323	andi. r3,r1,0x80 # check C bit
324	beq- 5f # if (C==0) go to check protection
325	3: mfsrr1 r3 # get the saved cr0 bits
326	mtspr RPA,r1 # set the pte
327	tlbld r0 # load the dtlb
328	mtcrf 0x80,r3 # restore CR0
329	rfi # return to executing program
330	/* The preceding code is 20 instructions long, which occupy
331	3 cache lines. */
332	4: andi. r0,r3,0x0040 # see if we have done second hash
/* NOTE(review): 0x4200 differs from the 0x4000 used by the load handler by
   0x0200, a bit that is also set in the 0x0A00 store protection code below;
   presumably it is the DSISR store flag -- confirm against the manual. */
333	lis r1,0x4200 # set up error code in case next branch taken
334	bne- 9f # speculatively issue the following
335	mfspr r2,HASH2 # get the second pointer
336	ori r3,r3,0x0040 # change the compare value
337	lwz r1,0(r2) # load first entry asap
338	b 0b # and go back to main loop
339	/* We are now at 27 instructions, using 3 or 4 cache lines for all
340	cases in which the TLB C bit is already set. */
341
342	#ifdef DIRTY_MEANS_WRITABLE
/* In this configuration a PTE with C==0 is treated as write-protected, so
   no PP check is made: the code falls through to the DSI exit at 9: with
   the protection error code in r1. */
343	5: lis r1,0x0A00 # protection violation on store
344	#else
345	/*
346	Entry found and C==0: check protection before setting C:
347	Register usage:
348	r0 is dMiss register
349	r1 is PTE entry (to be copied to RPA if success)
350	r2 is pointer to pte
351	r3 is trashed
352
353	For the 603e, the key bit in SRR1 helps to decide whether there is a
354	protection violation. However the way the check is done in the manual is
355	not very efficient. The code shown here works as well for 603 and 603e and
356	is much more efficient for the 603 and comparable to the manual example
357	for 603e. This code however has quite a bad structure due to the fact it
358	has been reordered to speed up the most common cases.
359	*/
360	/* The first of the following two instructions could be replaced by
361	andi. r3,r1,3 but it would compete with cmplwi for cr0 resource. */
362	5: clrlwi r3,r1,30 # Extract two low order bits
363	cmplwi r3,2 # Test for PP=10
364	bne- 7f # assume fallthrough is more frequent
365	6: ori r1,r1,0x180 # set referenced and changed bit
366	sth r1,6(r2) # update page table
367	b 3b # and finish loading TLB
368	/* We are now at 33 instructions, using 5 cache lines. */
369	7: bgt- 8f # if PP=11 then DSI protection exception
370	/* This code only works if key bit is present (602/603e/603ev) */
371	#ifdef USE_KEY_BIT
372	mfsrr1 r3 # get the KEY bit and test it
373	andis. r3,r3,0x0008
374	beq 6b # default prediction taken, truly better ?
375	#else
376	/* This code is for all 602 and 603 family models: */
377	mfsrr1 r3 # Here the trick is to use the MSR PR bit as a
378	mfsrin r0,r0 # shift count for an rlwnm. instruction which
379	extrwi r3,r3,1,17 # extracts and tests the correct key bit from
380	rlwnm. r3,r0,r3,1,1 # the segment register. RISC they said...
381	mfspr r0,DMISS # Restore fault address to r0
382	beq 6b # if 0 load tlb else protection fault
383	#endif
384	/* We are now at 40 instructions, (37 if using key bit), using 5 cache
385	lines in all cases in which the C bit is successfully set */
386	8: lis r1,0x0A00 # protection violation on store
387	#endif /* DIRTY_MEANS_WRITABLE */
388	/* PTE entry not found branch here with DSISR code in r1 */
389	9: mfsrr1 r3 # r3 = SRR1: saved cr0 in bits 0:3, saved MSR in bits 16:31
390	mtdsisr r1 # DSISR = error code prepared by whoever branched here
391	clrlwi r2,r3,16 # set up srr1 for DSI exception
392	mfmsr r0 # r0 = current MSR, in which TGPR is still set
393	/* I have some doubts about the usefulness of the xori instruction in
394	mixed or pure little-endian environment. The address is in the same
395	doubleword, hence in the same protection domain and performing an exclusive
396	or with 7 is only valid for byte accesses. */
397	#ifdef CHECK_MIXED_ENDIAN
398	andi. r1,r2,1 # test LE bit ahead to help folding
399	#endif
400	mtsrr1 r2 # SRR1 = low 16 bits of the original SRR1 only
401	rlwinm r0,r0,0,15,13 # clear the msr<tgpr> bit
402	mfspr r1,DMISS # get miss address
403	#ifdef CHECK_MIXED_ENDIAN
404	beq 1f # if little endian then:
405	xori r1,r1,0x07 # de-mung the data address
406	1:
407	#endif
408	mtdar r1 # put in dar
409	mtcrf 0x80,r3 # restore CR0
410	mtmsr r0 # flip back to the native gprs
411	isync # required from 602 manual
412	b DSIVec # branch to DSI exception
413	/* We are now between 50 and 56 instructions. Close to the limit
414	but should be sufficient in case bugs are found. */
415	/* Altogether the three handlers occupy 128 instructions in the worst
416	case, 64 instructions could still be added (non contiguously). */
417	.org tlb_handlers+0x300
418	.globl _handler_glue
419	_handler_glue:
420	/* Entry code for exceptions: DSI (0x300), ISI(0x400), alignment(0x600) and
421	* traps(0x700). In theory it is not necessary to save and restore r13 and all
422	* higher numbered registers, but it is done because it allowed to call the
423	* firmware (PPCBug) for debugging in the very first stages when writing the
424	* bootloader.
425	*/
/* Outline: push a 160 byte frame, save the GPRs and LR/CTR/CR/XER/SRR0/SRR1
 * into it, call the routine whose address is stored at literal 2 below
 * (_handler) through CTR, then restore everything and return with rfi.
 * The save_r(), save_lr, ... operands are not defined in this file;
 * presumably they come from one of the included headers -- confirm.
 */
426	stwu r1,-160(r1) /* allocate the 160 byte frame */
427	stw r0,save_r(0)
428	mflr r0 /* original LR, must be read before the bl below overwrites it */
429	stmw r2,save_r(2) /* store r2 through r31 */
430	bl 0f /* only used to obtain the address of label 0 in LR */
431	0: mfctr r4
432	stw r0,save_lr
433	mflr r9 /* Interrupt vector + few instructions */
434	la r10,160(r1) /* r10 = value r1 had on entry */
435	stw r4,save_ctr
436	mfcr r5
437	lwz r8,2f-0b(r9) /* r8 = word at literal 2, addressed relative to label 0 */
438	mfxer r6
439	stw r5,save_cr
440	mtctr r8 /* CTR = call target for the bctrl below */
441	stw r6,save_xer
442	mfsrr0 r7
443	stw r10,save_r(1) /* record the entry value of r1 as the saved r1 */
444	mfsrr1 r8 /* r8 is free again once CTR has been loaded */
445	stw r7,save_nip
446	la r4,8(r1) /* r4 = r1 + 8; presumably the second argument of the callee -- confirm */
447	lwz r13,1f-0b(r9) /* r13 = word at literal 1 (__bd); NOTE(review): purpose not visible here */
448	rlwinm r3,r9,24,0x3f /* Interrupt vector >> 8 */
449	stw r8,save_msr
450	bctrl /* call through CTR; LR then points at the restore code below */
451
452	lwz r7,save_msr
453	lwz r6,save_nip
454	mtsrr1 r7
455	lwz r5,save_xer
456	mtsrr0 r6
457	lwz r4,save_ctr
458	mtxer r5
459	lwz r3,save_lr
460	mtctr r4
461	lwz r0,save_cr
462	mtlr r3
463	lmw r2,save_r(2) /* reload r2 through r31 */
464	mtcr r0
465	lwz r0,save_r(0)
466	la r1,160(r1) /* release the frame */
467	rfi /* return using the reloaded SRR0/SRR1 */
/* Literals read above with offsets relative to label 0. Their addresses are
 * also listed in the .fixup section; presumably the boot code relocates the
 * entries named there -- confirm.
 */
468	1: .long (__bd)@fixup
469	2: .long (_handler)@fixup
470	.section .fixup,"aw"
471	.align 2
472	.long 1b, 2b
473	.previous

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format