source: rtems/c/src/lib/libbsp/powerpc/shared/bootloader/exception.S @ 7e85bfbe

4.115
Last change on this file since 7e85bfbe was 7e85bfbe, checked in by Sebastian Huber <sebastian.huber@…>, on 08/24/11 at 09:48:56

2011-08-24 Sebastian Huber <sebastian.huber@…>

  • shared/bootloader/exception.S, shared/bootloader/misc.c, shared/bootloader/mm.c, shared/console/polled_io.c, shared/startup/probeMemEnd.c: Update due to API changes.
  • Property mode set to 100644
File size: 18.7 KB
RevLine 
[acc25ee]1/*
2 *  exception.S -- Exception handlers for early boot.
3 *
4 *  Copyright (C) 1998, 1999 Gabriel Paubert, paubert@iram.es
5 *
6 *  Modified to compile in RTEMS development environment
7 *  by Eric Valette
8 *
9 *  Copyright (C) 1999 Eric Valette. valette@crf.canon.fr
10 *
11 *  The license and distribution terms for this file may be
[0c875c6a]12 *  found in the file LICENSE in this distribution or at
[e831de8]13 *  http://www.rtems.com/license/LICENSE.
[acc25ee]14 *
15 * $Id$
16 */
17
18/* This is an improved version of the TLB interrupt handling code from
[6128a4a]19 * the 603e users manual (603eUM.pdf) downloaded from the WWW. All the
20 * visible bugs have been removed. Note that many have survived in the errata
21 * to the 603 user manual (603UMer.pdf).
22 *
[acc25ee]23 *  This code also pays particular attention to optimization, takes into
24 * account the differences between 603 and 603e, single/multiple processor
25 * systems and tries to order instructions for dual dispatch in many places.
[6128a4a]26 *
[acc25ee]27 *  The optimization has been performed along two lines:
28 * 1) to minimize the number of instruction cache lines needed for the most
29 *    common execution paths (the ones that do not result in an exception).
[6128a4a]30 * 2) then to order the code to maximize the number of dual issue and
31 *    completion opportunities without increasing the number of cache lines
[acc25ee]32 *    used in the same cases.
[6128a4a]33 *
[acc25ee]34 *  The last goal of this code is to fit inside the address range
35 * assigned to the interrupt vectors: 192 instructions with fixed
36 * entry points every 64 instructions.
[6128a4a]37 *
[acc25ee]38 *  Some typos have also been corrected and the Power l (lowercase L)
39 * instructions replaced by lwz without comment.
[6128a4a]40 *
[acc25ee]41 *  I have attempted to describe the reasons of the order and of the choice
42 * of the instructions but the comments may be hard to understand without
43 * the processor manual.
[6128a4a]44 *
[acc25ee]45 *  Note that the fact that the TLB are reloaded by software in theory
[6128a4a]46 * allows tremendous flexibility, for example we could avoid setting the
[acc25ee]47 * reference bit of a PTE which could actually not be accessed because
[6128a4a]48 * of protection violation by changing a few lines of code. However,
[acc25ee]49 * this would significantly slow down most TLB reload operations, and
50 * this is the reason for which we try never to make checks which would be
51 * redundant with hardware and usually indicate a bug in a program.
[6128a4a]52 *
[acc25ee]53 *  There are some inconsistencies in the documentation concerning the
[6128a4a]54 * settings of SRR1 bit 15. All recent documentations say now that it is set
[acc25ee]55 * for stores and cleared for loads. Anyway this handler never uses this bit.
[6128a4a]56 *
[acc25ee]57 *  A final remark, the rfi instruction seems to implicitly clear the
58 * MSR<14> (tgpr)bit. The documentation claims that this bit is restored
59 * from SRR1 by rfi, but the corresponding bit in SRR1 is the LRU way bit.
60 * Anyway, the only exception which can occur while TGPR is set is a machine
61 * check which would indicate an unrecoverable problem. Recent documentation
[6128a4a]62 * now says in some place that rfi clears MSR<14>.
63 *
64 *  TLB software load for 602/603/603e/603ev:
65 *    Specific Instructions:
66 *      tlbld - write the dtlb with the pte in rpa reg
67 *      tlbli - write the itlb with the pte in rpa reg
68 *    Specific SPRs:
69 *      dmiss - address of dstream miss
[acc25ee]70 *      imiss - address of istream miss
[6128a4a]71 *      hash1 - returns primary hash PTEG address
72 *      hash2 - returns secondary hash PTEG address
73 *      iCmp - returns the primary istream compare value
74 *      dCmp - returns the primary dstream compare value
[acc25ee]75 *      rpa - the second word of pte used by tlblx
[6128a4a]76 *    Other specific resources:
[acc25ee]77 *      cr0 saved in 4 high order bits of SRR1,
[6128a4a]78 *      SRR1 bit 14 [WAY] selects TLB set to load from LRU algorithm
79 *      gprs r0..r3 shadowed by the setting of MSR bit 14 [TGPR]
[acc25ee]80 *      other bits in SRR1 (unused by this handler but see earlier comments)
[6128a4a]81 *
[acc25ee]82 *    There are three basic flows corresponding to three vectors:
[6128a4a]83 *      0x1000: Instruction TLB miss,
[acc25ee]84 *      0x1100: Data TLB miss on load,
[6128a4a]85 *      0x1200: Data TLB miss on store or not dirty page
[acc25ee]86 */
[6128a4a]87
[acc25ee]88/* define the following if code does not have to run on basic 603 */
89/* #define USE_KEY_BIT */
[6128a4a]90
[acc25ee]91/* define the following for safe multiprocessing */
[6128a4a]92/* #define MULTIPROCESSING */
[acc25ee]93
94/* define the following for mixed endian */
95/* #define CHECK_MIXED_ENDIAN */
96
97/* define the following if entries always have the reference bit set */
98#define ASSUME_REF_SET
99
100/* Some OS kernels may want to keep a single copy of the dirty bit in a per
101 * page table. In this case writable pages are always write-protected as long
102 * as they are clean, and the dirty bit set actually means that the page
[6128a4a]103 * is writable.
[acc25ee]104 */
[6128a4a]105#define DIRTY_MEANS_WRITABLE
106
[b7768c55]107#include <rtems/asm.h>
[cd35cf9]108#include <rtems/score/cpu.h>
[acc25ee]109#include "bootldr.h"
110
[6128a4a]111/*
112 * Instruction TLB miss flow
113 *   Entry at 0x1000 with the following:
114 *     srr0 -> address of instruction that missed
115 *     srr1 -> 0:3=cr0, 13=1 (instruction), 14=lru way, 16:31=saved MSR
116 *     msr<tgpr> -> 1
117 *     iMiss -> ea that missed
118 *     iCmp -> the compare value for the va that missed
[acc25ee]119 *     hash1 -> pointer to first hash pteg
[6128a4a]120 *     hash2 -> pointer to second hash pteg
[acc25ee]121 *
[6128a4a]122 *   Register usage:
123 *     r0 is limit address during search / scratch after
[acc25ee]124 *     r1 is pte data / error code for ISI exception when search fails
[6128a4a]125 *     r2 is pointer to pte
[acc25ee]126 *     r3 is compare value during search / scratch after
127 */
128/* Binutils or assembler bug ? Declaring the section executable and writable
129 * generates an error message on the @fixup entries.
130 */
[6128a4a]131        .section .exception,"aw"
[acc25ee]132#       .org    0x1000        # instruction TLB miss entry point
133        .globl  tlb_handlers
134tlb_handlers:
135        .type   tlb_handlers,@function
136#define ISIVec tlb_handlers-0x1000+0x400
137#define DSIVec tlb_handlers-0x1000+0x300
[6128a4a]138        mfspr   r2,HASH1      # r2 = pointer to first hash PTEG
[acc25ee]139        lwz     r1,0(r2)      # Start memory access as soon as possible
[6128a4a]140        mfspr   r3,ICMP       # to load the cache.
[acc25ee]1410:      la      r0,48(r2)     # Use explicit loop to avoid using ctr
1421:      cmpw    r1,r3         # In theory the loop is somewhat slower
143        beq-    2f            # than documentation example
[6128a4a]144        cmpw    r0,r2         # but we gain from starting cache load
145        lwzu    r1,8(r2)      # earlier and using slots between load
146        bne+    1b            # and comparison for other purposes.
[acc25ee]147        cmpw    r1,r3         # Loop done: test the eighth (last) PTE
148        bne-    4f            # Secondary hash check
[6128a4a]1492:      lwz     r1,4(r2)      # Found:  load second word of PTE
[acc25ee]150        mfspr   r0,IMISS      # get miss address during load delay
151#ifdef ASSUME_REF_SET
152        andi.   r3,r1,8       # check for guarded memory
153        bne-    5f            # guarded: synthesize an ISI exception
[7e85bfbe]154        mtspr   PPC_RPA,r1    # set the pte
[acc25ee]155        mfsrr1  r3            # get the saved cr0 bits
156        tlbli   r0            # load the itlb
157#else
158/* This is basically the original code from the manual. */
159#       andi.   r3,r1,8       # check for guarded memory
160#       bne-    5f
161#       andi.   r3,r1,0x100   # check R bit ahead to help folding
[6128a4a]162/* However there is a better solution: these last three instructions can be
163replaced by the following which should cause less pipeline stalls because
[acc25ee]164both tests are combined and there is a single CR rename buffer */
165        extlwi  r3,r1,6,23    # Keep only RCWIMG in 6 most significant bits.
[6128a4a]166        rlwinm. r3,r3,5,0,27  # Keep only G (in sign) and R and test.
167        blt-    5f            # Negative means guarded, zero R not set.
[acc25ee]168        mfsrr1  r3            # get saved cr0 bits now to dual issue
169        ori     r1,r1,0x100
[7e85bfbe]170        mtspr   PPC_RPA,r1
[acc25ee]171        tlbli   r0
172/* Do not update PTE if R bit already set, this will save one cache line
173writeback at a later time, and avoid even more bus traffic in
174multiprocessing systems, when several processors access the same PTEGs.
175We also hope that the reference bit will be already set. */
176        bne+    3f
[6128a4a]177#ifdef MULTIPROCESSING
[acc25ee]178        srwi    r1,r1,8       # get byte 7 of pte
179        stb     r1,+6(r2)     # update page table
180#else
181        sth     r1,+6(r2)     # update page table
182#endif
183#endif
1843:      mtcrf   0x80,r3       # restore CR0
185        rfi                   # return to executing program
[6128a4a]186
[acc25ee]187/* The preceding code is 20 to 25 instructions long, which occupies
1883 or 4 cache lines. */
1894:      andi.   r0,r3,0x0040  # see if we have done second hash
190        lis     r1,0x4000     # set up error code in case next branch taken
191        bne-    6f            # speculatively issue the following
192        mfspr   r2,HASH2      # get the second pointer
193        ori     r3,r3,0x0040  # change the compare value
194        lwz     r1,0(r2)      # load first entry
195        b       0b            # and go back to main loop
196/* We are now at 27 to 32 instructions, using 3 or 4 cache lines for all
[6128a4a]197cases in which the TLB is successfully loaded. */
[acc25ee]198
[6128a4a]199/* Guarded memory protection violation: synthesize an ISI exception. */
[acc25ee]2005:      lis     r1,0x1000     # set srr1<3>=1 to flag guard violation
201/* Entry Not Found branches here with r1 correctly set. */
2026:      mfsrr1  r3            # r3 = srr1: saved cr0 and saved MSR
203        mfmsr   r0            # r0 = current MSR (tgpr set on entry)
204        insrwi  r1,r3,16,16   # build srr1 for ISI exception
205        mtsrr1  r1            # set srr1
206/* It seems few people have realized rlwinm can be used to clear a bit or
207a field of contiguous bits in a register by setting mask_begin>mask_end. */
208        rlwinm  r0,r0,0,15,13 # clear the msr<tgpr> bit
209        mtcrf   0x80, r3      # restore CR0
210        mtmsr   r0            # flip back to the native gprs
211        isync                 # Required from 602 doc!
[6128a4a]212        b       ISIVec        # go to instruction access exception
213/* Up to now there are 37 to 42 instructions so at least 20 could be
214inserted for complex cases or for statistics recording. */
[acc25ee]215
216
[6128a4a]217/*
218  Data TLB miss on load flow
219    Entry at 0x1100 with the following:
220      srr0 -> address of instruction that caused the miss
221      srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=0, 16:31=saved MSR
222      msr<tgpr> -> 1
223      dMiss -> ea that missed
224      dCmp -> the compare value for the va that missed
[acc25ee]225      hash1 -> pointer to first hash pteg
[6128a4a]226      hash2 -> pointer to second hash pteg
227
228    Register usage:
229      r0 is limit address during search / scratch after
[acc25ee]230      r1 is pte data / error code for DSI exception when search fails
[6128a4a]231      r2 is pointer to pte
[acc25ee]232      r3 is compare value during search / scratch after
233*/
[6128a4a]234        .org    tlb_handlers+0x100
235        mfspr   r2,HASH1      # r2 = pointer to first hash PTEG
[acc25ee]236        lwz     r1,0(r2)      # Start memory access as soon as possible
237        mfspr   r3,DCMP       # to load the cache.
2380:      la      r0,48(r2)     # Use explicit loop to avoid using ctr
2391:      cmpw    r1,r3         # In theory the loop is somewhat slower
240        beq-    2f            # than documentation example
[6128a4a]241        cmpw    r0,r2         # but we gain from starting cache load
242        lwzu    r1,8(r2)      # earlier and using slots between load
243        bne+    1b            # and comparison for other purposes.
[acc25ee]244        cmpw    r1,r3         # Loop done: test the eighth (last) PTE
245        bne-    4f            # Secondary hash check
[6128a4a]2462:      lwz     r1,4(r2)      # Found:  load second word of PTE
[acc25ee]247        mfspr   r0,DMISS      # get miss address during load delay
248#ifdef ASSUME_REF_SET
[7e85bfbe]249        mtspr   PPC_RPA,r1    # set the pte
[acc25ee]250        mfsrr1  r3            # get the saved cr0 bits
251        tlbld   r0            # load the dtlb
252#else
253        andi.   r3,r1,0x100   # check R bit ahead to help folding
254        mfsrr1  r3            # get saved cr0 bits now to dual issue
255        ori     r1,r1,0x100
[7e85bfbe]256        mtspr   PPC_RPA,r1
[acc25ee]257        tlbld   r0
258/* Do not update PTE if R bit already set, this will save one cache line
259writeback at a later time, and avoid even more bus traffic in
260multiprocessing systems, when several processors access the same PTEGs.
261We also hope that the reference bit will be already set. */
262        bne+    3f
[6128a4a]263#ifdef MULTIPROCESSING
[acc25ee]264        srwi    r1,r1,8       # get byte 7 of pte
265        stb     r1,+6(r2)     # update page table
266#else
267        sth     r1,+6(r2)     # update page table
268#endif
269#endif
2703:      mtcrf   0x80,r3       # restore CR0
271        rfi                   # return to executing program
[6128a4a]272
[acc25ee]273/* The preceding code is 18 to 23 instructions long, which occupies
2743 cache lines. */
/* Label 9 used below is the common DSI exit defined in the store-miss
handler at tlb_handlers+0x200; it expects the DSISR code in r1. */
2754:      andi.   r0,r3,0x0040  # see if we have done second hash
276        lis     r1,0x4000     # set up error code in case next branch taken
277        bne-    9f            # speculatively issue the following
278        mfspr   r2,HASH2      # get the second pointer
279        ori     r3,r3,0x0040  # change the compare value
280        lwz     r1,0(r2)      # load first entry asap
281        b       0b            # and go back to main loop
282/* We are now at 25 to 30 instructions, using 3 or 4 cache lines for all
[6128a4a]283cases in which the TLB is successfully loaded. */
[acc25ee]284
285
[6128a4a]286/*
287  Data TLB miss on store or not dirty page flow
288    Entry at 0x1200 with the following:
289      srr0 -> address of instruction that caused the miss
290      srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=1, 16:31=saved MSR
291      msr<tgpr> -> 1
292      dMiss -> ea that missed
293      dCmp -> the compare value for the va that missed
[acc25ee]294      hash1 -> pointer to first hash pteg
[6128a4a]295      hash2 -> pointer to second hash pteg
296
297    Register usage:
298      r0 is limit address during search / scratch after
[acc25ee]299      r1 is pte data / error code for DSI exception when search fails
[6128a4a]300      r2 is pointer to pte
[acc25ee]301      r3 is compare value during search / scratch after
[6128a4a]302*/
[acc25ee]303        .org    tlb_handlers+0x200
[6128a4a]304        mfspr   r2,HASH1      # r2 = pointer to first hash PTEG
[acc25ee]305        lwz     r1,0(r2)      # Start memory access as soon as possible
[6128a4a]306        mfspr   r3,DCMP       # to load the cache.
[acc25ee]3070:      la      r0,48(r2)     # Use explicit loop to avoid using ctr
3081:      cmpw    r1,r3         # In theory the loop is somewhat slower
309        beq-    2f            # than documentation example
[6128a4a]310        cmpw    r0,r2         # but we gain from starting cache load
311        lwzu    r1,8(r2)      # earlier and using slots between load
312        bne+    1b            # and comparison for other purposes.
[acc25ee]313        cmpw    r1,r3         # Loop done: test the eighth (last) PTE
314        bne-    4f            # Secondary hash check
[6128a4a]3152:      lwz     r1,4(r2)      # Found:  load second word of PTE
[acc25ee]316        mfspr   r0,DMISS      # get miss address during load delay
[6128a4a]317/* We could simply set the C bit and then rely on hardware to flag protection
318violations. This raises the problem that a page which actually has not been
319modified may be marked as dirty and violates the OEA model for guaranteed
320bit settings (table 5-8 of 603eUM.pdf). This can have harmful consequences
321on operating system memory management routines, and play havoc with copy on
[acc25ee]322write schemes. So the protection check is ABSOLUTELY necessary. */
323        andi.   r3,r1,0x80    # check C bit
[6128a4a]324        beq-    5f            # if (C==0) go to check protection
3253:      mfsrr1  r3            # get the saved cr0 bits
[7e85bfbe]326        mtspr   PPC_RPA,r1        # set the pte
[6128a4a]327        tlbld   r0            # load the dtlb
328        mtcrf   0x80,r3       # restore CR0
329        rfi                   # return to executing program
[acc25ee]330/* The preceding code is 20 instructions long, which occupy
[6128a4a]3313 cache lines. */
[acc25ee]3324:      andi.   r0,r3,0x0040  # see if we have done second hash
333        lis     r1,0x4200     # set up error code in case next branch taken
334        bne-    9f            # speculatively issue the following
335        mfspr   r2,HASH2      # get the second pointer
336        ori     r3,r3,0x0040  # change the compare value
337        lwz     r1,0(r2)      # load first entry asap
338        b       0b            # and go back to main loop
339/* We are now at 27 instructions, using 3 or 4 cache lines for all
340cases in which the TLB C bit is already set. */
341
342#ifdef DIRTY_MEANS_WRITABLE
/* With DIRTY_MEANS_WRITABLE a clear C bit is reported directly as a
protection violation; execution falls through to label 9 below. */
3435:      lis     r1,0x0A00     # protection violation on store
344#else
[6128a4a]345/*
346  Entry found and C==0: check protection before setting C:
347    Register usage:
[acc25ee]348      r0 is dMiss register
[6128a4a]349      r1 is PTE entry (to be copied to RPA if success)
350      r2 is pointer to pte
351      r3 is trashed
[acc25ee]352
353    For the 603e, the key bit in SRR1 helps to decide whether there is a
354  protection violation. However the way the check is done in the manual is
355  not very efficient. The code shown here works as well for 603 and 603e and
356  is much more efficient for the 603 and comparable to the manual example
[6128a4a]357  for 603e. This code however has quite a bad structure due to the fact it
358  has been reordered to speed up the most common cases.
359*/
[acc25ee]360/* The first of the following two instructions could be replaced by
361andi. r3,r1,3 but it would compete with cmplwi for cr0 resource. */
3625:      clrlwi  r3,r1,30      # Extract two low order bits
363        cmplwi  r3,2          # Test for PP=10
364        bne-    7f            # assume fallthrough is more frequent
3656:      ori     r1,r1,0x180   # set referenced and changed bit
366        sth     r1,6(r2)      # update page table
367        b       3b            # and finish loading TLB
368/* We are now at 33 instructions, using 5 cache lines. */
3697:      bgt-    8f            # if PP=11 then DSI protection exception
370/* This code only works if key bit is present (602/603e/603ev) */
[6128a4a]371#ifdef USE_KEY_BIT
[acc25ee]372        mfsrr1  r3            # get the KEY bit and test it
373        andis.  r3,r3,0x0008
374        beq     6b            # default prediction taken, truly better ?
[6128a4a]375#else
[acc25ee]376/* This code is for all 602 and 603 family models: */
377        mfsrr1  r3            # Here the trick is to use the MSR PR bit as a
378        mfsrin  r0,r0         # shift count for an rlwnm. instruction which
379        extrwi  r3,r3,1,17    # extracts and tests the correct key bit from
380        rlwnm.  r3,r0,r3,1,1  # the segment register. RISC they said...
[6128a4a]381        mfspr   r0,DMISS      # Restore fault address to r0
[acc25ee]382        beq     6b            # if 0 load tlb else protection fault
383#endif
384/* We are now at 40 instructions, (37 if using key bit), using 5 cache
385lines in all cases in which the C bit is successfully set */
3868:      lis     r1,0x0A00     # protection violation on store
387#endif /* DIRTY_MEANS_WRITABLE */
[6128a4a]388/* PTE entry not found branch here with DSISR code in r1 */
[acc25ee]3899:      mfsrr1  r3            # r3 = srr1: saved cr0 and saved MSR
390        mtdsisr r1            # DSISR = error code built in r1
[6128a4a]391        clrlwi  r2,r3,16      # set up srr1 for DSI exception
[acc25ee]392        mfmsr   r0            # r0 = current MSR (tgpr set on entry)
393/* I have some doubts about the usefulness of the xori instruction in
394mixed or pure little-endian environment. The address is in the same
395doubleword, hence in the same protection domain and performing an exclusive
396or with 7 is only valid for byte accesses. */
[6128a4a]397#ifdef CHECK_MIXED_ENDIAN
[acc25ee]398        andi.   r1,r2,1       # test LE bit ahead to help folding
399#endif
400        mtsrr1  r2            # srr1 = low 16 bits of the old srr1
[6128a4a]401        rlwinm  r0,r0,0,15,13 # clear the msr<tgpr> bit
[acc25ee]402        mfspr   r1,DMISS      # get miss address
403#ifdef CHECK_MIXED_ENDIAN
[6128a4a]404        beq     1f            # if little endian then:
405        xori    r1,r1,0x07    # de-mung the data address
[acc25ee]4061:
[6128a4a]407#endif
408        mtdar   r1            # put in dar
409        mtcrf   0x80,r3       # restore CR0
[acc25ee]410        mtmsr   r0            # flip back to the native gprs
[6128a4a]411        isync                 # required from 602 manual
[acc25ee]412        b       DSIVec        # branch to DSI exception
413/* We are now between 50 and 56 instructions. Close to the limit
414but should be sufficient in case bugs are found. */
[6128a4a]415/* Altogether the three handlers occupy 128 instructions in the worst
[acc25ee]416case, 64 instructions could still be added (non contiguously). */
417        .org    tlb_handlers+0x300
418        .globl  _handler_glue
419_handler_glue:
420/* Entry code for exceptions: DSI (0x300), ISI(0x400), alignment(0x600) and
421 * traps(0x700). In theory it is not necessary to save and restore r13 and all
[6128a4a]422 * higher numbered registers, but it is done because it made it possible to
423 * call the firmware (PPCBug) for debugging in the very first stages when
[acc25ee]424 * writing the bootloader.
425 *
 * Flow: push a 160-byte frame, save the register state through the save_*
 * macros, call the routine whose address is stored at label 2 with the
 * vector number in r3, then reload the saved state and return with rfi.
 * NOTE(review): the save_* macros are not defined in this file (presumably
 * bootldr.h); their frame offsets should be confirmed there.
 */
426        stwu    r1,-160(r1)             /* Push a 160-byte frame */
427        stw     r0,save_r(0)
428        mflr    r0                      /* r0 = LR at entry, stored to save_lr below */
429        stmw    r2,save_r(2)            /* Save r2 and all higher numbered registers */
430        bl      0f                      /* LR = address of label 0 */
4310:      mfctr   r4
432        stw     r0,save_lr
433        mflr    r9              /* Interrupt vector + few instructions */
434        la      r10,160(r1)             /* r10 = r1 as it was before the frame was pushed */
435        stw     r4,save_ctr
436        mfcr    r5
437        lwz     r8,2f-0b(r9)            /* r8 = word stored at label 2 (_handler) */
438        mfxer   r6
439        stw     r5,save_cr
440        mtctr   r8                      /* CTR = target of the bctrl below */
441        stw     r6,save_xer
442        mfsrr0  r7
443        stw     r10,save_r(1)
444        mfsrr1  r8
445        stw     r7,save_nip
446        la      r4,8(r1)                /* r4 = r1 + 8; presumably the saved-state area passed to the handler (TODO confirm) */
447        lwz     r13,1f-0b(r9)           /* r13 = word stored at label 1 (__bd) */
448        rlwinm  r3,r9,24,0x3f   /* Interrupt vector >> 8 */
449        stw     r8,save_msr
450        bctrl                           /* Call through CTR */
451
452        lwz     r7,save_msr             /* Back from the call: reload the saved state */
453        lwz     r6,save_nip
454        mtsrr1  r7
455        lwz     r5,save_xer
456        mtsrr0  r6
457        lwz     r4,save_ctr
458        mtxer   r5
459        lwz     r3,save_lr
460        mtctr   r4
461        lwz     r0,save_cr
462        mtlr    r3
463        lmw     r2,save_r(2)
464        mtcr    r0
465        lwz     r0,save_r(0)
466        la      r1,160(r1)              /* Pop the frame */
467        rfi                             /* Return from the exception */
4681:      .long   (__bd)@fixup
4692:      .long   (_handler)@fixup
470        .section .fixup,"aw"
471        .align  2
472        .long 1b, 2b                    /* Addresses of the two literals above; presumably patched at relocation time (TODO confirm) */
473        .previous
Note: See TracBrowser for help on using the repository browser.