source: rtems/c/src/lib/libbsp/powerpc/shared/bootloader/exception.S @ c499856

4.115
Last change on this file since c499856 was c499856, checked in by Chris Johns <chrisj@…>, on 03/20/14 at 21:10:47

Change all references of rtems.com to rtems.org.

  • Property mode set to 100644
File size: 18.7 KB
RevLine 
[acc25ee]1/*
2 *  exception.S -- Exception handlers for early boot.
3 *
4 *  Copyright (C) 1998, 1999 Gabriel Paubert, paubert@iram.es
5 *
6 *  Modified to compile in RTEMS development environment
7 *  by Eric Valette
8 *
9 *  Copyright (C) 1999 Eric Valette. valette@crf.canon.fr
10 *
11 *  The license and distribution terms for this file may be
[0c875c6a]12 *  found in the file LICENSE in this distribution or at
[c499856]13 *  http://www.rtems.org/license/LICENSE.
[acc25ee]14 */
15
16/* This is an improved version of the TLB interrupt handling code from
[6128a4a]17 * the 603e users manual (603eUM.pdf) downloaded from the WWW. All the
18 * visible bugs have been removed. Note that many have survived in the errata
19 * to the 603 user manual (603UMer.pdf).
20 *
[acc25ee]21 *  This code also pays particular attention to optimization, takes into
22 * account the differences between 603 and 603e, single/multiple processor
23 * systems and tries to order instructions for dual dispatch in many places.
[6128a4a]24 *
[acc25ee]25 *  The optimization has been performed along two lines:
26 * 1) to minimize the number of instruction cache lines needed for the most
27 *    common execution paths (the ones that do not result in an exception).
[6128a4a]28 * 2) then to order the code to maximize the number of dual issue and
29 *    completion opportunities without increasing the number of cache lines
[acc25ee]30 *    used in the same cases.
[6128a4a]31 *
[acc25ee]32 *  The last goal of this code is to fit inside the address range
33 * assigned to the interrupt vectors: 192 instructions with fixed
34 * entry points every 64 instructions.
[6128a4a]35 *
[acc25ee]36 *  Some typos have also been corrected and the Power l (lowercase L)
37 * instructions replaced by lwz without comment.
[6128a4a]38 *
[acc25ee]39 *  I have attempted to describe the reasons of the order and of the choice
40 * of the instructions but the comments may be hard to understand without
41 * the processor manual.
[6128a4a]42 *
[acc25ee]43 *  Note that the fact that the TLB are reloaded by software in theory
[6128a4a]44 * allows tremendous flexibility, for example we could avoid setting the
[acc25ee]45 * reference bit of the PTE which could actually not be accessed because
[6128a4a]46 * of protection violation by changing a few lines of code. However,
[acc25ee]47 * this would significantly slow down most TLB reload operations, and
48 * this is the reason for which we try never to make checks which would be
49 * redundant with hardware and usually indicate a bug in a program.
[6128a4a]50 *
[acc25ee]51 *  There are some inconsistencies in the documentation concerning the
[6128a4a]52 * settings of SRR1 bit 15. All recent documentations say now that it is set
[acc25ee]53 * for stores and cleared for loads. Anyway this handler never uses this bit.
[6128a4a]54 *
[acc25ee]55 *  A final remark, the rfi instruction seems to implicitly clear the
56 * MSR<14> (tgpr)bit. The documentation claims that this bit is restored
57 * from SRR1 by rfi, but the corresponding bit in SRR1 is the LRU way bit.
58 * Anyway, the only exception which can occur while TGPR is set is a machine
59 * check which would indicate an unrecoverable problem. Recent documentation
[6128a4a]60 * now says in some places that rfi clears MSR<14>.
61 *
62 *  TLB software load for 602/603/603e/603ev:
63 *    Specific Instructions:
64 *      tlbld - write the dtlb with the pte in rpa reg
65 *      tlbli - write the itlb with the pte in rpa reg
66 *    Specific SPRs:
67 *      dmiss - address of dstream miss
[acc25ee]68 *      imiss - address of istream miss
[6128a4a]69 *      hash1 - returns primary hash PTEG address
70 *      hash2 - returns secondary hash PTEG address
71 *      iCmp - returns the primary istream compare value
72 *      dCmp - returns the primary dstream compare value
[acc25ee]73 *      rpa - the second word of pte used by tlblx
[6128a4a]74 *    Other specific resources:
[acc25ee]75 *      cr0 saved in 4 high order bits of SRR1,
[6128a4a]76 *      SRR1 bit 14 [WAY] selects TLB set to load from LRU algorithm
77 *      gprs r0..r3 shadowed by the setting of MSR bit 14 [TGPR]
[acc25ee]78 *      other bits in SRR1 (unused by this handler but see earlier comments)
[6128a4a]79 *
[acc25ee]80 *    There are three basic flows corresponding to three vectors:
[6128a4a]81 *      0x1000: Instruction TLB miss,
[acc25ee]82 *      0x1100: Data TLB miss on load,
[6128a4a]83 *      0x1200: Data TLB miss on store or not dirty page
[acc25ee]84 */
[6128a4a]85
[acc25ee]86/* define the following if code does not have to run on basic 603 */
87/* #define USE_KEY_BIT */
[6128a4a]88
[acc25ee]89/* define the following for safe multiprocessing */
[6128a4a]90/* #define MULTIPROCESSING */
[acc25ee]91
92/* define the following for mixed endian */
93/* #define CHECK_MIXED_ENDIAN */
94
95/* define the following if entries always have the reference bit set */
96#define ASSUME_REF_SET
97
98/* Some OS kernels may want to keep a single copy of the dirty bit in a per
99 * page table. In this case writable pages are always write-protected as long
100 * as they are clean, and the dirty bit set actually means that the page
[6128a4a]101 * is writable.
[acc25ee]102 */
[6128a4a]103#define DIRTY_MEANS_WRITABLE
104
[b7768c55]105#include <rtems/asm.h>
[cd35cf9]106#include <rtems/score/cpu.h>
[acc25ee]107#include "bootldr.h"
108
[6128a4a]109/*
110 * Instruction TLB miss flow
111 *   Entry at 0x1000 with the following:
112 *     srr0 -> address of instruction that missed
113 *     srr1 -> 0:3=cr0, 13=1 (instruction), 14=lru way, 16:31=saved MSR
114 *     msr<tgpr> -> 1
115 *     iMiss -> ea that missed
116 *     iCmp -> the compare value for the va that missed
[acc25ee]117 *     hash1 -> pointer to first hash pteg
[6128a4a]118 *     hash2 -> pointer to second hash pteg
[acc25ee]119 *
[6128a4a]120 *   Register usage:
121 *     r0 is limit address during search / scratch after
[acc25ee]122 *     r1 is pte data / error code for ISI exception when search fails
[6128a4a]123 *     r2 is pointer to pte
[acc25ee]124 *     r3 is compare value during search / scratch after
125 */
126/* Binutils or assembler bug ? Declaring the section executable and writable
127 * generates an error message on the @fixup entries.
128 */
/* Instruction TLB miss handler. It searches the PTEG addressed by HASH1 and
 * then the one addressed by HASH2 for an entry whose first word equals ICMP.
 * On a hit the second PTE word is written to RPA and loaded with tlbli;
 * otherwise (or when the guarded bit is set) SRR1 is rebuilt and control is
 * passed to ISIVec.
 */
[6128a4a]129        .section .exception,"aw"
[acc25ee]130#       .org    0x1000        # instruction TLB miss entry point
131        .globl  tlb_handlers
132tlb_handlers:
133        .type   tlb_handlers,@function
/* ISIVec and DSIVec are expressed relative to tlb_handlers: they are the
 * 0x400 and 0x300 offsets from the base that lies 0x1000 below this label.
 */
134#define ISIVec tlb_handlers-0x1000+0x400
135#define DSIVec tlb_handlers-0x1000+0x300
[6128a4a]136        mfspr   r2,HASH1
[acc25ee]137        lwz     r1,0(r2)      # Start memory access as soon as possible
[6128a4a]138        mfspr   r3,ICMP       # to load the cache.
/* r0 = r2+48 is the loop limit. The loop is left once r2 has reached that
 * value; the lwzu executed on that pass fetches the word at offset 56, which
 * is compared after the loop, so eight 8-byte entries are examined in total.
 */
[acc25ee]1390:      la      r0,48(r2)     # Use explicit loop to avoid using ctr
1401:      cmpw    r1,r3         # In theory the loop is somewhat slower
141        beq-    2f            # than documentation example
[6128a4a]142        cmpw    r0,r2         # but we gain from starting cache load
143        lwzu    r1,8(r2)      # earlier and using slots between load
144        bne+    1b            # and comparison for other purposes.
[acc25ee]145        cmpw    r1,r3
146        bne-    4f            # Secondary hash check
[6128a4a]1472:      lwz     r1,4(r2)      # Found:  load second word of PTE
[acc25ee]148        mfspr   r0,IMISS      # get miss address during load delay
149#ifdef ASSUME_REF_SET
150        andi.   r3,r1,8       # check for guarded memory
151        bne-    5f
[7e85bfbe]152        mtspr   PPC_RPA,r1
[acc25ee]153        mfsrr1  r3
154        tlbli   r0
155#else
156/* This is basically the original code from the manual. */
157#       andi.   r3,r1,8       # check for guarded memory
158#       bne-    5f
159#       andi.   r3,r1,0x100   # check R bit ahead to help folding
[6128a4a]160/* However there is a better solution: these last three instructions can be
161replaced by the following which should cause less pipeline stalls because
[acc25ee]162both tests are combined and there is a single CR rename buffer */
163        extlwi  r3,r1,6,23    # Keep only RCWIMG in 6 most significant bits.
[6128a4a]164        rlwinm. r3,r3,5,0,27  # Keep only G (in sign) and R and test.
165        blt-    5f            # Negative means guarded, zero R not set.
[acc25ee]166        mfsrr1  r3            # get saved cr0 bits now to dual issue
167        ori     r1,r1,0x100
[7e85bfbe]168        mtspr   PPC_RPA,r1
[acc25ee]169        tlbli   r0
170/* Do not update PTE if R bit already set, this will save one cache line
171writeback at a later time, and avoid even more bus traffic in
172multiprocessing systems, when several processors access the same PTEGs.
173We also hope that the reference bit will be already set. */
174        bne+    3f
[6128a4a]175#ifdef MULTIPROCESSING
[acc25ee]176        srwi    r1,r1,8       # get byte 7 of pte
177        stb     r1,+6(r2)     # update page table
178#else
179        sth     r1,+6(r2)     # update page table
180#endif
181#endif
1823:      mtcrf   0x80,r3       # restore CR0
183        rfi                   # return to executing program
[6128a4a]184
[acc25ee]185/* The preceding code is 20 to 25 instructions long, which occupies
1863 or 4 cache lines. */
/* No match in the current PTEG. Bit 0x0040 of the compare value in r3 is set
 * below when the secondary PTEG is selected, so finding it already set here
 * means both groups have been searched and the lookup has failed.
 */
1874:      andi.   r0,r3,0x0040  # see if we have done second hash
188        lis     r1,0x4000     # set up error code in case next branch taken
189        bne-    6f            # speculatively issue the following
190        mfspr   r2,HASH2      # get the second pointer
191        ori     r3,r3,0x0040  # change the compare value
192        lwz     r1,0(r2)      # load first entry
193        b       0b            # and go back to main loop
194/* We are now at 27 to 32 instructions, using 3 or 4 cache lines for all
[6128a4a]195cases in which the TLB is successfully loaded. */
[acc25ee]196
[6128a4a]197/* Guarded memory protection violation: synthesize an ISI exception. */
[acc25ee]1985:      lis     r1,0x1000     # set srr1<3>=1 to flag guard violation
199/* Entry Not Found branches here with r1 correctly set. */
2006:      mfsrr1  r3
201        mfmsr   r0
/* insrwi copies the low 16 bits of the saved SRR1 (r3) into the low half of
 * r1; the upper half of r1 keeps the error code set up by the lis above.
 */
202        insrwi  r1,r3,16,16   # build srr1 for ISI exception
203        mtsrr1  r1            # set srr1
204/* It seems few people have realized rlwinm can be used to clear a bit or
205a field of contiguous bits in a register by setting mask_begin>mask_end. */
206        rlwinm  r0,r0,0,15,13 # clear the msr<tgpr> bit
207        mtcrf   0x80, r3      # restore CR0
208        mtmsr   r0            # flip back to the native gprs
209        isync                 # Required from 602 doc!
[6128a4a]210        b       ISIVec        # go to instruction access exception
211/* Up to now there are 37 to 42 instructions so at least 20 could be
212inserted for complex cases or for statistics recording. */
[acc25ee]213
214
[6128a4a]215/*
216  Data TLB miss on load flow
217    Entry at 0x1100 with the following:
218      srr0 -> address of instruction that caused the miss
219      srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=0, 16:31=saved MSR
220      msr<tgpr> -> 1
221      dMiss -> ea that missed
222      dCmp -> the compare value for the va that missed
[acc25ee]223      hash1 -> pointer to first hash pteg
[6128a4a]224      hash2 -> pointer to second hash pteg
225
226    Register usage:
227      r0 is limit address during search / scratch after
[acc25ee]228      r1 is pte data / error code for DSI exception when search fails
[6128a4a]229      r2 is pointer to pte
[acc25ee]230      r3 is compare value during search / scratch after
231*/
/* Data TLB miss on load handler, placed 0x100 bytes after tlb_handlers.
 * It searches the HASH1 and then the HASH2 PTEG for an entry whose first
 * word equals DCMP. On a hit the second PTE word is written to RPA and
 * loaded with tlbld. If neither PTEG matches, it branches forward to label
 * 9 (located after the store-miss handler) with r1 = 0x40000000.
 */
[6128a4a]232        .org    tlb_handlers+0x100
233        mfspr   r2,HASH1
[acc25ee]234        lwz     r1,0(r2)      # Start memory access as soon as possible
235        mfspr   r3,DCMP       # to load the cache.
/* r0 = r2+48 is the loop limit; together with the compare that follows the
 * loop, eight 8-byte entries are examined per PTEG.
 */
2360:      la      r0,48(r2)     # Use explicit loop to avoid using ctr
2371:      cmpw    r1,r3         # In theory the loop is somewhat slower
238        beq-    2f            # than documentation example
[6128a4a]239        cmpw    r0,r2         # but we gain from starting cache load
240        lwzu    r1,8(r2)      # earlier and using slots between load
241        bne+    1b            # and comparison for other purposes.
[acc25ee]242        cmpw    r1,r3
243        bne-    4f            # Secondary hash check
[6128a4a]2442:      lwz     r1,4(r2)      # Found:  load second word of PTE
[acc25ee]245        mfspr   r0,DMISS      # get miss address during load delay
246#ifdef ASSUME_REF_SET
[7e85bfbe]247        mtspr   PPC_RPA,r1
[acc25ee]248        mfsrr1  r3
249        tlbld   r0
250#else
251        andi.   r3,r1,0x100   # check R bit ahead to help folding
252        mfsrr1  r3            # get saved cr0 bits now to dual issue
253        ori     r1,r1,0x100
[7e85bfbe]254        mtspr   PPC_RPA,r1
[acc25ee]255        tlbld   r0
256/* Do not update PTE if R bit already set, this will save one cache line
257writeback at a later time, and avoid even more bus traffic in
258multiprocessing systems, when several processors access the same PTEGs.
259We also hope that the reference bit will be already set. */
260        bne+    3f
[6128a4a]261#ifdef MULTIPROCESSING
[acc25ee]262        srwi    r1,r1,8       # get byte 7 of pte
263        stb     r1,+6(r2)     # update page table
264#else
265        sth     r1,+6(r2)     # update page table
266#endif
267#endif
2683:      mtcrf   0x80,r3       # restore CR0
269        rfi                   # return to executing program
[6128a4a]270
[acc25ee]271/* The preceding code is 18 to 23 instructions long, which occupies
2723 cache lines. */
/* No match in the current PTEG. Bit 0x0040 of the compare value in r3 is set
 * below when the secondary PTEG is selected; if it is already set, both
 * groups have been searched and the branch to 9f is taken.
 */
2734:      andi.   r0,r3,0x0040  # see if we have done second hash
274        lis     r1,0x4000     # set up error code in case next branch taken
275        bne-    9f            # speculatively issue the following
276        mfspr   r2,HASH2      # get the second pointer
277        ori     r3,r3,0x0040  # change the compare value
278        lwz     r1,0(r2)      # load first entry asap
279        b       0b            # and go back to main loop
280/* We are now at 25 to 30 instructions, using 3 or 4 cache lines for all
[6128a4a]281cases in which the TLB is successfully loaded. */
[acc25ee]282
283
[6128a4a]284/*
285  Data TLB miss on store or not dirty page flow
286    Entry at 0x1200 with the following:
287      srr0 -> address of instruction that caused the miss
288      srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=1, 16:31=saved MSR
289      msr<tgpr> -> 1
290      dMiss -> ea that missed
291      dCmp -> the compare value for the va that missed
[acc25ee]292      hash1 -> pointer to first hash pteg
[6128a4a]293      hash2 -> pointer to second hash pteg
294
295    Register usage:
296      r0 is limit address during search / scratch after
[acc25ee]297      r1 is pte data / error code for DSI exception when search fails
[6128a4a]298      r2 is pointer to pte
[acc25ee]299      r3 is compare value during search / scratch after
[6128a4a]300*/
/* Data TLB miss on store (or C bit not set) handler, placed 0x200 bytes
 * after tlb_handlers. It searches the HASH1 and then the HASH2 PTEG for an
 * entry whose first word equals DCMP. A matching PTE with the C bit (0x80)
 * set is loaded with tlbld. A matching PTE with C clear goes to label 5, and
 * a failed search goes to label 9, the DSI exit at the end of this block.
 */
[acc25ee]301        .org    tlb_handlers+0x200
[6128a4a]302        mfspr   r2,HASH1
[acc25ee]303        lwz     r1,0(r2)      # Start memory access as soon as possible
[6128a4a]304        mfspr   r3,DCMP       # to load the cache.
/* r0 = r2+48 is the loop limit; together with the compare that follows the
 * loop, eight 8-byte entries are examined per PTEG.
 */
[acc25ee]3050:      la      r0,48(r2)     # Use explicit loop to avoid using ctr
3061:      cmpw    r1,r3         # In theory the loop is somewhat slower
307        beq-    2f            # than documentation example
[6128a4a]308        cmpw    r0,r2         # but we gain from starting cache load
309        lwzu    r1,8(r2)      # earlier and using slots between load
310        bne+    1b            # and comparison for other purposes.
[acc25ee]311        cmpw    r1,r3
312        bne-    4f            # Secondary hash check
[6128a4a]3132:      lwz     r1,4(r2)      # Found:  load second word of PTE
[acc25ee]314        mfspr   r0,DMISS      # get miss address during load delay
[6128a4a]315/* We could simply set the C bit and then rely on hardware to flag protection
316violations. This raises the problem that a page which actually has not been
317modified may be marked as dirty and violates the OEA model for guaranteed
318bit settings (table 5-8 of 603eUM.pdf). This can have harmful consequences
319on operating system memory management routines, and play havoc with copy on
[acc25ee]320write schemes. So the protection check is ABSOLUTELY necessary. */
321        andi.   r3,r1,0x80    # check C bit
[6128a4a]322        beq-    5f            # if (C==0) go to check protection
/* Label 3 is reached by fallthrough when C is already set, and by the
 * "b 3b" after the PTE update when DIRTY_MEANS_WRITABLE is not defined.
 */
3233:      mfsrr1  r3            # get the saved cr0 bits
[7e85bfbe]324        mtspr   PPC_RPA,r1        # set the pte
[6128a4a]325        tlbld   r0            # load the dtlb
326        mtcrf   0x80,r3       # restore CR0
327        rfi                   # return to executing program
[acc25ee]328/* The preceding code is 20 instructions long, which occupy
[6128a4a]3293 cache lines. */
/* No match in the current PTEG. Bit 0x0040 of the compare value in r3 is set
 * below when the secondary PTEG is selected; if it is already set, both
 * groups have been searched and the branch to 9f is taken.
 */
[acc25ee]3304:      andi.   r0,r3,0x0040  # see if we have done second hash
331        lis     r1,0x4200     # set up error code in case next branch taken
332        bne-    9f            # speculatively issue the following
333        mfspr   r2,HASH2      # get the second pointer
334        ori     r3,r3,0x0040  # change the compare value
335        lwz     r1,0(r2)      # load first entry asap
336        b       0b            # and go back to main loop
337/* We are now at 27 instructions, using 3 or 4 cache lines for all
338cases in which the TLB C bit is already set. */
339
/* With DIRTY_MEANS_WRITABLE a PTE whose C bit is clear is reported as a
 * protection violation without further checks; execution then falls through
 * the skipped #else part to the DSI exit at label 9.
 */
340#ifdef DIRTY_MEANS_WRITABLE
3415:      lis     r1,0x0A00     # protection violation on store
342#else
[6128a4a]343/*
344  Entry found and C==0: check protection before setting C:
345    Register usage:
[acc25ee]346      r0 is dMiss register
[6128a4a]347      r1 is PTE entry (to be copied to RPA if success)
348      r2 is pointer to pte
349      r3 is trashed
[acc25ee]350
351    For the 603e, the key bit in SRR1 helps to decide whether there is a
352  protection violation. However the way the check is done in the manual is
353  not very efficient. The code shown here works as well for 603 and 603e and
354  is much more efficient for the 603 and comparable to the manual example
[6128a4a]355  for 603e. This code however has quite a bad structure due to the fact it
356  has been reordered to speed up the most common cases.
357*/
[acc25ee]358/* The first of the following two instructions could be replaced by
359andi. r3,r1,3 but it would compete with cmplwi for cr0 resource. */
3605:      clrlwi  r3,r1,30      # Extract two low order bits
361        cmplwi  r3,2          # Test for PP=10
362        bne-    7f            # assume fallthrough is more frequent
3636:      ori     r1,r1,0x180   # set referenced and changed bit
364        sth     r1,6(r2)      # update page table
365        b       3b            # and finish loading TLB
366/* We are now at 33 instructions, using 5 cache lines. */
3677:      bgt-    8f            # if PP=11 then DSI protection exception
368/* This code only works if key bit is present (602/603e/603ev) */
[6128a4a]369#ifdef USE_KEY_BIT
[acc25ee]370        mfsrr1  r3            # get the KEY bit and test it
371        andis.  r3,r3,0x0008
372        beq     6b            # default prediction taken, truly better ?
[6128a4a]373#else
[acc25ee]374/* This code is for all 602 and 603 family models: */
375        mfsrr1  r3            # Here the trick is to use the MSR PR bit as a
376        mfsrin  r0,r0         # shift count for an rlwnm. instruction which
377        extrwi  r3,r3,1,17    # extracts and tests the correct key bit from
378        rlwnm.  r3,r0,r3,1,1  # the segment register. RISC they said...
[6128a4a]379        mfspr   r0,DMISS      # Restore fault address to r0
[acc25ee]380        beq     6b            # if 0 load tlb else protection fault
381#endif
382/* We are now at 40 instructions, (37 if using key bit), using 5 cache
383lines in all cases in which the C bit is successfully set */
3848:      lis     r1,0x0A00     # protection violation on store
385#endif /* DIRTY_MEANS_WRITABLE */
[6128a4a]386/* PTE entry not found branch here with DSISR code in r1 */
/* Label 9 is the common DSI exit. It is also the target of the "bne- 9f" in
 * the load-miss handler. It writes r1 to DSISR and DMISS to DAR, puts the
 * low 16 bits of the saved SRR1 back into SRR1, clears MSR<tgpr> and
 * branches to DSIVec.
 */
[acc25ee]3879:      mfsrr1  r3
388        mtdsisr r1
[6128a4a]389        clrlwi  r2,r3,16      # set up srr1 for DSI exception
[acc25ee]390        mfmsr   r0
391/* I have some doubts about the usefulness of the xori instruction in
392mixed or pure little-endian environment. The address is in the same
393doubleword, hence in the same protection domain and performing an exclusive
394or with 7 is only valid for byte accesses. */
[6128a4a]395#ifdef CHECK_MIXED_ENDIAN
[acc25ee]396        andi.   r1,r2,1       # test LE bit ahead to help folding
397#endif
398        mtsrr1  r2
[6128a4a]399        rlwinm  r0,r0,0,15,13 # clear the msr<tgpr> bit
[acc25ee]400        mfspr   r1,DMISS      # get miss address
401#ifdef CHECK_MIXED_ENDIAN
[6128a4a]402        beq     1f            # if little endian then:
403        xori    r1,r1,0x07    # de-mung the data address
[acc25ee]4041:
[6128a4a]405#endif
406        mtdar   r1            # put in dar
407        mtcrf   0x80,r3       # restore CR0
[acc25ee]408        mtmsr   r0            # flip back to the native gprs
[6128a4a]409        isync                 # required from 602 manual
[acc25ee]410        b       DSIVec        # branch to DSI exception
411/* We are now between 50 and 56 instructions. Close to the limit
412but should be sufficient in case bugs are found. */
[6128a4a]413/* Altogether the three handlers occupy 128 instructions in the worst
[acc25ee]414case, 64 instructions could still be added (non contiguously). */
/* _handler_glue: common exception entry/exit stub, placed 0x300 bytes after
 * tlb_handlers. It pushes a 160-byte stack frame, stores the GPRs, LR, CTR,
 * CR, XER, SRR0 and SRR1 through the save_* operands (presumably frame
 * offsets defined in bootldr.h -- TODO confirm), calls the routine whose
 * address is stored at label 2, then reloads the saved state and returns
 * with rfi.
 */
415        .org    tlb_handlers+0x300
416        .globl  _handler_glue
417_handler_glue:
418/* Entry code for exceptions: DSI (0x300), ISI(0x400), alignment(0x600) and
419 * traps(0x700). In theory it is not necessary to save and restore r13 and all
[6128a4a]420 * higher numbered registers, but it is done because it allowed to call the
421 * firmware (PPCBug) for debugging in the very first stages when writing the
[acc25ee]422 * bootloader.
423 */
424        stwu    r1,-160(r1)
425        stw     r0,save_r(0)
426        mflr    r0
427        stmw    r2,save_r(2)
/* The bl to the next instruction only serves to get the address of label 0
 * into LR. The interrupted LR was copied to r0 just above and is stored to
 * save_lr below.
 */
428        bl      0f
4290:      mfctr   r4
430        stw     r0,save_lr
431        mflr    r9              /* Interrupt vector + few instructions */
/* r10 = r1+160 is the value r1 had before the stwu; it is stored as the
 * saved r1.
 */
432        la      r10,160(r1)
433        stw     r4,save_ctr
434        mfcr    r5
/* The words at labels 2 and 1 are read relative to r9 (the address of
 * label 0): the first goes to CTR as the call target, the second to r13.
 */
435        lwz     r8,2f-0b(r9)
436        mfxer   r6
437        stw     r5,save_cr
438        mtctr   r8
439        stw     r6,save_xer
440        mfsrr0  r7
441        stw     r10,save_r(1)
442        mfsrr1  r8
443        stw     r7,save_nip
/* Values in r3/r4 at the call: r3 = (r9 >> 8) & 0x3f, r4 = r1+8. */
444        la      r4,8(r1)
445        lwz     r13,1f-0b(r9)
446        rlwinm  r3,r9,24,0x3f   /* Interrupt vector >> 8 */
447        stw     r8,save_msr
448        bctrl
449
/* Back from the called routine: reload SRR1/SRR0, XER, CTR, LR, CR and the
 * GPRs from the frame, pop the frame and return from the exception.
 */
450        lwz     r7,save_msr
451        lwz     r6,save_nip
452        mtsrr1  r7
453        lwz     r5,save_xer
454        mtsrr0  r6
455        lwz     r4,save_ctr
456        mtxer   r5
457        lwz     r3,save_lr
458        mtctr   r4
459        lwz     r0,save_cr
460        mtlr    r3
461        lmw     r2,save_r(2)
462        mtcr    r0
463        lwz     r0,save_r(0)
464        la      r1,160(r1)
465        rfi
/* Literal words read by the lwz instructions above. Their addresses are
 * also listed in the .fixup section, presumably so they can be adjusted
 * when the code is relocated -- TODO confirm.
 */
4661:      .long   (__bd)@fixup
4672:      .long   (_handler)@fixup
468        .section .fixup,"aw"
469        .align  2
470        .long 1b, 2b
471        .previous
Note: See TracBrowser for help on using the repository browser.