source: rtems/cpukit/libblock/src/bdbuf.c @ 8e92db9

4.104.114.95
Last change on this file since 8e92db9 was e03c37a, checked in by Ralf Corsepius <ralf.corsepius@…>, on 08/25/08 at 15:07:58

s/rtems_boolean/bool/g.

  • Property mode set to 100644
File size: 72.4 KB
Line 
1/*
2 * Disk I/O buffering
 * Buffer management
4 *
5 * Copyright (C) 2001 OKTET Ltd., St.-Peterburg, Russia
6 * Author: Andrey G. Ivanov <Andrey.Ivanov@oktet.ru>
7 *         Victor V. Vengerov <vvv@oktet.ru>
8 *         Alexander Kukuta <kam@oktet.ru>
9 *
10 * Copyright (C) 2008 Chris Johns <chrisj@rtems.org>
11 *    Rewritten to remove score mutex access. Fixes many performance
12 *    issues.
13 *
14 * @(#) bdbuf.c,v 1.14 2004/04/17 08:15:17 ralf Exp
15 */
16
17/**
18 * @file
19 *
20 * The Buffer Descriptor Buffer code implements a cache between the disk
21 * devices and file systems. The code provides read ahead and write queuing to
22 * the drivers and fast cache look up using an AVL tree.
23 *
24 * The buffers are held in pools based on size. Each pool has buffers and the
25 * buffers follow this state machine:
26 *                                 
27 *                   read/read ahead
28 *          +-------------------------------+
29 *          |                               v
30 *     +-----------+ read ahead      +------------+
31 *     | READY,    |  complete       |            |---------+
32 *     |  READ     |<----------------|  TRANSFER  |         |
33 *     |   AHEAD   |   +-------------|            |<--+     |
34 *     +-----------+   | read/write  +------------+   |     |
35 *              | get  v complete                swap |     |
36 *              |    +-----------+ modified  +------------+ |
37 *              +--->| ACCESSED, |---------->|  MODIFIED, | |
38 *                   | ACCESSED  |<----------|  SYNC      | |
39 *              +----|  MODIFIED |<--+   get |            | |
40 *              |    +-----------+   |       +------------+ |
41 *              | release        get |                      |
42 *              |    +-----------+   |                      |
43 *              +--->|           |---+        read complete |
44 *                   |   CACHED  |           write complete |
45 *                   |           |<-------------------------+
46 *                   +-----------+
47 *         
 * Empty buffers are added to the ready list and removed from this queue when a
 * caller requests a buffer. This is referred to as getting a buffer in the
 * code and the event get in the state diagram. The buffer is assigned to a
 * block and inserted into the AVL tree based on the block/device key. If the
 * block is to be read by the user and is not in the cache (ready) it is
 * transferred from the disk into memory. If no ready buffers exist the buffer
 * is taken from the LRU list. If no buffers are on the LRU list the modified
 * list is checked. If no buffers are on the modified list the request blocks.
 * If buffers are on the modified list the buffers' hold timer is expired and
 * the swap out task is woken.
58 *
 * A block being accessed is given to the file system layer and is not
 * accessible to another requester until released back to the cache. The same
 * goes for a buffer in the transfer state. The transfer state means being read
 * or written. If the file system has modified the block and releases it as
 * modified it is placed on the pool's modified list and a hold timer is
 * initialised. The buffer is held for the hold time before being written to
 * disk. Buffers are held for a configurable period of time on the modified
 * list as a write sets the state to transfer and this locks the buffer out
 * from the file system until the write completes. Buffers are often repeatedly
 * accessed and modified in a series of small updates so if sent to the disk
 * when released as modified the user would have to block waiting until it had
 * been written. This would be a performance problem.
71 *
 * The code performs multiple block reads and writes. Multiple block reads or
 * read ahead increases performance with hardware that supports it. It also
 * helps with a large cache as the disk head movement is reduced. It is,
 * however, a speculative operation so excessive use can remove valuable and
 * needed blocks from the cache. The get call knows if a read is for the file
 * system or if it is a read ahead get. If the get is for a read ahead block
 * and the block is already in the cache or no ready buffers are available the
 * read ahead is stopped. The transfer occurs with the blocks so far. If a
 * buffer is in the read ahead state and is released it is placed on the ready
 * list rather than the LRU list. This means these buffers are used before
 * buffers used by the file system.
83 *
 * Each pool has the following lists of buffers:
85 *
86 *   ready        - Empty buffers created when the pool is initialised.
87 *   modified     - Buffers waiting to be written to disk.
88 *   sync         - Buffers to be synced to disk.
89 *   lru          - Accessed buffers released in least recently used order.
90 */
91
92/**
93 * Set to 1 to enable debug tracing.
94 */
95#define RTEMS_BDBUF_TRACE 0
96
#if HAVE_CONFIG_H
#include "config.h"
#endif

#include <rtems.h>
#include <rtems/error.h>
#include <limits.h>
#include <errno.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#if RTEMS_BDBUF_TRACE
#include <stdarg.h>
#include <stdio.h>
#endif

#include "rtems/bdbuf.h"
112
/**
 * The BD buffer context. A single file-scope instance (rtems_bdbuf_ctx) holds
 * the pool table and the swap out task state used by the code in this file.
 */
typedef struct rtems_bdbuf_context {
  rtems_bdbuf_pool* pool;      /**< Table of buffer pools */
  int               npools;    /**< Number of entries in pool table */
  rtems_id          swapout;   /**< Swapout task ID */
  boolean           swapout_enabled; /**< Set to TRUE by rtems_bdbuf_init
                                      *   before the swap out task is
                                      *   created */
} rtems_bdbuf_context;
122
/**
 * Fatal errors.
 *
 * RTEMS_BLKDEV_FATAL_ERROR(n) builds a 32-bit fatal error code with the
 * character 'B' in the top 8 bits and the error number n in the low 24 bits.
 * The codes below are passed to rtems_fatal_error_occurred() when an internal
 * operation (locking, waiting, waking, ...) fails.
 */
#define RTEMS_BLKDEV_FATAL_ERROR(n) \
  (((uint32_t)'B' << 24) | ((uint32_t)(n) & (uint32_t)0x00FFFFFF))

#define RTEMS_BLKDEV_FATAL_BDBUF_CONSISTENCY RTEMS_BLKDEV_FATAL_ERROR(1)
#define RTEMS_BLKDEV_FATAL_BDBUF_SWAPOUT     RTEMS_BLKDEV_FATAL_ERROR(2)
#define RTEMS_BLKDEV_FATAL_BDBUF_SYNC_LOCK   RTEMS_BLKDEV_FATAL_ERROR(3)  /* sync lock obtain failed */
#define RTEMS_BLKDEV_FATAL_BDBUF_SYNC_UNLOCK RTEMS_BLKDEV_FATAL_ERROR(4)  /* sync lock release failed */
#define RTEMS_BLKDEV_FATAL_BDBUF_POOL_LOCK   RTEMS_BLKDEV_FATAL_ERROR(5)  /* pool lock obtain failed */
#define RTEMS_BLKDEV_FATAL_BDBUF_POOL_UNLOCK RTEMS_BLKDEV_FATAL_ERROR(6)  /* pool lock release failed */
#define RTEMS_BLKDEV_FATAL_BDBUF_POOL_WAIT   RTEMS_BLKDEV_FATAL_ERROR(7)  /* waiting on a pool semaphore failed */
#define RTEMS_BLKDEV_FATAL_BDBUF_POOL_WAKE   RTEMS_BLKDEV_FATAL_ERROR(8)  /* flushing a pool semaphore failed */
#define RTEMS_BLKDEV_FATAL_BDBUF_SO_WAKE     RTEMS_BLKDEV_FATAL_ERROR(9)  /* sending the swap out event failed */
#define RTEMS_BLKDEV_FATAL_BDBUF_SO_NOMEM    RTEMS_BLKDEV_FATAL_ERROR(10)
/* NOTE(review): the next two names lack the RTEMS_ prefix used by the others. */
#define BLKDEV_FATAL_BDBUF_SWAPOUT_RE        RTEMS_BLKDEV_FATAL_ERROR(11)
#define BLKDEV_FATAL_BDBUF_SWAPOUT_TS        RTEMS_BLKDEV_FATAL_ERROR(12)
141
/**
 * The events used in this code. These should be system events rather than
 * application events.
 *
 * RTEMS_BDBUF_SWAPOUT_SYNC is the event sent to the swap out task by
 * rtems_bdbuf_wake_swapper().
 */
#define RTEMS_BDBUF_TRANSFER_SYNC  RTEMS_EVENT_1
#define RTEMS_BDBUF_SWAPOUT_SYNC   RTEMS_EVENT_2

/**
 * The swap out task stack size in bytes. Should be more than enough for most
 * drivers with tracing turned on.
 */
#define SWAPOUT_TASK_STACK_SIZE (8 * 1024)

/**
 * Lock semaphore attributes. This is used for locking type mutexes (the pool
 * lock and the pool sync lock).
 *
 * @warning Priority inheritance is on.
 */
#define RTEMS_BDBUF_POOL_LOCK_ATTRIBS \
  (RTEMS_PRIORITY | RTEMS_BINARY_SEMAPHORE | \
   RTEMS_INHERIT_PRIORITY | RTEMS_NO_PRIORITY_CEILING | RTEMS_LOCAL)

/**
 * Waiter semaphore attributes. Used for the access, transfer and waiting
 * semaphores tasks block on until they are flushed.
 *
 * @warning Do not configure as inherit priority. If a driver is in the driver
 *          initialisation table this locked semaphore will have the IDLE task
 *          as the holder and a blocking task will raise the priority of the
 *          IDLE task which can cause unusual side effects.
 */
#define RTEMS_BDBUF_POOL_WAITER_ATTRIBS \
  (RTEMS_PRIORITY | RTEMS_SIMPLE_BINARY_SEMAPHORE | \
   RTEMS_NO_INHERIT_PRIORITY | RTEMS_NO_PRIORITY_CEILING | RTEMS_LOCAL)

/*
 * The swap out task entry point. Declared here so rtems_bdbuf_init() can
 * reference it before its definition.
 */
static rtems_task rtems_bdbuf_swapout_task(rtems_task_argument arg);

/**
 * The context of the buffering layer. File-scope storage, so it starts out
 * zero initialised; rtems_bdbuf_init() uses a non-zero npools to detect that
 * the layer is already initialised.
 */
static rtems_bdbuf_context rtems_bdbuf_ctx;
185
/**
 * Print a message to the bdbuf trace output (stdout) and flush it. The message
 * is prefixed with "bdbuf:". Nothing is printed unless the rtems_bdbuf_tracer
 * flag is set. Only compiled when RTEMS_BDBUF_TRACE is non-zero.
 *
 * @param format The format string. See printf for details.
 * @param ... The arguments for the format text.
 */
#if RTEMS_BDBUF_TRACE
boolean rtems_bdbuf_tracer;
static void
rtems_bdbuf_printf (const char *format, ...)
{
  va_list args;

  if (!rtems_bdbuf_tracer)
    return;

  va_start (args, format);
  fprintf (stdout, "bdbuf:");
  vfprintf (stdout, format, args);
  /* Every va_start must be paired with a va_end before returning. */
  va_end (args);
  fflush (stdout);
}
#endif
208
/**
 * The default maximum height of 32 allows for AVL trees having between
 * 5,704,880 and 4,294,967,295 nodes, depending on order of insertion.  You may
 * change this compile-time constant as you wish.
 *
 * The value sizes the on-stack array of visited nodes used by the AVL insert
 * and remove functions; those functions do not check the path length against
 * it.
 */
#ifndef RTEMS_BDBUF_AVL_MAX_HEIGHT
#define RTEMS_BDBUF_AVL_MAX_HEIGHT (32)
#endif
217
218/**
219 * Searches for the node with specified dev/block.
220 *
221 * @param root pointer to the root node of the AVL-Tree
222 * @param dev device search key
223 * @param block block search key
224 * @retval NULL node with the specified dev/block is not found
225 * @return pointer to the node with specified dev/block
226 */
227static rtems_bdbuf_buffer *
228rtems_bdbuf_avl_search (rtems_bdbuf_buffer** root,
229                        dev_t                dev,
230                        rtems_blkdev_bnum    block)
231{
232  rtems_bdbuf_buffer* p = *root;
233
234  while ((p != NULL) && ((p->dev != dev) || (p->block != block)))
235  {
236    if ((p->dev < dev) || ((p->dev == dev) && (p->block < block)))
237    {
238      p = p->avl.right;
239    }
240    else
241    {
242      p = p->avl.left;
243    }
244  }
245
246  return p;
247}
248
/**
 * Inserts the specified node into the AVL tree, keyed by the node's
 * device/block pair, and rebalances the tree on the way back up.
 *
 * While descending, each visited node is pushed on a local stack and its
 * avl.cache field records the direction taken (1 = right, -1 = left). The
 * avl.bal field is the balance factor: -1 when the left sub-tree is deeper,
 * +1 when the right sub-tree is deeper, 0 when balanced.
 *
 * @param root pointer to the root node of the AVL-Tree
 * @param node Pointer to the node to add.
 * @retval 0 The node added successfully
 * @retval -1 A node with the same dev/block is already in the tree
 */
static int
rtems_bdbuf_avl_insert(rtems_bdbuf_buffer** root,
                       rtems_bdbuf_buffer*  node)
{
  dev_t             dev = node->dev;
  rtems_blkdev_bnum block = node->block;

  rtems_bdbuf_buffer*  p = *root;
  rtems_bdbuf_buffer*  q;
  rtems_bdbuf_buffer*  p1;
  rtems_bdbuf_buffer*  p2;
  /* Path from the root to the insertion point; not bounds checked. */
  rtems_bdbuf_buffer*  buf_stack[RTEMS_BDBUF_AVL_MAX_HEIGHT];
  rtems_bdbuf_buffer** buf_prev = buf_stack;

  boolean modified = FALSE;

  /* Empty tree: the node becomes the root. */
  if (p == NULL)
  {
    *root = node;
    node->avl.left = NULL;
    node->avl.right = NULL;
    node->avl.bal = 0;
    return 0;
  }

  /*
   * Descend to the insertion point, remembering the path and the direction
   * taken at every node. The loop is left with p as the new node's parent
   * and q as the new node.
   */
  while (p != NULL)
  {
    *buf_prev++ = p;

    if ((p->dev < dev) || ((p->dev == dev) && (p->block < block)))
    {
      p->avl.cache = 1;
      q = p->avl.right;
      if (q == NULL)
      {
        q = node;
        p->avl.right = q = node;
        break;
      }
    }
    else if ((p->dev != dev) || (p->block != block))
    {
      p->avl.cache = -1;
      q = p->avl.left;
      if (q == NULL)
      {
        q = node;
        p->avl.left = q;
        break;
      }
    }
    else
    {
      /* Duplicate key. */
      return -1;
    }

    p = q;
  }

  q->avl.left = q->avl.right = NULL;
  q->avl.bal = 0;
  modified = TRUE;
  /* Pop the parent (p); the stack now holds p's ancestors only. */
  buf_prev--;

  /*
   * Walk back up the path. 'modified' stays TRUE while the height of the
   * sub-tree rooted at p has grown and the change must be propagated.
   */
  while (modified)
  {
    if (p->avl.cache == -1)
    {
      /* The insertion was in p's left sub-tree. */
      switch (p->avl.bal)
      {
        case 1:
          p->avl.bal = 0;
          modified = FALSE;
          break;

        case 0:
          p->avl.bal = -1;
          break;

        case -1:
          p1 = p->avl.left;
          if (p1->avl.bal == -1) /* simple LL-turn */
          {
            p->avl.left = p1->avl.right;
            p1->avl.right = p;
            p->avl.bal = 0;
            p = p1;
          }
          else /* double LR-turn */
          {
            p2 = p1->avl.right;
            p1->avl.right = p2->avl.left;
            p2->avl.left = p1;
            p->avl.left = p2->avl.right;
            p2->avl.right = p;
            if (p2->avl.bal == -1) p->avl.bal = +1; else p->avl.bal = 0;
            if (p2->avl.bal == +1) p1->avl.bal = -1; else p1->avl.bal = 0;
            p = p2;
          }
          /* p is now the root of the rotated sub-tree. */
          p->avl.bal = 0;
          modified = FALSE;
          break;

        default:
          break;
      }
    }
    else
    {
      /* The insertion was in p's right sub-tree. */
      switch (p->avl.bal)
      {
        case -1:
          p->avl.bal = 0;
          modified = FALSE;
          break;

        case 0:
          p->avl.bal = 1;
          break;

        case 1:
          p1 = p->avl.right;
          if (p1->avl.bal == 1) /* simple RR-turn */
          {
            p->avl.right = p1->avl.left;
            p1->avl.left = p;
            p->avl.bal = 0;
            p = p1;
          }
          else /* double RL-turn */
          {
            p2 = p1->avl.left;
            p1->avl.left = p2->avl.right;
            p2->avl.right = p1;
            p->avl.right = p2->avl.left;
            p2->avl.left = p;
            if (p2->avl.bal == +1) p->avl.bal = -1; else p->avl.bal = 0;
            if (p2->avl.bal == -1) p1->avl.bal = +1; else p1->avl.bal = 0;
            p = p2;
          }
          /* p is now the root of the rotated sub-tree. */
          p->avl.bal = 0;
          modified = FALSE;
          break;

        default:
          break;
      }
    }
    /*
     * Re-attach the (possibly rotated) sub-tree q to its parent, or make it
     * the new root when the path is exhausted. This is done even when
     * 'modified' has just been cleared so a rotation is always linked in.
     */
    q = p;
    if (buf_prev > buf_stack)
    {
      p = *--buf_prev;

      if (p->avl.cache == -1)
      {
        p->avl.left = q;
      }
      else
      {
        p->avl.right = q;
      }
    }
    else
    {
      *root = p;
      break;
    }
  };

  return 0;
}
428
429
/**
 * Removes the node with the same dev/block key as the given node from the
 * tree and rebalances the tree on the way back up.
 *
 * While descending, each visited node is pushed on a local stack and its
 * avl.cache field records the direction taken (1 = right, -1 = left). The
 * avl.bal field is the balance factor: -1 when the left sub-tree is deeper,
 * +1 when the right sub-tree is deeper, 0 when balanced.
 *
 * @param root Pointer to pointer to the root node
 * @param node Pointer to the node to remove
 * @retval 0 Item removed
 * @retval -1 No such item found
 */
static int
rtems_bdbuf_avl_remove(rtems_bdbuf_buffer**      root,
                       const rtems_bdbuf_buffer* node)
{
  dev_t             dev = node->dev;
  rtems_blkdev_bnum block = node->block;

  rtems_bdbuf_buffer*  p = *root;
  rtems_bdbuf_buffer*  q;
  rtems_bdbuf_buffer*  r;
  rtems_bdbuf_buffer*  s;
  rtems_bdbuf_buffer*  p1;
  rtems_bdbuf_buffer*  p2;
  /* Path from the root to the removed node; not bounds checked. */
  rtems_bdbuf_buffer*  buf_stack[RTEMS_BDBUF_AVL_MAX_HEIGHT];
  rtems_bdbuf_buffer** buf_prev = buf_stack;

  boolean modified = FALSE;

  memset (buf_stack, 0, sizeof(buf_stack));

  /*
   * Descend to the node with the matching key, remembering the path and the
   * direction taken at every node.
   */
  while (p != NULL)
  {
    *buf_prev++ = p;

    if ((p->dev < dev) || ((p->dev == dev) && (p->block < block)))
    {
      p->avl.cache = 1;
      p = p->avl.right;
    }
    else if ((p->dev != dev) || (p->block != block))
    {
      p->avl.cache = -1;
      p = p->avl.left;
    }
    else
    {
      /* node found */
      break;
    }
  }

  if (p == NULL)
  {
    /* there is no such node */
    return -1;
  }

  q = p;

  /* Pop the found node; the entry below it (if any) is its parent. */
  buf_prev--;
  if (buf_prev > buf_stack)
  {
    p = *(buf_prev - 1);
  }
  else
  {
    p = NULL;
  }

  /* at this moment q - is a node to delete, p is q's parent */
  if (q->avl.right == NULL)
  {
    /* No right sub-tree: the left child (possibly NULL) replaces q. */
    r = q->avl.left;
    if (r != NULL)
    {
      r->avl.bal = 0;
    }
    q = r;
  }
  else
  {
    rtems_bdbuf_buffer **t;

    r = q->avl.right;

    if (r->avl.left == NULL)
    {
      /*
       * The right child has no left sub-tree so it replaces q directly. It
       * is pushed on the path with direction "right" because its right
       * side is where the height was reduced.
       */
      r->avl.left = q->avl.left;
      r->avl.bal = q->avl.bal;
      r->avl.cache = 1;
      *buf_prev++ = q = r;
    }
    else
    {
      /*
       * Find the left-most node (s) of q's right sub-tree; r tracks its
       * parent. The stack slot t is reserved for the replacement node and
       * filled in once s is known.
       */
      t = buf_prev++;
      s = r;

      while (s->avl.left != NULL)
      {
        *buf_prev++ = r = s;
        s = r->avl.left;
        r->avl.cache = -1;
      }

      /* Unlink s from its parent and put it in q's position. */
      s->avl.left = q->avl.left;
      r->avl.left = s->avl.right;
      s->avl.right = q->avl.right;
      s->avl.bal = q->avl.bal;
      s->avl.cache = 1;

      *t = q = s;
    }
  }

  /* Link the replacement (q, possibly NULL) to the removed node's parent. */
  if (p != NULL)
  {
    if (p->avl.cache == -1)
    {
      p->avl.left = q;
    }
    else
    {
      p->avl.right = q;
    }
  }
  else
  {
    *root = q;
  }

  modified = TRUE;

  /*
   * Walk back up the path. 'modified' stays TRUE while the height of the
   * sub-tree rooted at p has shrunk and the change must be propagated.
   */
  while (modified)
  {
    if (buf_prev > buf_stack)
    {
      p = *--buf_prev;
    }
    else
    {
      break;
    }

    if (p->avl.cache == -1)
    {
      /* rebalance left branch */
      switch (p->avl.bal)
      {
        case -1:
          p->avl.bal = 0;
          break;
        case  0:
          p->avl.bal = 1;
          modified = FALSE;
          break;

        case +1:
          p1 = p->avl.right;

          if (p1->avl.bal >= 0) /* simple RR-turn */
          {
            p->avl.right = p1->avl.left;
            p1->avl.left = p;

            if (p1->avl.bal == 0)
            {
              /* Height of the rotated sub-tree is unchanged: stop. */
              p1->avl.bal = -1;
              modified = FALSE;
            }
            else
            {
              p->avl.bal = 0;
              p1->avl.bal = 0;
            }
            p = p1;
          }
          else /* double RL-turn */
          {
            p2 = p1->avl.left;

            p1->avl.left = p2->avl.right;
            p2->avl.right = p1;
            p->avl.right = p2->avl.left;
            p2->avl.left = p;

            if (p2->avl.bal == +1) p->avl.bal = -1; else p->avl.bal = 0;
            if (p2->avl.bal == -1) p1->avl.bal = 1; else p1->avl.bal = 0;

            p = p2;
            p2->avl.bal = 0;
          }
          break;

        default:
          break;
      }
    }
    else
    {
      /* rebalance right branch */
      switch (p->avl.bal)
      {
        case +1:
          p->avl.bal = 0;
          break;

        case  0:
          p->avl.bal = -1;
          modified = FALSE;
          break;

        case -1:
          p1 = p->avl.left;

          if (p1->avl.bal <= 0) /* simple LL-turn */
          {
            p->avl.left = p1->avl.right;
            p1->avl.right = p;
            if (p1->avl.bal == 0)
            {
              /* Height of the rotated sub-tree is unchanged: stop. */
              p1->avl.bal = 1;
              modified = FALSE;
            }
            else
            {
              p->avl.bal = 0;
              p1->avl.bal = 0;
            }
            p = p1;
          }
          else /* double LR-turn */
          {
            p2 = p1->avl.right;

            p1->avl.right = p2->avl.left;
            p2->avl.left = p1;
            p->avl.left = p2->avl.right;
            p2->avl.right = p;

            if (p2->avl.bal == -1) p->avl.bal = 1; else p->avl.bal = 0;
            if (p2->avl.bal == +1) p1->avl.bal = -1; else p1->avl.bal = 0;

            p = p2;
            p2->avl.bal = 0;
          }
          break;

        default:
          break;
      }
    }

    /*
     * Re-attach the (possibly rotated) sub-tree p to its parent, or make it
     * the new root when the path is exhausted.
     */
    if (buf_prev > buf_stack)
    {
      q = *(buf_prev - 1);

      if (q->avl.cache == -1)
      {
        q->avl.left = p;
      }
      else
      {
        q->avl.right = p;
      }
    }
    else
    {
      *root = p;
      break;
    }

  }

  return 0;
}
703
704/**
705 * Get the pool for the device.
706 *
707 * @param pdd Physical disk device.
708 */
709static rtems_bdbuf_pool*
710rtems_bdbuf_get_pool (const rtems_bdpool_id pid)
711{
712  return &rtems_bdbuf_ctx.pool[pid];
713}
714
715/**
716 * Lock the pool. A single task can nest calls.
717 *
718 * @param pool The pool to lock.
719 */
720static void
721rtems_bdbuf_lock_pool (rtems_bdbuf_pool* pool)
722{
723  rtems_status_code sc = rtems_semaphore_obtain (pool->lock,
724                                                 RTEMS_WAIT,
725                                                 RTEMS_NO_TIMEOUT);
726  if (sc != RTEMS_SUCCESSFUL)
727    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_POOL_LOCK);
728}
729
730/**
731 * Unlock the pool.
732 *
733 * @param pool The pool to unlock.
734 */
735static void
736rtems_bdbuf_unlock_pool (rtems_bdbuf_pool* pool)
737{
738  rtems_status_code sc = rtems_semaphore_release (pool->lock);
739  if (sc != RTEMS_SUCCESSFUL)
740    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_POOL_UNLOCK);
741}
742
743/**
744 * Lock the pool's sync. A single task can nest calls.
745 *
746 * @param pool The pool's sync to lock.
747 */
748static void
749rtems_bdbuf_lock_sync (rtems_bdbuf_pool* pool)
750{
751  rtems_status_code sc = rtems_semaphore_obtain (pool->sync_lock,
752                                                 RTEMS_WAIT,
753                                                 RTEMS_NO_TIMEOUT);
754  if (sc != RTEMS_SUCCESSFUL)
755    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SYNC_LOCK);
756}
757
758/**
759 * Unlock the pool's sync.
760 *
761 * @param pool The pool's sync to unlock.
762 */
763static void
764rtems_bdbuf_unlock_sync (rtems_bdbuf_pool* pool)
765{
766  rtems_status_code sc = rtems_semaphore_release (pool->sync_lock);
767  if (sc != RTEMS_SUCCESSFUL)
768    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SYNC_UNLOCK);
769}
770
771/**
772 * Wait until woken. Semaphores are used so a number of tasks can wait and can
773 * be woken at once. Task events would require we maintain a list of tasks to
774 * be woken and this would require storgage and we do not know the number of
775 * tasks that could be waiting.
776 *
777 * While we have the pool locked we can try and claim the semaphore and
778 * therefore know when we release the lock to the pool we will block until the
779 * semaphore is released. This may even happen before we get to block.
780 *
781 * A counter is used to save the release call when no one is waiting.
782 *
783 * The function assumes the pool is locked on entry and it will be locked on
784 * exit.
785 *
786 * @param pool The pool to wait for a buffer to return.
787 * @param sema The semaphore to block on and wait.
788 * @param waiters The wait counter for this semaphore.
789 */
790static void
791rtems_bdbuf_wait (rtems_bdbuf_pool* pool, rtems_id* sema,
792                  volatile uint32_t* waiters)
793{
794  rtems_status_code sc;
795  rtems_mode        prev_mode;
796 
797  /*
798   * Indicate we are waiting.
799   */
800  *waiters += 1;
801
802  /*
803   * Disable preemption then unlock the pool and block.
804   * There is no POSIX condition variable in the core API so
805   * this is a work around.
806   *
807   * The issue is a task could preempt after the pool is unlocked
808   * because it is blocking or just hits that window, and before
809   * this task has blocked on the semaphore. If the preempting task
810   * flushes the queue this task will not see the flush and may
811   * block for ever or until another transaction flushes this
812   * semaphore.
813   */
814  sc = rtems_task_mode (RTEMS_NO_PREEMPT, RTEMS_PREEMPT_MASK, &prev_mode);
815
816  if (sc != RTEMS_SUCCESSFUL)
817    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_POOL_WAIT);
818 
819  /*
820   * Unlock the pool, wait, and lock the pool when we return.
821   */
822  rtems_bdbuf_unlock_pool (pool);
823
824  sc = rtems_semaphore_obtain (*sema, RTEMS_WAIT, RTEMS_NO_TIMEOUT);
825 
826  if (sc != RTEMS_UNSATISFIED)
827    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_POOL_WAIT);
828 
829  rtems_bdbuf_lock_pool (pool);
830
831  sc = rtems_task_mode (prev_mode, RTEMS_ALL_MODE_MASKS, &prev_mode);
832
833  if (sc != RTEMS_SUCCESSFUL)
834    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_POOL_WAIT);
835 
836  *waiters -= 1;
837}
838
839/**
840 * Wake a blocked resource. The resource has a counter that lets us know if
841 * there are any waiters.
842 *
843 * @param sema The semaphore to release.
844 * @param waiters The wait counter for this semaphore.
845 */
846static void
847rtems_bdbuf_wake (rtems_id sema, volatile uint32_t* waiters)
848{
849  if (*waiters)
850  {
851    rtems_status_code sc;
852
853    sc = rtems_semaphore_flush (sema);
854 
855    if (sc != RTEMS_SUCCESSFUL)
856      rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_POOL_WAKE);
857  }
858}
859
860/**
861 * Add a buffer descriptor to the modified list. This modified list is treated
862 * a litte differently to the other lists. To access it you must have the pool
863 * locked and this is assumed to be the case on entry to this call.
864 *
865 * If the pool has a device being sync'ed and the bd is for that device the
866 * call must block and wait until the sync is over before adding the bd to the
867 * modified list. Once a sync happens for a device no bd's can be added the
868 * modified list. The disk image is forced to be snapshot at that moment in
869 * time.
870 *
871 * and you must
872 * hold the sync lock. The sync lock is used to block writes while a sync is
873 * active.
874 *
875 * @param pool The pool the bd belongs to.
876 * @param bd The bd to queue to the pool's modified list.
877 */
878static void
879rtems_bdbuf_append_modified (rtems_bdbuf_pool* pool, rtems_bdbuf_buffer* bd)
880{
881  /*
882   * If the pool has a device being sync'ed check if this bd is for that
883   * device. If it is unlock the pool and block on the sync lock. once we have
884   * the sync lock reelase it.
885   *
886   * If the
887   */
888  if (pool->sync_active && (pool->sync_device == bd->dev))
889  {
890    rtems_bdbuf_unlock_pool (pool);
891    rtems_bdbuf_lock_sync (pool);
892    rtems_bdbuf_unlock_sync (pool);
893    rtems_bdbuf_lock_pool (pool);
894  }
895     
896  bd->state = RTEMS_BDBUF_STATE_MODIFIED;
897
898  rtems_chain_append (&pool->modified, &bd->link);
899}
900
901/**
902 * Wait the swapper task.
903 */
904static void
905rtems_bdbuf_wake_swapper (void)
906{
907  rtems_status_code sc = rtems_event_send (rtems_bdbuf_ctx.swapout,
908                                           RTEMS_BDBUF_SWAPOUT_SYNC);
909  if (sc != RTEMS_SUCCESSFUL)
910    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SO_WAKE);
911}
912
913/**
914 * Initialize single buffer pool.
915 *
916 * @param config Buffer pool configuration
917 * @param pid Pool number
918 *
919 * @return RTEMS_SUCCESSFUL, if buffer pool initialized successfully, or error
920 *         code if error occured.
921 */
922static rtems_status_code
923rtems_bdbuf_initialize_pool (rtems_bdbuf_pool_config* config,
924                             rtems_bdpool_id          pid)
925{
926  unsigned char*      buffer = config->mem_area;
927  rtems_bdbuf_pool*   pool;
928  rtems_bdbuf_buffer* bd;
929  rtems_status_code   sc;
930  uint32_t            b;
931
932  pool = rtems_bdbuf_get_pool (pid);
933 
934  pool->blksize        = config->size;
935  pool->nblks          = config->num;
936  pool->flags          = 0;
937  pool->sync_active    = FALSE;
938  pool->sync_device    = -1;
939  pool->sync_requester = 0;
940  pool->tree           = NULL;
941  pool->buffers        = NULL;
942
943  rtems_chain_initialize_empty (&pool->ready);
944  rtems_chain_initialize_empty (&pool->lru);
945  rtems_chain_initialize_empty (&pool->modified);
946  rtems_chain_initialize_empty (&pool->sync);
947
948  pool->access           = 0;
949  pool->access_waiters   = 0;
950  pool->transfer         = 0;
951  pool->transfer_waiters = 0;
952  pool->waiting          = 0;
953  pool->wait_waiters     = 0;
954 
955  /*
956   * Allocate memory for buffer descriptors
957   */
958  pool->bds = calloc (config->num, sizeof (rtems_bdbuf_buffer));
959 
960  if (!pool->bds)
961    return RTEMS_NO_MEMORY;
962
963  /*
964   * Allocate memory for buffers if required.
965   */
966  if (buffer == NULL)
967  {
968    buffer = pool->buffers = malloc (config->num * config->size);
969    if (!pool->buffers)
970    {
971      free (pool->bds);
972      return RTEMS_NO_MEMORY;
973    }
974  }
975
976  for (b = 0, bd = pool->bds;
977       b < pool->nblks;
978       b++, bd++, buffer += pool->blksize)
979  {
980    bd->dev        = -1;
981    bd->block      = 0;
982    bd->buffer     = buffer;
983    bd->avl.left   = NULL;
984    bd->avl.right  = NULL;
985    bd->state      = RTEMS_BDBUF_STATE_EMPTY;
986    bd->pool       = pid;
987    bd->error      = 0;
988    bd->waiters    = 0;
989    bd->hold_timer = 0;
990   
991    rtems_chain_append (&pool->ready, &bd->link);
992  }
993
994  sc = rtems_semaphore_create (rtems_build_name ('B', 'P', '0' + pid, 'L'),
995                               1, RTEMS_BDBUF_POOL_LOCK_ATTRIBS, 0,
996                               &pool->lock);
997  if (sc != RTEMS_SUCCESSFUL)
998  {
999    free (pool->buffers);
1000    free (pool->bds);
1001    return sc;
1002  }
1003
1004  sc = rtems_semaphore_create (rtems_build_name ('B', 'P', '0' + pid, 'S'),
1005                               1, RTEMS_BDBUF_POOL_LOCK_ATTRIBS, 0,
1006                               &pool->sync_lock);
1007  if (sc != RTEMS_SUCCESSFUL)
1008  {
1009    rtems_semaphore_delete (pool->lock);
1010    free (pool->buffers);
1011    free (pool->bds);
1012    return sc;
1013  }
1014 
1015  sc = rtems_semaphore_create (rtems_build_name ('B', 'P', '0' + pid, 'a'),
1016                               0, RTEMS_BDBUF_POOL_WAITER_ATTRIBS, 0,
1017                               &pool->access);
1018  if (sc != RTEMS_SUCCESSFUL)
1019  {
1020    rtems_semaphore_delete (pool->sync_lock);
1021    rtems_semaphore_delete (pool->lock);
1022    free (pool->buffers);
1023    free (pool->bds);
1024    return sc;
1025  }
1026
1027  sc = rtems_semaphore_create (rtems_build_name ('B', 'P', '0' + pid, 't'),
1028                               0, RTEMS_BDBUF_POOL_WAITER_ATTRIBS, 0,
1029                               &pool->transfer);
1030  if (sc != RTEMS_SUCCESSFUL)
1031  {
1032    rtems_semaphore_delete (pool->access);
1033    rtems_semaphore_delete (pool->sync_lock);
1034    rtems_semaphore_delete (pool->lock);
1035    free (pool->buffers);
1036    free (pool->bds);
1037    return sc;
1038  }
1039
1040  sc = rtems_semaphore_create (rtems_build_name ('B', 'P', '0' + pid, 'w'),
1041                               0, RTEMS_BDBUF_POOL_WAITER_ATTRIBS, 0,
1042                               &pool->waiting);
1043  if (sc != RTEMS_SUCCESSFUL)
1044  {
1045    rtems_semaphore_delete (pool->transfer);
1046    rtems_semaphore_delete (pool->access);
1047    rtems_semaphore_delete (pool->sync_lock);
1048    rtems_semaphore_delete (pool->lock);
1049    free (pool->buffers);
1050    free (pool->bds);
1051    return sc;
1052  }
1053
1054  return RTEMS_SUCCESSFUL;
1055}
1056
1057/**
1058 * Free resources allocated for buffer pool with specified number.
1059 *
1060 * @param pid Buffer pool number
1061 *
1062 * @retval RTEMS_SUCCESSFUL
1063 */
1064static rtems_status_code
1065rtems_bdbuf_release_pool (rtems_bdpool_id pid)
1066{
1067  rtems_bdbuf_pool* pool = rtems_bdbuf_get_pool (pid);
1068 
1069  rtems_bdbuf_lock_pool (pool);
1070
1071  rtems_semaphore_delete (pool->waiting);
1072  rtems_semaphore_delete (pool->transfer);
1073  rtems_semaphore_delete (pool->access);
1074  rtems_semaphore_delete (pool->lock);
1075 
1076  free (pool->buffers);
1077  free (pool->bds);
1078 
1079  return RTEMS_SUCCESSFUL;
1080}
1081
1082/**
1083 * Prepare buffering layer to work - initialize buffer descritors and (if it is
1084 * neccessary) buffers. Buffers will be allocated accoriding to the
1085 * configuration table, each entry describes the size of block and the size of
1086 * the pool. After initialization all blocks is placed into the ready state.
1087 * lists.
1088 *
1089 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
1090 *         successfully or error code if error is occured)
1091 */
1092rtems_status_code
1093rtems_bdbuf_init (void)
1094{
1095  rtems_bdpool_id   p;
1096  rtems_status_code sc;
1097
1098#if RTEMS_BDBUF_TRACE
1099  rtems_bdbuf_printf ("init\n");
1100#endif
1101
1102  if (rtems_bdbuf_pool_configuration_size <= 0)
1103    return RTEMS_INVALID_SIZE;
1104
1105  if (rtems_bdbuf_ctx.npools)
1106    return RTEMS_RESOURCE_IN_USE;
1107
1108  rtems_bdbuf_ctx.npools = rtems_bdbuf_pool_configuration_size;
1109
1110  /*
1111   * Allocate memory for buffer pool descriptors
1112   */
1113  rtems_bdbuf_ctx.pool = calloc (rtems_bdbuf_pool_configuration_size,
1114                                 sizeof (rtems_bdbuf_pool));
1115 
1116  if (rtems_bdbuf_ctx.pool == NULL)
1117    return RTEMS_NO_MEMORY;
1118
1119  /*
1120   * Initialize buffer pools and roll out if something failed,
1121   */
1122  for (p = 0; p < rtems_bdbuf_ctx.npools; p++)
1123  {
1124    sc = rtems_bdbuf_initialize_pool (&rtems_bdbuf_pool_configuration[p], p);
1125    if (sc != RTEMS_SUCCESSFUL)
1126    {
1127      rtems_bdpool_id j;
1128      for (j = 0; j < p - 1; j++)
1129        rtems_bdbuf_release_pool (j);
1130      return sc;
1131    }
1132  }
1133
1134  /*
1135   * Create and start swapout task
1136   */
1137
1138  rtems_bdbuf_ctx.swapout_enabled = TRUE;
1139 
1140  sc = rtems_task_create (rtems_build_name('B', 'S', 'W', 'P'),
1141                          (rtems_bdbuf_configuration.swapout_priority ?
1142                           rtems_bdbuf_configuration.swapout_priority :
1143                           RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT),
1144                          SWAPOUT_TASK_STACK_SIZE,
1145                          RTEMS_PREEMPT | RTEMS_NO_TIMESLICE | RTEMS_NO_ASR,
1146                          RTEMS_LOCAL | RTEMS_NO_FLOATING_POINT,
1147                          &rtems_bdbuf_ctx.swapout);
1148  if (sc != RTEMS_SUCCESSFUL)
1149  {
1150    for (p = 0; p < rtems_bdbuf_ctx.npools; p++)
1151      rtems_bdbuf_release_pool (p);
1152    free (rtems_bdbuf_ctx.pool);
1153    return sc;
1154  }
1155
1156  sc = rtems_task_start (rtems_bdbuf_ctx.swapout,
1157                         rtems_bdbuf_swapout_task,
1158                         (rtems_task_argument) &rtems_bdbuf_ctx);
1159  if (sc != RTEMS_SUCCESSFUL)
1160  {
1161    rtems_task_delete (rtems_bdbuf_ctx.swapout);
1162    for (p = 0; p < rtems_bdbuf_ctx.npools; p++)
1163      rtems_bdbuf_release_pool (p);
1164    free (rtems_bdbuf_ctx.pool);
1165    return sc;
1166  }
1167
1168  return RTEMS_SUCCESSFUL;
1169}
1170
1171/**
1172 * Get a buffer for this device and block. This function returns a buffer once
1173 * placed into the AVL tree. If no buffer is available and it is not a read
1174 * ahead request and no buffers are waiting to the written to disk wait until
1175 * one is available. If buffers are waiting to be written to disk and non are
1176 * available expire the hold timer and wake the swap out task. If the buffer is
1177 * for a read ahead transfer return NULL if there is not buffer or it is in the
1178 * cache.
1179 *
1180 * The AVL tree of buffers for the pool is searched and if not located check
1181 * obtain a buffer and insert it into the AVL tree. Buffers are first obtained
1182 * from the ready list until all empty/ready buffers are used. Once all buffers
1183 * are in use buffers are taken from the LRU list with the least recently used
1184 * buffer taken first. A buffer taken from the LRU list is removed from the AVL
1185 * tree. The ready list or LRU list buffer is initialised to this device and
1186 * block. If no buffers are available due to the ready and LRU lists being
1187 * empty a check is made of the modified list. Buffers may be queued waiting
1188 * for the hold timer to expire. These buffers should be written to disk and
1189 * returned to the LRU list where they can be used rather than this call
1190 * blocking. If buffers are on the modified list the max. write block size of
1191 * buffers have their hold timer expired and the swap out task woken. The
1192 * caller then blocks on the waiting semaphore and counter. When buffers return
1193 * from the upper layers (access) or lower driver (transfer) the blocked caller
1194 * task is woken and this procedure is repeated. The repeat handles a case of a
1195 * another thread pre-empting getting a buffer first and adding it to the AVL
1196 * tree.
1197 *
1198 * A buffer located in the AVL tree means it is already in the cache and maybe
1199 * in use somewhere. The buffer can be either:
1200 *
1201 * # Cached. Not being accessed or part of a media transfer.
1202 * # Access or modifed access. Is with an upper layer being accessed.
1203 * # Transfer. Is with the driver and part of a media transfer.
1204 *
1205 * If cached we assign the new state, extract it from any list it maybe part of
1206 * and return to the user.
1207 *
1208 * This function assumes the pool the buffer is being taken from is locked and
1209 * it will make sure the pool is locked when it returns. The pool will be
1210 * unlocked if the call could block.
1211 *
1212 * @param device The physical disk device
1213 * @param pool The pool reference
1214 * @param block Absolute media block number
1215 * @param read_ahead The get is for a read ahead buffer
1216 *
1217 * @return RTEMS status code ( if operation completed successfully or error
1218 *         code if error is occured)
1219 */
1220static rtems_bdbuf_buffer*
1221rtems_bdbuf_get_buffer (rtems_disk_device* pdd,
1222                        rtems_bdbuf_pool*  pool,
1223                        rtems_blkdev_bnum  block,
1224                        boolean            read_ahead)
1225{
1226  dev_t               device = pdd->dev;
1227  rtems_bdbuf_buffer* bd;
1228  boolean             available;
1229
1230  /*
1231   * Loop until we get a buffer. Under load we could find no buffers are
1232   * available requiring this task to wait until some become available before
1233   * proceeding. There is no timeout. If the call is to block and the buffer is
1234   * for a read ahead buffer return NULL.
1235   *
1236   * The search procedure is repeated as another thread could have pre-empted
1237   * us while we waited for a buffer, obtained an empty buffer and loaded the
1238   * AVL tree with the one we are after.
1239   */
1240  do
1241  {
1242    /*
1243     * Search for buffer descriptor for this dev/block key.
1244     */
1245    bd = rtems_bdbuf_avl_search (&pool->tree, device, block);
1246
1247    /*
1248     * No buffer in the cache for this block. We need to obtain a buffer and
1249     * this means take a buffer that is ready to use. If all buffers are in use
1250     * take the least recently used buffer. If there are none then the cache is
1251     * empty. All the buffers are either queued to be written to disk or with
1252     * the user. We cannot do much with the buffers with the user how-ever with
1253     * the modified buffers waiting to be written to disk flush the maximum
1254     * number transfered in a block to disk. After this all that can be done is
1255     * to wait for a buffer to return to the cache.
1256     */
1257    if (!bd)
1258    {
1259      /*
1260       * Assign new buffer descriptor from the empty list if one is present. If
1261       * the empty queue is empty get the oldest buffer from LRU list. If the
1262       * LRU list is empty there are no available buffers check the modified
1263       * list.
1264       */
1265      if (rtems_chain_is_empty (&pool->ready))
1266      {
1267        /*
1268         * No unsed or read-ahead buffers.
1269         *
1270         * If this is a read ahead buffer just return. No need to place further
1271         * pressure on the cache by reading something that may be needed when
1272         * we have data in the cache that was needed and may still be.
1273         */
1274        if (read_ahead)
1275          return NULL;
1276
1277        /*
1278         * Check the LRU list.
1279         */
1280        bd = (rtems_bdbuf_buffer *) rtems_chain_get (&pool->lru);
1281       
1282        if (bd)
1283        {
1284          /*
1285           * Remove the buffer from the AVL tree.
1286           */
1287          if (rtems_bdbuf_avl_remove (&pool->tree, bd) != 0)
1288            rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CONSISTENCY);
1289        }
1290        else
1291        {
1292          /*
1293           * If there are buffers on the modified list expire the hold timer
1294           * and wake the swap out task then wait else just go and wait.
1295           */
1296          if (!rtems_chain_is_empty (&pool->modified))
1297          {
1298            rtems_chain_node* node = rtems_chain_head (&pool->modified);
1299            uint32_t          write_blocks = 0;
1300           
1301            node = node->next;
1302            while ((write_blocks < rtems_bdbuf_configuration.max_write_blocks) &&
1303                   !rtems_chain_is_tail (&pool->modified, node))
1304            {
1305              rtems_bdbuf_buffer* bd = (rtems_bdbuf_buffer*) node;
1306              bd->hold_timer = 0;
1307              write_blocks++;
1308              node = node->next;
1309            }
1310
1311            rtems_bdbuf_wake_swapper ();
1312          }
1313         
1314          /*
1315           * Wait for a buffer to be returned to the pool. The buffer will be
1316           * placed on the LRU list.
1317           */
1318          rtems_bdbuf_wait (pool, &pool->waiting, &pool->wait_waiters);
1319        }
1320      }
1321      else
1322      {
1323        bd = (rtems_bdbuf_buffer *) rtems_chain_get (&(pool->ready));
1324
1325        if ((bd->state != RTEMS_BDBUF_STATE_EMPTY) &&
1326            (bd->state != RTEMS_BDBUF_STATE_READ_AHEAD))
1327          rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CONSISTENCY);
1328
1329        if (bd->state == RTEMS_BDBUF_STATE_READ_AHEAD)
1330        {
1331          if (rtems_bdbuf_avl_remove (&pool->tree, bd) != 0)
1332            rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CONSISTENCY);
1333        }
1334      }
1335
1336      if (bd)
1337      {
1338        bd->dev       = device;
1339        bd->block     = block;
1340        bd->avl.left  = NULL;
1341        bd->avl.right = NULL;
1342        bd->state     = RTEMS_BDBUF_STATE_EMPTY;
1343        bd->error     = 0;
1344        bd->waiters   = 0;
1345
1346        if (rtems_bdbuf_avl_insert (&pool->tree, bd) != 0)
1347          rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CONSISTENCY);
1348
1349        return bd;
1350      }
1351    }
1352  }
1353  while (!bd);
1354
1355  /*
1356   * If the buffer is for read ahead and it exists in the AVL cache or is being
1357   * accessed or being transfered then return NULL.
1358   */
1359  if (read_ahead)
1360    return NULL;
1361
1362  /*
1363   * Loop waiting for the buffer to enter the cached state. If the buffer is in
1364   * the access or transfer state then wait until it is not.
1365   */
1366  available = FALSE;
1367  while (!available)
1368  {
1369    switch (bd->state)
1370    {
1371      case RTEMS_BDBUF_STATE_CACHED:
1372      case RTEMS_BDBUF_STATE_MODIFIED:
1373      case RTEMS_BDBUF_STATE_READ_AHEAD:
1374        available = TRUE;
1375        break;
1376
1377      case RTEMS_BDBUF_STATE_ACCESS:
1378      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
1379        bd->waiters++;
1380        rtems_bdbuf_wait (pool, &pool->access, &pool->access_waiters);
1381        bd->waiters--;
1382        break;
1383
1384      case RTEMS_BDBUF_STATE_SYNC:
1385      case RTEMS_BDBUF_STATE_TRANSFER:
1386        bd->waiters++;
1387        rtems_bdbuf_wait (pool, &pool->transfer, &pool->transfer_waiters);
1388        bd->waiters--;
1389        break;
1390
1391      default:
1392        rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CONSISTENCY);
1393    }
1394  }
1395
1396  /*
1397   * Buffer is linked to the LRU, modifed, or sync lists. Remove it from there.
1398   */
1399  rtems_chain_extract (&bd->link);
1400
1401  return bd;
1402}
1403
1404/**
1405 * Get block buffer for data to be written into. The buffers is set to the
1406 * access or modifed access state. If the buffer is in the cache and modified
1407 * the state is access modified else the state is access. This buffer contents
1408 * are not initialised if the buffer is not already in the cache. If the block
1409 * is already resident in memory it is returned how-ever if not in memory the
1410 * buffer is not read from disk. This call is used when writing the whole block
1411 * on a disk rather than just changing a part of it. If there is no buffers
1412 * available this call will block. A buffer obtained with this call will not be
1413 * involved in a transfer request and will not be returned to another user
1414 * until released. If the buffer is already with a user when this call is made
1415 * the call is blocked until the buffer is returned. The highest priority
1416 * waiter will obtain the buffer first.
1417 *
1418 * The block number is the linear block number. This is relative to the start
1419 * of the partition on the media.
1420 *
1421 * @param device Device number (constructed of major and minor device number)
1422 * @param block  Linear media block number
1423 * @param bd     Reference to the buffer descriptor pointer.
1424 *
1425 * @return       RTEMS status code (RTEMS_SUCCESSFUL if operation completed
1426 *               successfully or error code if error is occured)
1427 */
1428rtems_status_code
1429rtems_bdbuf_get (dev_t                device,
1430                 rtems_blkdev_bnum    block,
1431                 rtems_bdbuf_buffer** bdp)
1432{
1433  rtems_disk_device*  dd;
1434  rtems_bdbuf_pool*   pool;
1435  rtems_bdbuf_buffer* bd;
1436
1437  /*
1438   * Do not hold the pool lock when obtaining the disk table.
1439   */
1440  dd = rtems_disk_obtain (device);
1441  if (dd == NULL)
1442    return RTEMS_INVALID_ID;
1443
1444  if (block >= dd->size)
1445  {
1446    rtems_disk_release (dd);
1447    return RTEMS_INVALID_NUMBER;
1448  }
1449
1450  block += dd->start;
1451
1452  pool = rtems_bdbuf_get_pool (dd->phys_dev->pool);
1453 
1454  rtems_bdbuf_lock_pool (pool);
1455
1456#if RTEMS_BDBUF_TRACE
1457  rtems_bdbuf_printf ("get: %d (dev = %08x)\n", block, device);
1458#endif
1459
1460  bd = rtems_bdbuf_get_buffer (dd->phys_dev, pool, block, FALSE);
1461
1462  if (bd->state == RTEMS_BDBUF_STATE_MODIFIED)
1463    bd->state = RTEMS_BDBUF_STATE_ACCESS_MODIFIED;
1464  else
1465    bd->state = RTEMS_BDBUF_STATE_ACCESS;
1466 
1467  rtems_bdbuf_unlock_pool (pool);
1468
1469  rtems_disk_release(dd);
1470
1471  *bdp = bd;
1472 
1473  return RTEMS_SUCCESSFUL;
1474}
1475
1476/**
1477 * Call back handler called by the low level driver when the transfer has
1478 * completed. This function may be invoked from interrupt handler.
1479 *
1480 * @param arg Arbitrary argument specified in block device request
1481 *            structure (in this case - pointer to the appropriate
1482 *            block device request structure).
1483 * @param status I/O completion status
1484 * @param error errno error code if status != RTEMS_SUCCESSFUL
1485 */
1486static void
1487rtems_bdbuf_read_done (void* arg, rtems_status_code status, int error)
1488{
1489  rtems_blkdev_request* req = (rtems_blkdev_request*) arg;
1490
1491  req->error = error;
1492  req->status = status;
1493
1494  rtems_event_send (req->io_task, RTEMS_BDBUF_TRANSFER_SYNC);
1495}
1496
1497/**
1498 * Get the block buffer and if not already in the cache read from the disk. If
1499 * specified block already cached return. The buffer is set to the access or
1500 * modifed access state. If the buffer is in the cache and modified the state
1501 * is access modified else the state is access. If block is already being read
1502 * from disk for being written to disk this call blocks. If the buffer is
1503 * waiting to be written it is removed from modified queue and returned to the
1504 * user. If the buffer is not in the cache a new buffer is obtained and the
1505 * data read from disk. The call may block until these operations complete. A
1506 * buffer obtained with this call will not be involved in a transfer request
1507 * and will not be returned to another user until released. If the buffer is
1508 * already with a user when this call is made the call is blocked until the
1509 * buffer is returned. The highest priority waiter will obtain the buffer
1510 * first.
1511 *
1512 * @note Read ahead always reads buffers in sequence. All multi-block reads
1513 *       read consecutive blocks.
1514 *
1515 * @param device Device number (constructed of major and minor device number)
1516 * @param block  Linear media block number
1517 * @param bd     Reference to the buffer descriptor pointer.
1518 *
1519 * @return       RTEMS status code (RTEMS_SUCCESSFUL if operation completed
1520 *               successfully or error code if error is occured)
1521 */
1522rtems_status_code
1523rtems_bdbuf_read (dev_t                device,
1524                  rtems_blkdev_bnum    block,
1525                  rtems_bdbuf_buffer** bdp)
1526{
1527  rtems_disk_device*    dd;
1528  rtems_bdbuf_pool*     pool;
1529  rtems_bdbuf_buffer*   bd = NULL;
1530  uint32_t              read_ahead_count;
1531  rtems_blkdev_request* req;
1532 
1533  /*
1534   * @todo This type of request structure is wrong and should be removed.
1535   */
1536#define bdbuf_alloc(size) __builtin_alloca (size)
1537
1538  req = bdbuf_alloc (sizeof (rtems_blkdev_request) +
1539                     (sizeof ( rtems_blkdev_sg_buffer) *
1540                      rtems_bdbuf_configuration.max_read_ahead_blocks));
1541
1542  /*
1543   * Do not hold the pool lock when obtaining the disk table.
1544   */
1545  dd = rtems_disk_obtain (device);
1546  if (dd == NULL)
1547    return RTEMS_INVALID_ID;
1548
1549  block += dd->start;
1550 
1551#if RTEMS_BDBUF_TRACE
1552  rtems_bdbuf_printf ("read: %d (dev = %08x)\n", block, device);
1553#endif
1554 
1555  if (block >= dd->size)
1556  {
1557    rtems_disk_release(dd);
1558    return RTEMS_INVALID_NUMBER;
1559  }
1560
1561  req->bufnum = 0;
1562
1563  /*
1564   * Read the block plus the required number of blocks ahead. The number of
1565   * blocks to read ahead is configured by the user and limited by the size of
1566   * the disk or reaching a read ahead block that is also cached.
1567   *
1568   * Limit the blocks read by the size of the disk.
1569   */
1570  if ((rtems_bdbuf_configuration.max_read_ahead_blocks + block) < dd->size)
1571    read_ahead_count = rtems_bdbuf_configuration.max_read_ahead_blocks;
1572  else
1573    read_ahead_count = dd->size - block;
1574
1575  pool = rtems_bdbuf_get_pool (dd->phys_dev->pool);
1576
1577  rtems_bdbuf_lock_pool (pool);
1578
1579  while (req->bufnum < read_ahead_count)
1580  {
1581    /*
1582     * Get the buffer for the requested block. If the block is cached then
1583     * return it. If it is not cached transfer the block from the disk media
1584     * into memory.
1585     *
1586     * We need to clean up any buffers allocated and not passed back to the
1587     * caller.
1588     */
1589    bd = rtems_bdbuf_get_buffer (dd->phys_dev, pool,
1590                                 block + req->bufnum,
1591                                 req->bufnum == 0 ? FALSE : TRUE);
1592
1593    /*
1594     * Read ahead buffer is in the cache or none available. Read what we
1595     * can.
1596     */
1597    if (!bd)
1598      break;
1599
1600    /*
1601     * Is the block we are interested in the cache ?
1602     */
1603    if ((bd->state == RTEMS_BDBUF_STATE_CACHED) ||
1604        (bd->state == RTEMS_BDBUF_STATE_MODIFIED))
1605      break;
1606
1607    bd->state = RTEMS_BDBUF_STATE_TRANSFER;
1608    bd->error = 0;
1609
1610    /*
1611     * @todo The use of these req blocks is not a great design. The req is a
1612     *       struct with a single 'bufs' declared in the req struct and the
1613     *       others are added in the outer level struct. This relies on the
1614     *       structs joining as a single array and that assumes the compiler
1615     *       packs the structs. Why not just place on a list ? The BD has a
1616     *       node that can be used.
1617     */
1618    req->bufs[req->bufnum].user   = bd;
1619    req->bufs[req->bufnum].block  = bd->block;
1620    req->bufs[req->bufnum].length = dd->block_size;
1621    req->bufs[req->bufnum].buffer = bd->buffer;
1622    req->bufnum++;
1623  }
1624
1625  /*
1626   * Transfer any requested buffers. If the request count is 0 we have found
1627   * the block in the cache so return it.
1628   */
1629  if (req->bufnum)
1630  {
1631    /*
1632     * Unlock the pool. We have the buffer for the block and it will be in the
1633     * access or transfer state. We may also have a number of read ahead blocks
1634     * if we need to transfer data. At this point any other threads can gain
1635     * access to the pool and if they are after any of the buffers we have they
1636     * will block and be woken when the buffer is returned to the pool.
1637     *
1638     * If a transfer is needed the I/O operation will occur with pre-emption
1639     * enabled and the pool unlocked. This is a change to the previous version
1640     * of the bdbuf code.
1641     */
1642    int      result;
1643    uint32_t b;
1644   
1645    rtems_bdbuf_unlock_pool (pool);
1646
1647    req->req = RTEMS_BLKDEV_REQ_READ;
1648    req->req_done = rtems_bdbuf_read_done;
1649    req->done_arg = req;
1650    req->io_task = rtems_task_self ();
1651    req->status = RTEMS_RESOURCE_IN_USE;
1652    req->error = 0;
1653 
1654    result = dd->ioctl (dd->phys_dev->dev, RTEMS_BLKIO_REQUEST, req);
1655
1656    /*
1657     * Inspection of the DOS FS code shows the result from this function is
1658     * handled and a buffer must be returned.
1659     */
1660    if (result < 0)
1661    {
1662      req->error = errno;
1663      req->status = RTEMS_IO_ERROR;
1664    }
1665    else
1666    {
1667      rtems_status_code sc;
1668      rtems_event_set   out;
1669      sc = rtems_event_receive (RTEMS_BDBUF_TRANSFER_SYNC,
1670                                RTEMS_EVENT_ALL | RTEMS_WAIT,
1671                                0, &out);
1672
1673      if (sc != RTEMS_SUCCESSFUL)
1674        rtems_fatal_error_occurred (BLKDEV_FATAL_BDBUF_SWAPOUT_RE);
1675    }
1676
1677    rtems_bdbuf_lock_pool (pool);
1678
1679    for (b = 1; b < req->bufnum; b++)
1680    {
1681      bd = req->bufs[b].user;
1682      bd->error = req->error;
1683      bd->state = RTEMS_BDBUF_STATE_READ_AHEAD;
1684      rtems_bdbuf_release (bd);
1685    }
1686
1687    bd = req->bufs[0].user;
1688  }
1689
1690  /*
1691   * The data for this block is cached in the buffer.
1692   */
1693  if (bd->state == RTEMS_BDBUF_STATE_MODIFIED)
1694    bd->state = RTEMS_BDBUF_STATE_ACCESS_MODIFIED;
1695  else
1696    bd->state = RTEMS_BDBUF_STATE_ACCESS;
1697
1698  rtems_bdbuf_unlock_pool (pool);
1699  rtems_disk_release (dd);
1700
1701  *bdp = bd;
1702
1703  return RTEMS_SUCCESSFUL;
1704}
1705
1706/**
1707 * Release the buffer obtained by a read call back to the cache. If the buffer
1708 * was obtained by a get call and was not already in the cache the release
1709 * modified call should be used. A buffer released with this call obtained by a
1710 * get call may not be in sync with the contents on disk. If the buffer was in
1711 * the cache and modified before this call it will be returned to the modified
1712 * queue. The buffers is returned to the end of the LRU list.
1713 *
1714 * @param bd Reference to the buffer descriptor.
1715 *
1716 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
1717 *         successfully or error code if error is occured)
1718 */
1719rtems_status_code
1720rtems_bdbuf_release (rtems_bdbuf_buffer* bd)
1721{
1722  rtems_bdbuf_pool* pool;
1723
1724  if (bd == NULL)
1725    return RTEMS_INVALID_ADDRESS;
1726
1727  pool = rtems_bdbuf_get_pool (bd->pool);
1728
1729  rtems_bdbuf_lock_pool (pool);
1730
1731#if RTEMS_BDBUF_TRACE
1732  rtems_bdbuf_printf ("release: %d\n", bd->block);
1733#endif
1734 
1735  if (bd->state == RTEMS_BDBUF_STATE_ACCESS_MODIFIED)
1736  {
1737    rtems_bdbuf_append_modified (pool, bd);
1738  }
1739  else
1740  {
1741    /*
1742     * If this is a read ahead buffer place the ready queue. Buffers are taken
1743     * from here first. If we prepend then get from the queue the buffers
1744     * furthermost from the read buffer will be used.
1745     */
1746    if (bd->state == RTEMS_BDBUF_STATE_READ_AHEAD)
1747      rtems_chain_prepend (&pool->ready, &bd->link);
1748    else
1749    {
1750      bd->state = RTEMS_BDBUF_STATE_CACHED;
1751      rtems_chain_append (&pool->lru, &bd->link);
1752    }
1753  }
1754 
1755  /*
1756   * If there are threads waiting to access the buffer wake them. Wake any
1757   * waiters if this is the first buffer to placed back onto the queue.
1758   */
1759  if (bd->waiters)
1760    rtems_bdbuf_wake (pool->access, &pool->access_waiters);
1761  else
1762  {
1763    if (bd->state == RTEMS_BDBUF_STATE_READ_AHEAD)
1764    {
1765      if (rtems_chain_has_only_one_node (&pool->ready))
1766        rtems_bdbuf_wake (pool->waiting, &pool->wait_waiters);
1767    }
1768    else
1769    {
1770      if (rtems_chain_has_only_one_node (&pool->lru))
1771        rtems_bdbuf_wake (pool->waiting, &pool->wait_waiters);
1772    }
1773  }
1774 
1775  rtems_bdbuf_unlock_pool (pool);
1776
1777  return RTEMS_SUCCESSFUL;
1778}
1779
1780/**
1781 * Release the buffer allocated with a get or read call placing it on the
1782 * modidied list.  If the buffer was not released modified before the hold
1783 * timer is set to the configuration value. If the buffer had been released
1784 * modified before but not written to disk the hold timer is not updated. The
1785 * buffer will be written to disk when the hold timer has expired, there are
1786 * not more buffers available in the cache and a get or read buffer needs one
1787 * or a sync call has been made. If the buffer is obtained with a get or read
1788 * before the hold timer has expired the buffer will be returned to the user.
1789 *
1790 * @param bd Reference to the buffer descriptor.
1791 *
1792 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
1793 *         successfully or error code if error is occured)
1794 */
1795rtems_status_code
1796rtems_bdbuf_release_modified (rtems_bdbuf_buffer* bd)
1797{
1798  rtems_bdbuf_pool* pool;
1799
1800  if (bd == NULL)
1801    return RTEMS_INVALID_ADDRESS;
1802
1803  pool = rtems_bdbuf_get_pool (bd->pool);
1804
1805  rtems_bdbuf_lock_pool (pool);
1806
1807#if RTEMS_BDBUF_TRACE
1808  rtems_bdbuf_printf ("release modified: %d\n", bd->block);
1809#endif
1810
1811  bd->hold_timer = rtems_bdbuf_configuration.swap_block_hold;
1812 
1813  rtems_bdbuf_append_modified (pool, bd);
1814
1815  if (bd->waiters)
1816    rtems_bdbuf_wake (pool->access, &pool->access_waiters);
1817 
1818  rtems_bdbuf_unlock_pool (pool);
1819
1820  return RTEMS_SUCCESSFUL;
1821}
1822
1823/**
1824 * Release the buffer as modified and wait until it has been synchronized with
1825 * the disk by writing it. This buffer will be the first to be transfer to disk
1826 * and other buffers may also be written if the maximum number of blocks in a
1827 * requests allows it.
1828 *
1829 * @note This code does not lock the sync mutex and stop additions to the
1830 *       modified queue.
1831 *
1832 * @param bd Reference to the buffer descriptor.
1833 *
1834 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
1835 *         successfully or error code if error is occured)
1836 */
1837rtems_status_code
1838rtems_bdbuf_sync (rtems_bdbuf_buffer* bd)
1839{
1840  rtems_bdbuf_pool* pool;
1841  boolean           available;
1842
1843#if RTEMS_BDBUF_TRACE
1844  rtems_bdbuf_printf ("sync: %d\n", bd->block);
1845#endif
1846 
1847  if (bd == NULL)
1848    return RTEMS_INVALID_ADDRESS;
1849
1850  pool = rtems_bdbuf_get_pool (bd->pool);
1851
1852  rtems_bdbuf_lock_pool (pool);
1853
1854  bd->state = RTEMS_BDBUF_STATE_SYNC;
1855
1856  rtems_chain_append (&pool->sync, &bd->link);
1857
1858  rtems_bdbuf_wake_swapper ();
1859
1860  available = FALSE;
1861  while (!available)
1862  {
1863    switch (bd->state)
1864    {
1865      case RTEMS_BDBUF_STATE_CACHED:
1866      case RTEMS_BDBUF_STATE_READ_AHEAD:
1867      case RTEMS_BDBUF_STATE_MODIFIED:
1868      case RTEMS_BDBUF_STATE_ACCESS:
1869      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
1870        available = TRUE;
1871        break;
1872
1873      case RTEMS_BDBUF_STATE_SYNC:
1874      case RTEMS_BDBUF_STATE_TRANSFER:
1875        bd->waiters++;
1876        rtems_bdbuf_wait (pool, &pool->transfer, &pool->transfer_waiters);
1877        bd->waiters--;
1878        break;
1879
1880      default:
1881        rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CONSISTENCY);
1882    }
1883  }
1884
1885  rtems_bdbuf_unlock_pool (pool);
1886 
1887  return RTEMS_SUCCESSFUL;
1888}
1889
1890/**
1891 * Synchronize all modified buffers for this device with the disk and wait
1892 * until the transfers have completed. The sync mutex for the pool is locked
1893 * stopping the addition of any further modifed buffers. It is only the
1894 * currently modified buffers that are written.
1895 *
1896 * @param dev Block device number
1897 *
1898 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
1899 *         successfully or error code if error is occured)
1900 */
1901rtems_status_code
1902rtems_bdbuf_syncdev (dev_t dev)
1903{
1904  rtems_disk_device*  dd;
1905  rtems_bdbuf_pool*   pool;
1906  rtems_status_code   sc;
1907  rtems_event_set     out;
1908
1909#if RTEMS_BDBUF_TRACE
1910  rtems_bdbuf_printf ("syncdev: %08x\n", dev);
1911#endif
1912
1913  /*
1914   * Do not hold the pool lock when obtaining the disk table.
1915   */
1916  dd = rtems_disk_obtain (dev);
1917  if (dd == NULL)
1918    return RTEMS_INVALID_ID;
1919
1920  pool = rtems_bdbuf_get_pool (dd->pool);
1921
1922  /*
1923   * Take the sync lock before locking the pool. Once we have the sync lock
1924   * we can lock the pool. If another thread has the sync lock it will cause
1925   * this thread to block until it owns the sync lock then it can own the
1926   * pool. The sync lock can only be obtained with the pool unlocked.
1927   */
1928 
1929  rtems_bdbuf_lock_sync (pool);
1930  rtems_bdbuf_lock_pool (pool); 
1931
1932  /*
1933   * Set the pool to have a sync active for a specific device and let the swap
1934   * out task know the id of the requester to wake when done.
1935   *
1936   * The swap out task will negate the sync active flag when no more buffers
1937   * for the device are held on the modified for sync queues.
1938   */
1939  pool->sync_active    = TRUE;
1940  pool->sync_requester = rtems_task_self ();
1941  pool->sync_device    = dev;
1942 
1943  rtems_bdbuf_wake_swapper ();
1944  rtems_bdbuf_unlock_pool (pool);
1945 
1946  sc = rtems_event_receive (RTEMS_BDBUF_TRANSFER_SYNC,
1947                            RTEMS_EVENT_ALL | RTEMS_WAIT,
1948                            0, &out);
1949
1950  if (sc != RTEMS_SUCCESSFUL)
1951    rtems_fatal_error_occurred (BLKDEV_FATAL_BDBUF_SWAPOUT_RE);
1952     
1953  rtems_bdbuf_unlock_sync (pool);
1954 
1955  return rtems_disk_release(dd);
1956}
1957
1958/**
1959 * Call back handler called by the low level driver when the transfer has
1960 * completed. This function may be invoked from interrupt handler.
1961 *
1962 * @param arg Arbitrary argument specified in block device request
1963 *            structure (in this case - pointer to the appropriate
1964 *            block device request structure).
1965 * @param status I/O completion status
1966 * @param error errno error code if status != RTEMS_SUCCESSFUL
1967 */
1968static void
1969rtems_bdbuf_write_done(void *arg, rtems_status_code status, int error)
1970{
1971  rtems_blkdev_request* req = (rtems_blkdev_request*) arg;
1972
1973  req->error = error;
1974  req->status = status;
1975
1976  rtems_event_send (req->io_task, RTEMS_BDBUF_TRANSFER_SYNC);
1977}
1978
1979/**
1980 * Process the modified list of buffers. There us a sync or modified list that
1981 * needs to be handled.
1982 *
1983 * @param pid The pool id to process modified buffers on.
1984 * @param dev The device to handle. If -1 no device is selected so select the
1985 *            device of the first buffer to be written to disk.
1986 * @param chain The modified chain to process.
1987 * @param transfer The chain to append buffers to be written too.
1988 * @param sync_active If TRUE this is a sync operation so expire all timers.
1989 * @param update_timers If TRUE update the timers.
1990 * @param timer_delta It update_timers is TRUE update the timers by this
1991 *                    amount.
1992 */
1993static void
1994rtems_bdbuf_swapout_modified_processing (rtems_bdpool_id      pid,
1995                                         dev_t*               dev,
1996                                         rtems_chain_control* chain,
1997                                         rtems_chain_control* transfer,
1998                                         boolean              sync_active,
1999                                         boolean              update_timers,
2000                                         uint32_t             timer_delta)
2001{
2002  if (!rtems_chain_is_empty (chain))
2003  {
2004    rtems_chain_node* node = rtems_chain_head (chain);
2005    node = node->next;
2006
2007    while (!rtems_chain_is_tail (chain, node))
2008    {
2009      rtems_bdbuf_buffer* bd = (rtems_bdbuf_buffer*) node;
2010   
2011      if (bd->pool == pid)
2012      {
2013        /*
2014         * Check if the buffer's hold timer has reached 0. If a sync is active
2015         * force all the timers to 0.
2016         *
2017         * @note Lots of sync requests will skew this timer. It should be based
2018         *       on TOD to be accurate. Does it matter ?
2019         */
2020        if (sync_active)
2021          bd->hold_timer = 0;
2022 
2023        if (bd->hold_timer)
2024        {
2025          if (update_timers)
2026          {
2027            if (bd->hold_timer > timer_delta)
2028              bd->hold_timer -= timer_delta;
2029            else
2030              bd->hold_timer = 0;
2031          }
2032
2033          if (bd->hold_timer)
2034          {
2035            node = node->next;
2036            continue;
2037          }
2038        }
2039
2040        /*
2041         * This assumes we can set dev_t to -1 which is just an
2042         * assumption. Cannot use the transfer list being empty the sync dev
2043         * calls sets the dev to use.
2044         */
2045        if (*dev == (dev_t)-1)
2046          *dev = bd->dev;
2047
2048        if (bd->dev == *dev)
2049        {
2050          rtems_chain_node* next_node = node->next;
2051          rtems_chain_node* tnode = rtems_chain_tail (transfer);
2052   
2053          /*
2054           * The blocks on the transfer list are sorted in block order. This
2055           * means multi-block transfers for drivers that require consecutive
2056           * blocks perform better with sorted blocks and for real disks it may
2057           * help lower head movement.
2058           */
2059
2060          bd->state = RTEMS_BDBUF_STATE_TRANSFER;
2061
2062          rtems_chain_extract (node);
2063
2064          tnode = tnode->previous;
2065         
2066          while (node && !rtems_chain_is_head (transfer, tnode))
2067          {
2068            rtems_bdbuf_buffer* tbd = (rtems_bdbuf_buffer*) tnode;
2069
2070            if (bd->block > tbd->block)
2071            {
2072              rtems_chain_insert (tnode, node);
2073              node = NULL;
2074            }
2075            else
2076              tnode = tnode->previous;
2077          }
2078
2079          if (node)
2080            rtems_chain_prepend (transfer, node);
2081         
2082          node = next_node;
2083        }
2084        else
2085        {
2086          node = node->next;
2087        }
2088      }
2089    }
2090  }
2091}
2092
2093/**
2094 * Process a pool's modified buffers. Check the sync list first then the
2095 * modified list extracting the buffers suitable to be written to disk. We have
2096 * a device at a time. The task level loop will repeat this operation while
2097 * there are buffers to be written. If the transfer fails place the buffers
2098 * back on the modified list and try again later. The pool is unlocked while
2099 * the buffers are beign written to disk.
2100 *
2101 * @param pid The pool id to process modified buffers on.
2102 * @param timer_delta It update_timers is TRUE update the timers by this
2103 *                    amount.
2104 * @param update_timers If TRUE update the timers.
2105 * @param write_req The write request structure. There is only one.
2106 *
2107 * @retval TRUE Buffers where written to disk so scan again.
2108 * @retval FALSE No buffers where written to disk.
2109 */
2110static boolean
2111rtems_bdbuf_swapout_pool_processing (rtems_bdpool_id       pid,
2112                                     unsigned long         timer_delta,
2113                                     boolean               update_timers,
2114                                     rtems_blkdev_request* write_req)
2115{
2116  rtems_bdbuf_pool*   pool = rtems_bdbuf_get_pool (pid);
2117  rtems_chain_control transfer;
2118  dev_t               dev = -1;
2119  rtems_disk_device*  dd;
2120  boolean             transfered_buffers = TRUE;
2121
2122  rtems_chain_initialize_empty (&transfer);
2123   
2124  rtems_bdbuf_lock_pool (pool);
2125
2126  /*
2127   * When the sync is for a device limit the sync to that device. If the sync
2128   * is for a buffer handle the devices in the order on the sync list. This
2129   * means the dev is -1.
2130   */
2131  if (pool->sync_active)
2132    dev = pool->sync_device;
2133
2134  /*
2135   * If we have any buffers in the sync queue move then to the modified
2136   * list. The first sync buffer will select the device we use.
2137   */
2138  rtems_bdbuf_swapout_modified_processing (pid, &dev,
2139                                           &pool->sync, &transfer,
2140                                           TRUE, FALSE,
2141                                           timer_delta);
2142
2143  /*
2144   * Process the pool's modified list.
2145   */
2146  rtems_bdbuf_swapout_modified_processing (pid, &dev,
2147                                           &pool->modified, &transfer,
2148                                           pool->sync_active,
2149                                           update_timers,
2150                                           timer_delta);
2151
2152  /*
2153   * We have all the buffers that have been modified for this device so
2154   * the pool can be unlocked because the state is set to TRANSFER.
2155   */
2156
2157  rtems_bdbuf_unlock_pool (pool);
2158
2159  /*
2160   * If there are buffers to transfer to the media tranfer them.
2161   */
2162  if (rtems_chain_is_empty (&transfer))
2163    transfered_buffers = FALSE;
2164  else
2165  {
2166    /*
2167     * Obtain the disk device. Release the pool mutex to avoid a dead
2168     * lock.
2169     */
2170    dd = rtems_disk_obtain (dev);
2171    if (dd == NULL)
2172       transfered_buffers = FALSE;
2173    else
2174    {
2175      /*
2176       * The last block number used when the driver only supports
2177       * continuous blocks in a single request.
2178       */
2179      uint32_t last_block = 0;
2180     
2181      /*
2182       * Take as many buffers as configured and pass to the driver. Note, the
2183       * API to the drivers has the array of buffers and if a chain was passed
2184       * we could have just passed the list. If the driver API is updated it
2185       * should be possible to make this change with little effect in this
2186       * code. The array that is passed is broken in design and should be
2187       * removed. Merging to members of a struct into the first member is
2188       * trouble waiting to happen.
2189       */
2190
2191      write_req->status = RTEMS_RESOURCE_IN_USE;
2192      write_req->error = 0;
2193      write_req->bufnum = 0;
2194
2195      while (!rtems_chain_is_empty (&transfer))
2196      {
2197        rtems_bdbuf_buffer* bd =
2198          (rtems_bdbuf_buffer*) rtems_chain_get (&transfer);
2199
2200        boolean write = FALSE;
2201       
2202        /*
2203         * If the device only accepts sequential buffers and this is not the
2204         * first buffer (the first is always sequential, and the buffer is not
2205         * sequential then put the buffer back on the transfer chain and write
2206         * the committed buffers.
2207         */
2208       
2209        if ((dd->capabilities & RTEMS_BLKDEV_CAP_MULTISECTOR_CONT) &&
2210            write_req->bufnum &&
2211            (bd->block != (last_block + 1)))
2212        {
2213          rtems_chain_prepend (&transfer, &bd->link);
2214          write = TRUE;
2215        }
2216        else
2217        {
2218          write_req->bufs[write_req->bufnum].user   = bd;
2219          write_req->bufs[write_req->bufnum].block  = bd->block;
2220          write_req->bufs[write_req->bufnum].length = dd->block_size;
2221          write_req->bufs[write_req->bufnum].buffer = bd->buffer;
2222          write_req->bufnum++;
2223          last_block = bd->block;
2224        }
2225
2226        /*
2227         * Perform the transfer if there are no more buffers, or the transfer
2228         * size has reached the configured max. value.
2229         */
2230
2231        if (rtems_chain_is_empty (&transfer) ||
2232            (write_req->bufnum >= rtems_bdbuf_configuration.max_write_blocks))
2233          write = TRUE;
2234
2235        if (write)
2236        {
2237          int result;
2238          uint32_t b;
2239
2240          /*
2241           * Perform the transfer. No pool locks, no preemption, only the disk
2242           * device is being held.
2243           */
2244          result = dd->ioctl (dd->phys_dev->dev,
2245                              RTEMS_BLKIO_REQUEST, write_req);
2246
2247          if (result < 0)
2248          {
2249            rtems_bdbuf_lock_pool (pool);
2250             
2251            for (b = 0; b < write_req->bufnum; b++)
2252            {
2253              bd = write_req->bufs[b].user;
2254              bd->state  = RTEMS_BDBUF_STATE_MODIFIED;
2255              bd->error = errno;
2256
2257              /*
2258               * Place back on the pools modified queue and try again.
2259               *
2260               * @warning Not sure this is the best option but I do not know
2261               *          what else can be done.
2262               */
2263              rtems_chain_append (&pool->modified, &bd->link);
2264            }
2265          }
2266          else
2267          {
2268            rtems_status_code sc = 0;
2269            rtems_event_set   out;
2270
2271            sc = rtems_event_receive (RTEMS_BDBUF_TRANSFER_SYNC,
2272                                      RTEMS_EVENT_ALL | RTEMS_WAIT,
2273                                      0, &out);
2274
2275            if (sc != RTEMS_SUCCESSFUL)
2276              rtems_fatal_error_occurred (BLKDEV_FATAL_BDBUF_SWAPOUT_RE);
2277
2278            rtems_bdbuf_lock_pool (pool);
2279
2280            for (b = 0; b < write_req->bufnum; b++)
2281            {
2282              bd = write_req->bufs[b].user;
2283              bd->state = RTEMS_BDBUF_STATE_CACHED;
2284              bd->error = 0;
2285
2286              rtems_chain_append (&pool->lru, &bd->link);
2287             
2288              if (bd->waiters)
2289                rtems_bdbuf_wake (pool->transfer, &pool->transfer_waiters);
2290              else
2291              {
2292                if (rtems_chain_has_only_one_node (&pool->lru))
2293                  rtems_bdbuf_wake (pool->waiting, &pool->wait_waiters);
2294              }
2295            }
2296          }
2297
2298          rtems_bdbuf_unlock_pool (pool);
2299
2300          write_req->status = RTEMS_RESOURCE_IN_USE;
2301          write_req->error = 0;
2302          write_req->bufnum = 0;
2303        }
2304      }
2305         
2306      rtems_disk_release (dd);
2307    }
2308  }
2309
2310  if (pool->sync_active && !  transfered_buffers)
2311  {
2312    rtems_id sync_requester = pool->sync_requester;
2313    pool->sync_active = FALSE;
2314    pool->sync_requester = 0;
2315    if (sync_requester)
2316      rtems_event_send (sync_requester, RTEMS_BDBUF_TRANSFER_SYNC);
2317  }
2318 
2319  return  transfered_buffers;
2320}
2321
2322/**
2323 * Body of task which takes care on flushing modified buffers to the disk.
2324 *
2325 * @param arg The task argument which is the context.
2326 */
2327static rtems_task
2328rtems_bdbuf_swapout_task (rtems_task_argument arg)
2329{
2330  rtems_bdbuf_context*  context = (rtems_bdbuf_context*) arg;
2331  rtems_blkdev_request* write_req;
2332  uint32_t              period_in_ticks;
2333  const uint32_t        period_in_msecs = rtems_bdbuf_configuration.swapout_period;
2334  uint32_t              timer_delta;
2335  rtems_status_code     sc;
2336
2337  /*
2338   * @note chrisj The rtems_blkdev_request and the array at the end is a hack.
2339   * I am disappointment at finding code like this in RTEMS. The request should
2340   * have been a rtems_chain_control. Simple, fast and less storage as the node
2341   * is already part of the buffer structure.
2342   */
2343  write_req =
2344    malloc (sizeof (rtems_blkdev_request) +
2345            (rtems_bdbuf_configuration.max_write_blocks *
2346             sizeof (rtems_blkdev_sg_buffer)));
2347
2348  if (!write_req)
2349    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SO_NOMEM);
2350
2351  write_req->req = RTEMS_BLKDEV_REQ_WRITE;
2352  write_req->req_done = rtems_bdbuf_write_done;
2353  write_req->done_arg = write_req;
2354  write_req->io_task = rtems_task_self ();
2355
2356  period_in_ticks = TOD_MICROSECONDS_TO_TICKS (period_in_msecs * 1000);
2357
2358  /*
2359   * This is temporary. Needs to be changed to use the real time clock.
2360   */
2361  timer_delta = period_in_msecs;
2362
2363  while (context->swapout_enabled)
2364  {
2365    rtems_event_set out;
2366
2367    /*
2368     * Only update the timers once in the processing cycle.
2369     */
2370    boolean update_timers = TRUE;
2371   
2372    /*
2373     * If we write buffers to any disk perform a check again. We only write a
2374     * single device at a time and a pool may have more than one devices
2375     * buffers modified waiting to be written.
2376     */
2377    boolean transfered_buffers;
2378
2379    do
2380    {
2381      rtems_bdpool_id pid;
2382   
2383      transfered_buffers = FALSE;
2384
2385      /*
2386       * Loop over each pool extacting all the buffers we find for a specific
2387       * device. The device is the first one we find on a modified list of a
2388       * pool. Process the sync queue of buffers first.
2389       */
2390      for (pid = 0; pid < context->npools; pid++)
2391      {
2392        if (rtems_bdbuf_swapout_pool_processing (pid,
2393                                                 timer_delta,
2394                                                 update_timers,
2395                                                 write_req))
2396        {
2397          transfered_buffers = TRUE;
2398        }
2399      }
2400
2401      /*
2402       * Only update the timers once.
2403       */
2404      update_timers = FALSE;
2405    }
2406    while (transfered_buffers);
2407
2408    sc = rtems_event_receive (RTEMS_BDBUF_SWAPOUT_SYNC,
2409                              RTEMS_EVENT_ALL | RTEMS_WAIT,
2410                              period_in_ticks,
2411                              &out);
2412
2413    if ((sc != RTEMS_SUCCESSFUL) && (sc != RTEMS_TIMEOUT))
2414      rtems_fatal_error_occurred (BLKDEV_FATAL_BDBUF_SWAPOUT_RE);
2415  }
2416
2417  free (write_req);
2418
2419  rtems_task_delete (RTEMS_SELF);
2420}
2421
2422/**
2423 * Find first appropriate buffer pool. This primitive returns the index of
2424 * first buffer pool which block size is greater than or equal to specified
2425 * size.
2426 *
2427 * @param block_size Requested block size
2428 * @param pool The pool to use for the requested pool size.
2429 *
2430 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
2431 *         successfully or error code if error is occured)
2432 * @retval RTEMS_INVALID_SIZE The specified block size is invalid (not a power
2433 *         of 2)
2434 * @retval RTEMS_NOT_DEFINED The buffer pool for this or greater block size
2435 *         is not configured.
2436 */
2437rtems_status_code
2438rtems_bdbuf_find_pool (uint32_t block_size, rtems_bdpool_id *pool)
2439{
2440  rtems_bdbuf_pool* p;
2441  rtems_bdpool_id   i;
2442  rtems_bdpool_id   curid = -1;
2443  bool              found = FALSE;
2444  uint32_t          cursize = UINT_MAX;
2445  int               j;
2446
2447  for (j = block_size; (j != 0) && ((j & 1) == 0); j >>= 1);
2448  if (j != 1)
2449    return RTEMS_INVALID_SIZE;
2450
2451  for (i = 0; i < rtems_bdbuf_ctx.npools; i++)
2452  {
2453    p = rtems_bdbuf_get_pool (i);
2454    if ((p->blksize >= block_size) &&
2455        (p->blksize < cursize))
2456    {
2457      curid = i;
2458      cursize = p->blksize;
2459      found = TRUE;
2460    }
2461  }
2462
2463  if (found)
2464  {
2465    if (pool != NULL)
2466      *pool = curid;
2467    return RTEMS_SUCCESSFUL;
2468  }
2469  else
2470  {
2471    return RTEMS_NOT_DEFINED;
2472  }
2473}
2474
2475/**
2476 * Obtain characteristics of buffer pool with specified number.
2477 *
2478 * @param pool Buffer pool number
2479 * @param block_size Block size for which buffer pool is configured returned
2480 *                   there
2481 * @param blocks Number of buffers in buffer pool.
2482 *
2483 * RETURNS:
2484 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
2485 *         successfully or error code if error is occured)
2486 * @retval RTEMS_INVALID_SIZE The appropriate buffer pool is not configured.
2487 *
2488 * @note Buffer pools enumerated continuously starting from 0.
2489 */
2490rtems_status_code
2491rtems_bdbuf_get_pool_info (rtems_bdpool_id pool, int* block_size, int* blocks)
2492{
2493  if (pool >= rtems_bdbuf_ctx.npools)
2494    return RTEMS_INVALID_NUMBER;
2495
2496  if (block_size != NULL)
2497  {
2498    *block_size = rtems_bdbuf_ctx.pool[pool].blksize;
2499  }
2500
2501  if (blocks != NULL)
2502  {
2503    *blocks = rtems_bdbuf_ctx.pool[pool].nblks;
2504  }
2505
2506  return RTEMS_SUCCESSFUL;
2507}
Note: See TracBrowser for help on using the repository browser.