source: rtems/cpukit/libblock/src/bdbuf.c @ 9f527308

4.115
Last change on this file since 9f527308 was 9f527308, checked in by Sebastian Huber <sebastian.huber@…>, on Jun 12, 2012 at 7:46:09 AM

libblock: Add block device statistics

  • Property mode set to 100644
File size: 83.7 KB
Line 
1/**
2 * @file
3 *
4 * @ingroup rtems_bdbuf
5 *
6 * Block device buffer management.
7 */
8
9/*
10 * Disk I/O buffering
11 * Buffer management
12 *
13 * Copyright (C) 2001 OKTET Ltd., St.-Peterburg, Russia
14 * Author: Andrey G. Ivanov <Andrey.Ivanov@oktet.ru>
15 *         Victor V. Vengerov <vvv@oktet.ru>
16 *         Alexander Kukuta <kam@oktet.ru>
17 *
18 * Copyright (C) 2008,2009 Chris Johns <chrisj@rtems.org>
19 *    Rewritten to remove score mutex access. Fixes many performance
20 *    issues.
21 *
22 * Copyright (c) 2009-2012 embedded brains GmbH.
23 *
24 * @(#) bdbuf.c,v 1.14 2004/04/17 08:15:17 ralf Exp
25 */
26
27/**
28 * Set to 1 to enable debug tracing.
29 */
30#define RTEMS_BDBUF_TRACE 0
31
32#if HAVE_CONFIG_H
33#include "config.h"
34#endif
35#include <limits.h>
36#include <errno.h>
37#include <stdio.h>
38#include <string.h>
39#include <inttypes.h>
40
41#include <rtems.h>
42#include <rtems/error.h>
43#include <rtems/malloc.h>
44
45#include "rtems/bdbuf.h"
46
47#define BDBUF_INVALID_DEV NULL
48
49/*
50 * Simpler label for this file.
51 */
52#define bdbuf_config rtems_bdbuf_configuration
53
/**
 * A swapout transfer transaction data. This data is passed to a worker thread
 * to handle the write phase of the transfer.
 */
typedef struct rtems_bdbuf_swapout_transfer
{
  rtems_chain_control   bds;         /**< The transfer list of BDs. */
  rtems_disk_device    *dd;          /**< The device the transfer is for. */
  bool                  syncing;     /**< True if this transfer is part of a
                                      * sync operation. */
  rtems_blkdev_request* write_req;   /**< The write request array. */
  uint32_t              bufs_per_bd; /**< Number of buffers per bd. */
} rtems_bdbuf_swapout_transfer;
66
/**
 * Swapout worker thread context. These workers are available to take
 * processing from the main swapout thread and handle the I/O operation.
 */
typedef struct rtems_bdbuf_swapout_worker
{
  rtems_chain_node             link;     /**< The threads sit on a chain when
                                          * idle. */
  rtems_id                     id;       /**< The id of the task so we can wake
                                          * it. */
  bool                         enabled;  /**< The worker is enabled. */
  rtems_bdbuf_swapout_transfer transfer; /**< The transfer data for this
                                          * thread. */
} rtems_bdbuf_swapout_worker;
81
/**
 * Buffer waiters synchronization.
 */
typedef struct rtems_bdbuf_waiters {
  unsigned count; /**< Number of tasks currently waiting; saves a needless
                   * release/flush when no one waits. */
  rtems_id sema;  /**< Semaphore the waiters block on; woken via a flush. */
} rtems_bdbuf_waiters;
89
/**
 * The BD buffer cache.
 */
typedef struct rtems_bdbuf_cache
{
  rtems_id            swapout;           /**< Swapout task ID */
  bool                swapout_enabled;   /**< Swapout is only running if
                                          * enabled. Set to false to kill the
                                          * swap out task. It deletes itself. */
  rtems_chain_control swapout_workers;   /**< The work threads for the swapout
                                          * task. */

  rtems_bdbuf_buffer* bds;               /**< Pointer to table of buffer
                                          * descriptors. */
  void*               buffers;           /**< The buffer's memory. */
  size_t              buffer_min_count;  /**< Number of minimum size buffers
                                          * that fit the buffer memory. */
  size_t              max_bds_per_group; /**< The number of BDs of minimum
                                          * buffer size that fit in a group. */
  uint32_t            flags;             /**< Configuration flags. */

  rtems_id            lock;              /**< The cache lock. It locks all
                                          * cache data, BD and lists. */
  rtems_id            sync_lock;         /**< Sync calls block writes. */
  bool                sync_active;       /**< True if a sync is active. */
  rtems_id            sync_requester;    /**< The sync requester. */
  rtems_disk_device  *sync_device;       /**< The device to sync, or
                                          * BDBUF_INVALID_DEV if not a device
                                          * sync. */

  rtems_bdbuf_buffer* tree;              /**< Buffer descriptor lookup AVL tree
                                          * root. There is only one. */
  rtems_chain_control lru;               /**< Least recently used list */
  rtems_chain_control modified;          /**< Modified buffers list */
  rtems_chain_control sync;              /**< Buffers to sync list */

  rtems_bdbuf_waiters access_waiters;    /**< Wait for a buffer in
                                          * ACCESS_CACHED, ACCESS_MODIFIED or
                                          * ACCESS_EMPTY
                                          * state. */
  rtems_bdbuf_waiters transfer_waiters;  /**< Wait for a buffer in TRANSFER
                                          * state. */
  rtems_bdbuf_waiters buffer_waiters;    /**< Wait for a buffer and no one is
                                          * available. */

  size_t              group_count;       /**< The number of groups. */
  rtems_bdbuf_group*  groups;            /**< The groups. */
  rtems_id            read_ahead_task;   /**< Read-ahead task */
  rtems_chain_control read_ahead_chain;  /**< Read-ahead request chain */
  bool                read_ahead_enabled; /**< Read-ahead enabled */

  bool                initialised;       /**< Initialised state. */
} rtems_bdbuf_cache;
143
144/**
145 * Fatal errors
146 */
147#define RTEMS_BLKDEV_FATAL_ERROR(n) \
148  (((uint32_t)'B' << 24) | ((uint32_t)(n) & (uint32_t)0x00FFFFFF))
149
150#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_11      RTEMS_BLKDEV_FATAL_ERROR(1)
151#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_4       RTEMS_BLKDEV_FATAL_ERROR(2)
152#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_5       RTEMS_BLKDEV_FATAL_ERROR(3)
153#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_6       RTEMS_BLKDEV_FATAL_ERROR(4)
154#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_7       RTEMS_BLKDEV_FATAL_ERROR(5)
155#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_8       RTEMS_BLKDEV_FATAL_ERROR(6)
156#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_9       RTEMS_BLKDEV_FATAL_ERROR(7)
157#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_10      RTEMS_BLKDEV_FATAL_ERROR(8)
158#define RTEMS_BLKDEV_FATAL_BDBUF_TREE_RM       RTEMS_BLKDEV_FATAL_ERROR(9)
159#define RTEMS_BLKDEV_FATAL_BDBUF_SWAPOUT       RTEMS_BLKDEV_FATAL_ERROR(10)
160
161/*
162 * The lock/unlock fatal errors occur in case the bdbuf is not initialized with
163 * rtems_bdbuf_init().  General system corruption like stack overflow etc. may
164 * also trigger these fatal errors.
165 */
166#define RTEMS_BLKDEV_FATAL_BDBUF_SYNC_LOCK     RTEMS_BLKDEV_FATAL_ERROR(11)
167#define RTEMS_BLKDEV_FATAL_BDBUF_SYNC_UNLOCK   RTEMS_BLKDEV_FATAL_ERROR(12)
168#define RTEMS_BLKDEV_FATAL_BDBUF_CACHE_LOCK    RTEMS_BLKDEV_FATAL_ERROR(13)
169#define RTEMS_BLKDEV_FATAL_BDBUF_CACHE_UNLOCK  RTEMS_BLKDEV_FATAL_ERROR(14)
170
171#define RTEMS_BLKDEV_FATAL_BDBUF_PREEMPT_DIS   RTEMS_BLKDEV_FATAL_ERROR(15)
172#define RTEMS_BLKDEV_FATAL_BDBUF_CACHE_WAIT_2  RTEMS_BLKDEV_FATAL_ERROR(16)
173#define RTEMS_BLKDEV_FATAL_BDBUF_PREEMPT_RST   RTEMS_BLKDEV_FATAL_ERROR(17)
174#define RTEMS_BLKDEV_FATAL_BDBUF_CACHE_WAIT_TO RTEMS_BLKDEV_FATAL_ERROR(18)
175#define RTEMS_BLKDEV_FATAL_BDBUF_CACHE_WAKE    RTEMS_BLKDEV_FATAL_ERROR(19)
176#define RTEMS_BLKDEV_FATAL_BDBUF_SO_WAKE       RTEMS_BLKDEV_FATAL_ERROR(20)
177#define RTEMS_BLKDEV_FATAL_BDBUF_SO_NOMEM      RTEMS_BLKDEV_FATAL_ERROR(21)
178#define RTEMS_BLKDEV_FATAL_BDBUF_SO_WK_CREATE  RTEMS_BLKDEV_FATAL_ERROR(22)
179#define BLKDEV_FATAL_BDBUF_SWAPOUT_RE          RTEMS_BLKDEV_FATAL_ERROR(24)
180#define BLKDEV_FATAL_BDBUF_SWAPOUT_TS          RTEMS_BLKDEV_FATAL_ERROR(25)
181#define RTEMS_BLKDEV_FATAL_BDBUF_WAIT_EVNT     RTEMS_BLKDEV_FATAL_ERROR(26)
182#define RTEMS_BLKDEV_FATAL_BDBUF_RECYCLE       RTEMS_BLKDEV_FATAL_ERROR(27)
183#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_0       RTEMS_BLKDEV_FATAL_ERROR(28)
184#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_1       RTEMS_BLKDEV_FATAL_ERROR(29)
185#define RTEMS_BLKDEV_FATAL_BDBUF_STATE_2       RTEMS_BLKDEV_FATAL_ERROR(30)
186#define RTEMS_BLKDEV_FATAL_BDBUF_RA_WAKE_UP    RTEMS_BLKDEV_FATAL_ERROR(31)
187
188/**
189 * The events used in this code. These should be system events rather than
190 * application events.
191 */
192#define RTEMS_BDBUF_TRANSFER_SYNC  RTEMS_EVENT_1
193#define RTEMS_BDBUF_SWAPOUT_SYNC   RTEMS_EVENT_2
194#define RTEMS_BDBUF_READ_AHEAD_WAKE_UP RTEMS_EVENT_1
195
196/**
197 * Lock semaphore attributes. This is used for locking type mutexes.
198 *
199 * @warning Priority inheritance is on.
200 */
201#define RTEMS_BDBUF_CACHE_LOCK_ATTRIBS \
202  (RTEMS_PRIORITY | RTEMS_BINARY_SEMAPHORE | \
203   RTEMS_INHERIT_PRIORITY | RTEMS_NO_PRIORITY_CEILING | RTEMS_LOCAL)
204
205/**
206 * Waiter semaphore attributes.
207 *
208 * @warning Do not configure as inherit priority. If a driver is in the driver
209 *          initialisation table this locked semaphore will have the IDLE task
210 *          as the holder and a blocking task will raise the priority of the
211 *          IDLE task which can cause unsual side effects.
212 */
213#define RTEMS_BDBUF_CACHE_WAITER_ATTRIBS \
214  (RTEMS_PRIORITY | RTEMS_SIMPLE_BINARY_SEMAPHORE | \
215   RTEMS_NO_INHERIT_PRIORITY | RTEMS_NO_PRIORITY_CEILING | RTEMS_LOCAL)
216
217/**
218 * Waiter timeout. Set to non-zero to find some info on a waiter that is
219 * waiting too long.
220 */
221#define RTEMS_BDBUF_WAIT_TIMEOUT RTEMS_NO_TIMEOUT
222#if !defined (RTEMS_BDBUF_WAIT_TIMEOUT)
223#define RTEMS_BDBUF_WAIT_TIMEOUT \
224  (TOD_MICROSECONDS_TO_TICKS (20000000))
225#endif
226
227static rtems_task rtems_bdbuf_swapout_task(rtems_task_argument arg);
228
229static rtems_task rtems_bdbuf_read_ahead_task(rtems_task_argument arg);
230
231/**
232 * The Buffer Descriptor cache.
233 */
234static rtems_bdbuf_cache bdbuf_cache;
235
236#if RTEMS_BDBUF_TRACE
237/**
238 * If true output the trace message.
239 */
240bool rtems_bdbuf_tracer;
241
242/**
243 * Return the number of items on the list.
244 *
245 * @param list The chain control.
246 * @return uint32_t The number of items on the list.
247 */
248uint32_t
249rtems_bdbuf_list_count (rtems_chain_control* list)
250{
251  rtems_chain_node* node = rtems_chain_first (list);
252  uint32_t          count = 0;
253  while (!rtems_chain_is_tail (list, node))
254  {
255    count++;
256    node = rtems_chain_next (node);
257  }
258  return count;
259}
260
261/**
262 * Show the usage for the bdbuf cache.
263 */
264void
265rtems_bdbuf_show_usage (void)
266{
267  uint32_t group;
268  uint32_t total = 0;
269  uint32_t val;
270
271  for (group = 0; group < bdbuf_cache.group_count; group++)
272    total += bdbuf_cache.groups[group].users;
273  printf ("bdbuf:group users=%lu", total);
274  val = rtems_bdbuf_list_count (&bdbuf_cache.lru);
275  printf (", lru=%lu", val);
276  total = val;
277  val = rtems_bdbuf_list_count (&bdbuf_cache.modified);
278  printf (", mod=%lu", val);
279  total += val;
280  val = rtems_bdbuf_list_count (&bdbuf_cache.sync);
281  printf (", sync=%lu", val);
282  total += val;
283  printf (", total=%lu\n", total);
284}
285
/**
 * Show the users for a group of a bd.
 *
 * @param where A label to show the context of output.
 * @param bd The bd to show the users of.
 */
void
rtems_bdbuf_show_users (const char* where, rtems_bdbuf_buffer* bd)
{
  /*
   * Two-letter tag per buffer state, indexed by bd->state.  NOTE(review):
   * this table must stay in step with the rtems_bdbuf_buf_state enumeration
   * order -- verify whenever a state is added or reordered.
   */
  const char* states[] =
    { "FR", "EM", "CH", "AC", "AM", "AE", "AP", "MD", "SY", "TR", "TP" };

  printf ("bdbuf:users: %15s: [%" PRIu32 " (%s)] %td:%td = %" PRIu32 " %s\n",
          where,
          bd->block, states[bd->state],
          bd->group - bdbuf_cache.groups,
          bd - bdbuf_cache.bds,
          bd->group->users,
          bd->group->users > 8 ? "<<<<<<<" : "");
}
306#else
307#define rtems_bdbuf_tracer (0)
308#define rtems_bdbuf_show_usage() ((void) 0)
309#define rtems_bdbuf_show_users(_w, _b) ((void) 0)
310#endif
311
312/**
313 * The default maximum height of 32 allows for AVL trees having between
314 * 5,704,880 and 4,294,967,295 nodes, depending on order of insertion.  You may
315 * change this compile-time constant as you wish.
316 */
317#ifndef RTEMS_BDBUF_AVL_MAX_HEIGHT
318#define RTEMS_BDBUF_AVL_MAX_HEIGHT (32)
319#endif
320
321static void
322rtems_bdbuf_fatal (rtems_bdbuf_buf_state state, uint32_t error)
323{
324  rtems_fatal_error_occurred ((((uint32_t) state) << 16) | error);
325}
326
327/**
328 * Searches for the node with specified dd/block.
329 *
330 * @param root pointer to the root node of the AVL-Tree
331 * @param dd disk device search key
332 * @param block block search key
333 * @retval NULL node with the specified dd/block is not found
334 * @return pointer to the node with specified dd/block
335 */
336static rtems_bdbuf_buffer *
337rtems_bdbuf_avl_search (rtems_bdbuf_buffer** root,
338                        const rtems_disk_device *dd,
339                        rtems_blkdev_bnum    block)
340{
341  rtems_bdbuf_buffer* p = *root;
342
343  while ((p != NULL) && ((p->dd != dd) || (p->block != block)))
344  {
345    if (((uintptr_t) p->dd < (uintptr_t) dd)
346        || ((p->dd == dd) && (p->block < block)))
347    {
348      p = p->avl.right;
349    }
350    else
351    {
352      p = p->avl.left;
353    }
354  }
355
356  return p;
357}
358
/**
 * Inserts the specified node to the AVL tree.
 *
 * @param root pointer to the root node of the AVL tree
 * @param node Pointer to the node to add.
 * @retval 0 The node added successfully
 * @retval -1 An error occurred (a node with the same dd/block already exists)
 */
static int
rtems_bdbuf_avl_insert(rtems_bdbuf_buffer** root,
                       rtems_bdbuf_buffer*  node)
{
  const rtems_disk_device *dd = node->dd;
  rtems_blkdev_bnum block = node->block;

  rtems_bdbuf_buffer*  p = *root;
  rtems_bdbuf_buffer*  q;
  rtems_bdbuf_buffer*  p1;
  rtems_bdbuf_buffer*  p2;
  rtems_bdbuf_buffer*  buf_stack[RTEMS_BDBUF_AVL_MAX_HEIGHT];
  rtems_bdbuf_buffer** buf_prev = buf_stack;

  bool modified = false;

  /* Empty tree: the new node simply becomes the root. */
  if (p == NULL)
  {
    *root = node;
    node->avl.left = NULL;
    node->avl.right = NULL;
    node->avl.bal = 0;
    return 0;
  }

  /*
   * Descend to the insertion point.  The path is recorded in buf_stack and
   * the direction taken at each node is remembered in its avl.cache field
   * (+1 right, -1 left) so the rebalancing pass below can re-link subtrees.
   */
  while (p != NULL)
  {
    *buf_prev++ = p;

    if (((uintptr_t) p->dd < (uintptr_t) dd)
        || ((p->dd == dd) && (p->block < block)))
    {
      p->avl.cache = 1;
      q = p->avl.right;
      if (q == NULL)
      {
        q = node;
        p->avl.right = q = node;
        break;
      }
    }
    else if ((p->dd != dd) || (p->block != block))
    {
      p->avl.cache = -1;
      q = p->avl.left;
      if (q == NULL)
      {
        q = node;
        p->avl.left = q;
        break;
      }
    }
    else
    {
      /* Duplicate key: the dd/block pair is already in the tree. */
      return -1;
    }

    p = q;
  }

  q->avl.left = q->avl.right = NULL;
  q->avl.bal = 0;
  modified = true;
  buf_prev--;

  /*
   * Walk back up the recorded path restoring the AVL balance invariant with
   * single or double rotations until the height change stops propagating.
   */
  while (modified)
  {
    if (p->avl.cache == -1)
    {
      switch (p->avl.bal)
      {
        case 1:
          p->avl.bal = 0;
          modified = false;
          break;

        case 0:
          p->avl.bal = -1;
          break;

        case -1:
          p1 = p->avl.left;
          if (p1->avl.bal == -1) /* simple LL-turn */
          {
            p->avl.left = p1->avl.right;
            p1->avl.right = p;
            p->avl.bal = 0;
            p = p1;
          }
          else /* double LR-turn */
          {
            p2 = p1->avl.right;
            p1->avl.right = p2->avl.left;
            p2->avl.left = p1;
            p->avl.left = p2->avl.right;
            p2->avl.right = p;
            if (p2->avl.bal == -1) p->avl.bal = +1; else p->avl.bal = 0;
            if (p2->avl.bal == +1) p1->avl.bal = -1; else p1->avl.bal = 0;
            p = p2;
          }
          p->avl.bal = 0;
          modified = false;
          break;

        default:
          break;
      }
    }
    else
    {
      switch (p->avl.bal)
      {
        case -1:
          p->avl.bal = 0;
          modified = false;
          break;

        case 0:
          p->avl.bal = 1;
          break;

        case 1:
          p1 = p->avl.right;
          if (p1->avl.bal == 1) /* simple RR-turn */
          {
            p->avl.right = p1->avl.left;
            p1->avl.left = p;
            p->avl.bal = 0;
            p = p1;
          }
          else /* double RL-turn */
          {
            p2 = p1->avl.left;
            p1->avl.left = p2->avl.right;
            p2->avl.right = p1;
            p->avl.right = p2->avl.left;
            p2->avl.left = p;
            if (p2->avl.bal == +1) p->avl.bal = -1; else p->avl.bal = 0;
            if (p2->avl.bal == -1) p1->avl.bal = +1; else p1->avl.bal = 0;
            p = p2;
          }
          p->avl.bal = 0;
          modified = false;
          break;

        default:
          break;
      }
    }
    /* Re-link the (possibly rotated) subtree into its parent. */
    q = p;
    if (buf_prev > buf_stack)
    {
      p = *--buf_prev;

      if (p->avl.cache == -1)
      {
        p->avl.left = q;
      }
      else
      {
        p->avl.right = q;
      }
    }
    else
    {
      *root = p;
      break;
    }
  };

  return 0;
}
539
540
/**
 * Removes the node from the tree.
 *
 * @param root Pointer to pointer to the root node
 * @param node Pointer to the node to remove
 * @retval 0 Item removed
 * @retval -1 No such item found
 */
static int
rtems_bdbuf_avl_remove(rtems_bdbuf_buffer**      root,
                       const rtems_bdbuf_buffer* node)
{
  const rtems_disk_device *dd = node->dd;
  rtems_blkdev_bnum block = node->block;

  rtems_bdbuf_buffer*  p = *root;
  rtems_bdbuf_buffer*  q;
  rtems_bdbuf_buffer*  r;
  rtems_bdbuf_buffer*  s;
  rtems_bdbuf_buffer*  p1;
  rtems_bdbuf_buffer*  p2;
  rtems_bdbuf_buffer*  buf_stack[RTEMS_BDBUF_AVL_MAX_HEIGHT];
  rtems_bdbuf_buffer** buf_prev = buf_stack;

  bool modified = false;

  memset (buf_stack, 0, sizeof(buf_stack));

  /*
   * Search for the node.  The path is recorded in buf_stack and the
   * direction taken at each node is remembered in its avl.cache field
   * (+1 right, -1 left) for the rebalancing pass below.
   */
  while (p != NULL)
  {
    *buf_prev++ = p;

    if (((uintptr_t) p->dd < (uintptr_t) dd)
        || ((p->dd == dd) && (p->block < block)))
    {
      p->avl.cache = 1;
      p = p->avl.right;
    }
    else if ((p->dd != dd) || (p->block != block))
    {
      p->avl.cache = -1;
      p = p->avl.left;
    }
    else
    {
      /* node found */
      break;
    }
  }

  if (p == NULL)
  {
    /* there is no such node */
    return -1;
  }

  q = p;

  buf_prev--;
  if (buf_prev > buf_stack)
  {
    p = *(buf_prev - 1);
  }
  else
  {
    p = NULL;
  }

  /* at this moment q - is a node to delete, p is q's parent */
  if (q->avl.right == NULL)
  {
    /* No right child: splice in the left subtree directly. */
    r = q->avl.left;
    if (r != NULL)
    {
      r->avl.bal = 0;
    }
    q = r;
  }
  else
  {
    rtems_bdbuf_buffer **t;

    r = q->avl.right;

    if (r->avl.left == NULL)
    {
      /* The right child has no left subtree: it replaces q directly. */
      r->avl.left = q->avl.left;
      r->avl.bal = q->avl.bal;
      r->avl.cache = 1;
      *buf_prev++ = q = r;
    }
    else
    {
      /* Find q's in-order successor s (leftmost node of the right
       * subtree) and substitute it for q. */
      t = buf_prev++;
      s = r;

      while (s->avl.left != NULL)
      {
        *buf_prev++ = r = s;
        s = r->avl.left;
        r->avl.cache = -1;
      }

      s->avl.left = q->avl.left;
      r->avl.left = s->avl.right;
      s->avl.right = q->avl.right;
      s->avl.bal = q->avl.bal;
      s->avl.cache = 1;

      *t = q = s;
    }
  }

  /* Hook the replacement subtree into q's former parent (or the root). */
  if (p != NULL)
  {
    if (p->avl.cache == -1)
    {
      p->avl.left = q;
    }
    else
    {
      p->avl.right = q;
    }
  }
  else
  {
    *root = q;
  }

  modified = true;

  /*
   * Walk back up the recorded path restoring the AVL balance invariant with
   * rotations until the height change stops propagating.
   */
  while (modified)
  {
    if (buf_prev > buf_stack)
    {
      p = *--buf_prev;
    }
    else
    {
      break;
    }

    if (p->avl.cache == -1)
    {
      /* rebalance left branch */
      switch (p->avl.bal)
      {
        case -1:
          p->avl.bal = 0;
          break;
        case  0:
          p->avl.bal = 1;
          modified = false;
          break;

        case +1:
          p1 = p->avl.right;

          if (p1->avl.bal >= 0) /* simple RR-turn */
          {
            p->avl.right = p1->avl.left;
            p1->avl.left = p;

            if (p1->avl.bal == 0)
            {
              p1->avl.bal = -1;
              modified = false;
            }
            else
            {
              p->avl.bal = 0;
              p1->avl.bal = 0;
            }
            p = p1;
          }
          else /* double RL-turn */
          {
            p2 = p1->avl.left;

            p1->avl.left = p2->avl.right;
            p2->avl.right = p1;
            p->avl.right = p2->avl.left;
            p2->avl.left = p;

            if (p2->avl.bal == +1) p->avl.bal = -1; else p->avl.bal = 0;
            if (p2->avl.bal == -1) p1->avl.bal = 1; else p1->avl.bal = 0;

            p = p2;
            p2->avl.bal = 0;
          }
          break;

        default:
          break;
      }
    }
    else
    {
      /* rebalance right branch */
      switch (p->avl.bal)
      {
        case +1:
          p->avl.bal = 0;
          break;

        case  0:
          p->avl.bal = -1;
          modified = false;
          break;

        case -1:
          p1 = p->avl.left;

          if (p1->avl.bal <= 0) /* simple LL-turn */
          {
            p->avl.left = p1->avl.right;
            p1->avl.right = p;
            if (p1->avl.bal == 0)
            {
              p1->avl.bal = 1;
              modified = false;
            }
            else
            {
              p->avl.bal = 0;
              p1->avl.bal = 0;
            }
            p = p1;
          }
          else /* double LR-turn */
          {
            p2 = p1->avl.right;

            p1->avl.right = p2->avl.left;
            p2->avl.left = p1;
            p->avl.left = p2->avl.right;
            p2->avl.right = p;

            if (p2->avl.bal == -1) p->avl.bal = 1; else p->avl.bal = 0;
            if (p2->avl.bal == +1) p1->avl.bal = -1; else p1->avl.bal = 0;

            p = p2;
            p2->avl.bal = 0;
          }
          break;

        default:
          break;
      }
    }

    /* Re-link the (possibly rotated) subtree into its parent. */
    if (buf_prev > buf_stack)
    {
      q = *(buf_prev - 1);

      if (q->avl.cache == -1)
      {
        q->avl.left = p;
      }
      else
      {
        q->avl.right = p;
      }
    }
    else
    {
      *root = p;
      break;
    }

  }

  return 0;
}
815
/**
 * Set the state of a buffer descriptor.
 *
 * @param bd The buffer descriptor.
 * @param state The new state.
 */
static void
rtems_bdbuf_set_state (rtems_bdbuf_buffer *bd, rtems_bdbuf_buf_state state)
{
  bd->state = state;
}
821
822static rtems_blkdev_bnum
823rtems_bdbuf_media_block (const rtems_disk_device *dd, rtems_blkdev_bnum block)
824{
825  if (dd->block_to_media_block_shift >= 0)
826    return block << dd->block_to_media_block_shift;
827  else
828    /*
829     * Change the block number for the block size to the block number for the media
830     * block size. We have to use 64bit maths. There is no short cut here.
831     */
832    return (rtems_blkdev_bnum)
833      ((((uint64_t) block) * dd->block_size) / dd->media_block_size);
834}
835
836/**
837 * Lock the mutex. A single task can nest calls.
838 *
839 * @param lock The mutex to lock.
840 * @param fatal_error_code The error code if the call fails.
841 */
842static void
843rtems_bdbuf_lock (rtems_id lock, uint32_t fatal_error_code)
844{
845  rtems_status_code sc = rtems_semaphore_obtain (lock,
846                                                 RTEMS_WAIT,
847                                                 RTEMS_NO_TIMEOUT);
848  if (sc != RTEMS_SUCCESSFUL)
849    rtems_fatal_error_occurred (fatal_error_code);
850}
851
852/**
853 * Unlock the mutex.
854 *
855 * @param lock The mutex to unlock.
856 * @param fatal_error_code The error code if the call fails.
857 */
858static void
859rtems_bdbuf_unlock (rtems_id lock, uint32_t fatal_error_code)
860{
861  rtems_status_code sc = rtems_semaphore_release (lock);
862  if (sc != RTEMS_SUCCESSFUL)
863    rtems_fatal_error_occurred (fatal_error_code);
864}
865
/**
 * Lock the cache. A single task can nest calls.  A failure to obtain the
 * lock is fatal (e.g. the bdbuf was not initialised, see the fatal error
 * code comments above).
 */
static void
rtems_bdbuf_lock_cache (void)
{
  rtems_bdbuf_lock (bdbuf_cache.lock, RTEMS_BLKDEV_FATAL_BDBUF_CACHE_LOCK);
}
874
/**
 * Unlock the cache.  A failure to release the lock is fatal.
 */
static void
rtems_bdbuf_unlock_cache (void)
{
  rtems_bdbuf_unlock (bdbuf_cache.lock, RTEMS_BLKDEV_FATAL_BDBUF_CACHE_UNLOCK);
}
883
/**
 * Lock the cache's sync lock. A single task can nest calls.  Holding this
 * lock blocks writers while a sync is in progress.
 */
static void
rtems_bdbuf_lock_sync (void)
{
  rtems_bdbuf_lock (bdbuf_cache.sync_lock, RTEMS_BLKDEV_FATAL_BDBUF_SYNC_LOCK);
}
892
/**
 * Unlock the cache's sync lock. Any blocked writers are woken.
 */
static void
rtems_bdbuf_unlock_sync (void)
{
  rtems_bdbuf_unlock (bdbuf_cache.sync_lock,
                      RTEMS_BLKDEV_FATAL_BDBUF_SYNC_UNLOCK);
}
902
/**
 * Add a user to the group of this buffer descriptor.
 *
 * @param bd The buffer descriptor whose group gains a user.
 */
static void
rtems_bdbuf_group_obtain (rtems_bdbuf_buffer *bd)
{
  ++bd->group->users;
}
908
/**
 * Remove a user from the group of this buffer descriptor.  Must balance a
 * previous rtems_bdbuf_group_obtain().
 *
 * @param bd The buffer descriptor whose group loses a user.
 */
static void
rtems_bdbuf_group_release (rtems_bdbuf_buffer *bd)
{
  --bd->group->users;
}
914
915static rtems_mode
916rtems_bdbuf_disable_preemption (void)
917{
918  rtems_status_code sc = RTEMS_SUCCESSFUL;
919  rtems_mode prev_mode = 0;
920
921  sc = rtems_task_mode (RTEMS_NO_PREEMPT, RTEMS_PREEMPT_MASK, &prev_mode);
922  if (sc != RTEMS_SUCCESSFUL)
923    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_PREEMPT_DIS);
924
925  return prev_mode;
926}
927
928static void
929rtems_bdbuf_restore_preemption (rtems_mode prev_mode)
930{
931  rtems_status_code sc = RTEMS_SUCCESSFUL;
932
933  sc = rtems_task_mode (prev_mode, RTEMS_ALL_MODE_MASKS, &prev_mode);
934  if (sc != RTEMS_SUCCESSFUL)
935    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_PREEMPT_RST);
936}
937
/**
 * Wait until woken. Semaphores are used so a number of tasks can wait and can
 * be woken at once. Task events would require we maintain a list of tasks to
 * be woken and this would require storage and we do not know the number of
 * tasks that could be waiting.
 *
 * While we have the cache locked we can try and claim the semaphore and
 * therefore know when we release the lock to the cache we will block until the
 * semaphore is released. This may even happen before we get to block.
 *
 * A counter is used to save the release call when no one is waiting.
 *
 * The function assumes the cache is locked on entry and it will be locked on
 * exit.
 */
static void
rtems_bdbuf_anonymous_wait (rtems_bdbuf_waiters *waiters)
{
  rtems_status_code sc;
  rtems_mode        prev_mode;

  /*
   * Indicate we are waiting.
   */
  ++waiters->count;

  /*
   * Disable preemption then unlock the cache and block.  There is no POSIX
   * condition variable in the core API so this is a work around.
   *
   * The issue is a task could preempt after the cache is unlocked because it is
   * blocking or just hits that window, and before this task has blocked on the
   * semaphore. If the preempting task flushes the queue this task will not see
   * the flush and may block for ever or until another transaction flushes this
   * semaphore.
   */
  prev_mode = rtems_bdbuf_disable_preemption ();

  /*
   * Unlock the cache, wait, and lock the cache when we return.
   */
  rtems_bdbuf_unlock_cache ();

  sc = rtems_semaphore_obtain (waiters->sema, RTEMS_WAIT, RTEMS_BDBUF_WAIT_TIMEOUT);

  if (sc == RTEMS_TIMEOUT)
    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CACHE_WAIT_TO);

  /*
   * Waiters are woken by rtems_semaphore_flush() (see rtems_bdbuf_wake())
   * which completes the blocked obtain with RTEMS_UNSATISFIED, so that is
   * the only expected status here; anything else is fatal.
   */
  if (sc != RTEMS_UNSATISFIED)
    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CACHE_WAIT_2);

  rtems_bdbuf_lock_cache ();

  rtems_bdbuf_restore_preemption (prev_mode);

  --waiters->count;
}
995
/**
 * Wait for a state change of the buffer descriptor.  The BD's group user
 * count and waiter count are raised for the duration of the wait so the
 * buffer is not recycled while tasks still reference it (see
 * rtems_bdbuf_discard_buffer()).
 *
 * @param bd The buffer descriptor being waited on.
 * @param waiters The waiter group to block on.
 */
static void
rtems_bdbuf_wait (rtems_bdbuf_buffer *bd, rtems_bdbuf_waiters *waiters)
{
  rtems_bdbuf_group_obtain (bd);
  ++bd->waiters;
  rtems_bdbuf_anonymous_wait (waiters);
  --bd->waiters;
  rtems_bdbuf_group_release (bd);
}
1005
1006/**
1007 * Wake a blocked resource. The resource has a counter that lets us know if
1008 * there are any waiters.
1009 */
1010static void
1011rtems_bdbuf_wake (const rtems_bdbuf_waiters *waiters)
1012{
1013  rtems_status_code sc = RTEMS_SUCCESSFUL;
1014
1015  if (waiters->count > 0)
1016  {
1017    sc = rtems_semaphore_flush (waiters->sema);
1018    if (sc != RTEMS_SUCCESSFUL)
1019      rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_CACHE_WAKE);
1020  }
1021}
1022
/**
 * Send the swapout sync event to wake the swapout task.  Failure to send
 * the event is fatal.
 */
static void
rtems_bdbuf_wake_swapper (void)
{
  rtems_status_code sc = rtems_event_send (bdbuf_cache.swapout,
                                           RTEMS_BDBUF_SWAPOUT_SYNC);
  if (sc != RTEMS_SUCCESSFUL)
    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SO_WAKE);
}
1031
1032static bool
1033rtems_bdbuf_has_buffer_waiters (void)
1034{
1035  return bdbuf_cache.buffer_waiters.count;
1036}
1037
/**
 * Remove the buffer descriptor from the AVL tree.  The BD is expected to be
 * in the tree; a failed removal is fatal.
 */
static void
rtems_bdbuf_remove_from_tree (rtems_bdbuf_buffer *bd)
{
  if (rtems_bdbuf_avl_remove (&bdbuf_cache.tree, bd) != 0)
    rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_TREE_RM);
}
1044
/**
 * Remove the buffer descriptor from the AVL tree (when CACHED) and extract
 * it from whatever chain it is on.  Only FREE or CACHED buffers may be
 * recycled this way; any other state is a fatal error.
 */
static void
rtems_bdbuf_remove_from_tree_and_lru_list (rtems_bdbuf_buffer *bd)
{
  switch (bd->state)
  {
    case RTEMS_BDBUF_STATE_FREE:
      /* FREE buffers are not looked up via the tree, nothing to remove. */
      break;
    case RTEMS_BDBUF_STATE_CACHED:
      rtems_bdbuf_remove_from_tree (bd);
      break;
    default:
      rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_10);
  }

  rtems_chain_extract_unprotected (&bd->link);
}
1061
/**
 * Set the buffer state to FREE and prepend it to the LRU list so it is the
 * first candidate for reuse.
 */
static void
rtems_bdbuf_make_free_and_add_to_lru_list (rtems_bdbuf_buffer *bd)
{
  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_FREE);
  rtems_chain_prepend_unprotected (&bdbuf_cache.lru, &bd->link);
}
1068
/**
 * Set the buffer state to EMPTY.
 */
static void
rtems_bdbuf_make_empty (rtems_bdbuf_buffer *bd)
{
  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_EMPTY);
}
1074
/**
 * Set the buffer state to CACHED and append it to the LRU list, making it
 * the last candidate for reuse.
 */
static void
rtems_bdbuf_make_cached_and_add_to_lru_list (rtems_bdbuf_buffer *bd)
{
  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_CACHED);
  rtems_chain_append_unprotected (&bdbuf_cache.lru, &bd->link);
}
1081
/**
 * Discard the buffer's contents.  The buffer is marked EMPTY, but it is
 * only removed from the AVL tree and returned to the free LRU list once no
 * tasks wait on it -- waiters still hold a reference to the BD.
 */
static void
rtems_bdbuf_discard_buffer (rtems_bdbuf_buffer *bd)
{
  rtems_bdbuf_make_empty (bd);

  if (bd->waiters == 0)
  {
    rtems_bdbuf_remove_from_tree (bd);
    rtems_bdbuf_make_free_and_add_to_lru_list (bd);
  }
}
1093
/**
 * Release a buffer after access and queue it on the modified list.  If a
 * sync is active for this buffer's device, first wait for the sync to
 * complete so the modified buffer cannot bypass the active sync.
 */
static void
rtems_bdbuf_add_to_modified_list_after_access (rtems_bdbuf_buffer *bd)
{
  if (bdbuf_cache.sync_active && bdbuf_cache.sync_device == bd->dd)
  {
    rtems_bdbuf_unlock_cache ();

    /*
     * Wait for the sync lock.  Obtaining and immediately releasing it
     * serialises us with the active sync; we do not need to hold it.
     */
    rtems_bdbuf_lock_sync ();

    rtems_bdbuf_unlock_sync ();
    rtems_bdbuf_lock_cache ();
  }

  /*
   * Only the first modified release sets the timer and any further user
   * accesses do not change the timer value which should move down. This
   * assumes the user's hold of the buffer is much less than the time on the
   * modified list. Resetting the timer on each access which could result in a
   * buffer never getting to 0 and never being forced onto disk. This raises a
   * difficult question. Is a snapshot of a block that is changing better than
   * nothing being written? We have tended to think we should hold changes for
   * only a specific period of time even if still changing and get onto disk
   * and letting the file system try and recover this position if it can.
   */
  if (bd->state == RTEMS_BDBUF_STATE_ACCESS_CACHED
        || bd->state == RTEMS_BDBUF_STATE_ACCESS_EMPTY)
    bd->hold_timer = bdbuf_config.swap_block_hold;

  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_MODIFIED);
  rtems_chain_append_unprotected (&bdbuf_cache.modified, &bd->link);

  /* Wake tasks waiting for this BD first; otherwise kick the swapper if
   * tasks are waiting for free buffers. */
  if (bd->waiters)
    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);
  else if (rtems_bdbuf_has_buffer_waiters ())
    rtems_bdbuf_wake_swapper ();
}
1133
1134static void
1135rtems_bdbuf_add_to_lru_list_after_access (rtems_bdbuf_buffer *bd)
1136{
1137  rtems_bdbuf_group_release (bd);
1138  rtems_bdbuf_make_cached_and_add_to_lru_list (bd);
1139
1140  if (bd->waiters)
1141    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);
1142  else
1143    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1144}
1145
1146/**
1147 * Compute the number of BDs per group for a given buffer size.
1148 *
1149 * @param size The buffer size. It can be any size and we scale up.
1150 */
1151static size_t
1152rtems_bdbuf_bds_per_group (size_t size)
1153{
1154  size_t bufs_per_size;
1155  size_t bds_per_size;
1156
1157  if (size > bdbuf_config.buffer_max)
1158    return 0;
1159
1160  bufs_per_size = ((size - 1) / bdbuf_config.buffer_min) + 1;
1161
1162  for (bds_per_size = 1;
1163       bds_per_size < bufs_per_size;
1164       bds_per_size <<= 1)
1165    ;
1166
1167  return bdbuf_cache.max_bds_per_group / bds_per_size;
1168}
1169
1170static void
1171rtems_bdbuf_discard_buffer_after_access (rtems_bdbuf_buffer *bd)
1172{
1173  rtems_bdbuf_group_release (bd);
1174  rtems_bdbuf_discard_buffer (bd);
1175
1176  if (bd->waiters)
1177    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);
1178  else
1179    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1180}
1181
1182/**
1183 * Reallocate a group. The BDs currently allocated in the group are removed
1184 * from the ALV tree and any lists then the new BD's are prepended to the ready
1185 * list of the cache.
1186 *
1187 * @param group The group to reallocate.
1188 * @param new_bds_per_group The new count of BDs per group.
1189 * @return A buffer of this group.
1190 */
1191static rtems_bdbuf_buffer *
1192rtems_bdbuf_group_realloc (rtems_bdbuf_group* group, size_t new_bds_per_group)
1193{
1194  rtems_bdbuf_buffer* bd;
1195  size_t              b;
1196  size_t              bufs_per_bd;
1197
1198  if (rtems_bdbuf_tracer)
1199    printf ("bdbuf:realloc: %tu: %zd -> %zd\n",
1200            group - bdbuf_cache.groups, group->bds_per_group,
1201            new_bds_per_group);
1202
1203  bufs_per_bd = bdbuf_cache.max_bds_per_group / group->bds_per_group;
1204
1205  for (b = 0, bd = group->bdbuf;
1206       b < group->bds_per_group;
1207       b++, bd += bufs_per_bd)
1208    rtems_bdbuf_remove_from_tree_and_lru_list (bd);
1209
1210  group->bds_per_group = new_bds_per_group;
1211  bufs_per_bd = bdbuf_cache.max_bds_per_group / new_bds_per_group;
1212
1213  for (b = 1, bd = group->bdbuf + bufs_per_bd;
1214       b < group->bds_per_group;
1215       b++, bd += bufs_per_bd)
1216    rtems_bdbuf_make_free_and_add_to_lru_list (bd);
1217
1218  if (b > 1)
1219    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1220
1221  return group->bdbuf;
1222}
1223
/**
 * Initialise a recycled buffer for the given device and block, insert it
 * into the AVL tree and mark it EMPTY.  An insert failure (a duplicate
 * device/block entry) is fatal.
 */
static void
rtems_bdbuf_setup_empty_buffer (rtems_bdbuf_buffer *bd,
                                rtems_disk_device  *dd,
                                rtems_blkdev_bnum   block)
{
  bd->dd        = dd ;
  bd->block     = block;
  bd->avl.left  = NULL;
  bd->avl.right = NULL;
  bd->waiters   = 0;

  if (rtems_bdbuf_avl_insert (&bdbuf_cache.tree, bd) != 0)
    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_RECYCLE);

  rtems_bdbuf_make_empty (bd);
}
1240
/**
 * Get a buffer from the LRU list for the given device and block.  The first
 * BD without waiters whose group layout matches the device's BDs per group,
 * or whose group is unused and can be reallocated, is recycled.
 *
 * @retval NULL No suitable buffer is currently available on the LRU list.
 */
static rtems_bdbuf_buffer *
rtems_bdbuf_get_buffer_from_lru_list (rtems_disk_device *dd,
                                      rtems_blkdev_bnum  block)
{
  rtems_chain_node *node = rtems_chain_first (&bdbuf_cache.lru);

  while (!rtems_chain_is_tail (&bdbuf_cache.lru, node))
  {
    rtems_bdbuf_buffer *bd = (rtems_bdbuf_buffer *) node;
    rtems_bdbuf_buffer *empty_bd = NULL;

    if (rtems_bdbuf_tracer)
      printf ("bdbuf:next-bd: %tu (%td:%" PRId32 ") %zd -> %zd\n",
              bd - bdbuf_cache.bds,
              bd->group - bdbuf_cache.groups, bd->group->users,
              bd->group->bds_per_group, dd->bds_per_group);

    /*
     * If nobody waits for this BD, we may recycle it.
     */
    if (bd->waiters == 0)
    {
      if (bd->group->bds_per_group == dd->bds_per_group)
      {
        rtems_bdbuf_remove_from_tree_and_lru_list (bd);

        empty_bd = bd;
      }
      else if (bd->group->users == 0)
        empty_bd = rtems_bdbuf_group_realloc (bd->group, dd->bds_per_group);
    }

    if (empty_bd != NULL)
    {
      rtems_bdbuf_setup_empty_buffer (empty_bd, dd, block);

      return empty_bd;
    }

    node = rtems_chain_next (node);
  }

  return NULL;
}
1285
1286static rtems_status_code
1287rtems_bdbuf_create_task(
1288  rtems_name name,
1289  rtems_task_priority priority,
1290  rtems_task_priority default_priority,
1291  rtems_task_entry entry,
1292  rtems_task_argument arg,
1293  rtems_id *id
1294)
1295{
1296  rtems_status_code sc;
1297  size_t stack_size = bdbuf_config.task_stack_size ?
1298    bdbuf_config.task_stack_size : RTEMS_BDBUF_TASK_STACK_SIZE_DEFAULT;
1299
1300  priority = priority != 0 ? priority : default_priority;
1301
1302  sc = rtems_task_create (name,
1303                          priority,
1304                          stack_size,
1305                          RTEMS_PREEMPT | RTEMS_NO_TIMESLICE | RTEMS_NO_ASR,
1306                          RTEMS_LOCAL | RTEMS_NO_FLOATING_POINT,
1307                          id);
1308
1309  if (sc == RTEMS_SUCCESSFUL)
1310    sc = rtems_task_start (*id, entry, arg);
1311
1312  return sc;
1313}
1314
1315/**
1316 * Initialise the cache.
1317 *
1318 * @return rtems_status_code The initialisation status.
1319 */
1320rtems_status_code
1321rtems_bdbuf_init (void)
1322{
1323  rtems_bdbuf_group*  group;
1324  rtems_bdbuf_buffer* bd;
1325  uint8_t*            buffer;
1326  size_t              b;
1327  size_t              cache_aligment;
1328  rtems_status_code   sc;
1329  rtems_mode          prev_mode;
1330
1331  if (rtems_bdbuf_tracer)
1332    printf ("bdbuf:init\n");
1333
1334  if (rtems_interrupt_is_in_progress())
1335    return RTEMS_CALLED_FROM_ISR;
1336
1337  /*
1338   * Check the configuration table values.
1339   */
1340  if ((bdbuf_config.buffer_max % bdbuf_config.buffer_min) != 0)
1341    return RTEMS_INVALID_NUMBER;
1342
1343  /*
1344   * We use a special variable to manage the initialisation incase we have
1345   * completing threads doing this. You may get errors if the another thread
1346   * makes a call and we have not finished initialisation.
1347   */
1348  prev_mode = rtems_bdbuf_disable_preemption ();
1349  if (bdbuf_cache.initialised)
1350  {
1351    rtems_bdbuf_restore_preemption (prev_mode);
1352    return RTEMS_RESOURCE_IN_USE;
1353  }
1354
1355  memset(&bdbuf_cache, 0, sizeof(bdbuf_cache));
1356  bdbuf_cache.initialised = true;
1357  rtems_bdbuf_restore_preemption (prev_mode);
1358
1359  /*
1360   * For unspecified cache alignments we use the CPU alignment.
1361   */
1362  cache_aligment = 32; /* FIXME rtems_cache_get_data_line_size() */
1363  if (cache_aligment <= 0)
1364    cache_aligment = CPU_ALIGNMENT;
1365
1366  bdbuf_cache.sync_device = BDBUF_INVALID_DEV;
1367
1368  rtems_chain_initialize_empty (&bdbuf_cache.swapout_workers);
1369  rtems_chain_initialize_empty (&bdbuf_cache.lru);
1370  rtems_chain_initialize_empty (&bdbuf_cache.modified);
1371  rtems_chain_initialize_empty (&bdbuf_cache.sync);
1372  rtems_chain_initialize_empty (&bdbuf_cache.read_ahead_chain);
1373
1374  /*
1375   * Create the locks for the cache.
1376   */
1377  sc = rtems_semaphore_create (rtems_build_name ('B', 'D', 'C', 'l'),
1378                               1, RTEMS_BDBUF_CACHE_LOCK_ATTRIBS, 0,
1379                               &bdbuf_cache.lock);
1380  if (sc != RTEMS_SUCCESSFUL)
1381    goto error;
1382
1383  rtems_bdbuf_lock_cache ();
1384
1385  sc = rtems_semaphore_create (rtems_build_name ('B', 'D', 'C', 's'),
1386                               1, RTEMS_BDBUF_CACHE_LOCK_ATTRIBS, 0,
1387                               &bdbuf_cache.sync_lock);
1388  if (sc != RTEMS_SUCCESSFUL)
1389    goto error;
1390
1391  sc = rtems_semaphore_create (rtems_build_name ('B', 'D', 'C', 'a'),
1392                               0, RTEMS_BDBUF_CACHE_WAITER_ATTRIBS, 0,
1393                               &bdbuf_cache.access_waiters.sema);
1394  if (sc != RTEMS_SUCCESSFUL)
1395    goto error;
1396
1397  sc = rtems_semaphore_create (rtems_build_name ('B', 'D', 'C', 't'),
1398                               0, RTEMS_BDBUF_CACHE_WAITER_ATTRIBS, 0,
1399                               &bdbuf_cache.transfer_waiters.sema);
1400  if (sc != RTEMS_SUCCESSFUL)
1401    goto error;
1402
1403  sc = rtems_semaphore_create (rtems_build_name ('B', 'D', 'C', 'b'),
1404                               0, RTEMS_BDBUF_CACHE_WAITER_ATTRIBS, 0,
1405                               &bdbuf_cache.buffer_waiters.sema);
1406  if (sc != RTEMS_SUCCESSFUL)
1407    goto error;
1408
1409  /*
1410   * Compute the various number of elements in the cache.
1411   */
1412  bdbuf_cache.buffer_min_count =
1413    bdbuf_config.size / bdbuf_config.buffer_min;
1414  bdbuf_cache.max_bds_per_group =
1415    bdbuf_config.buffer_max / bdbuf_config.buffer_min;
1416  bdbuf_cache.group_count =
1417    bdbuf_cache.buffer_min_count / bdbuf_cache.max_bds_per_group;
1418
1419  /*
1420   * Allocate the memory for the buffer descriptors.
1421   */
1422  bdbuf_cache.bds = calloc (sizeof (rtems_bdbuf_buffer),
1423                            bdbuf_cache.buffer_min_count);
1424  if (!bdbuf_cache.bds)
1425    goto error;
1426
1427  /*
1428   * Allocate the memory for the buffer descriptors.
1429   */
1430  bdbuf_cache.groups = calloc (sizeof (rtems_bdbuf_group),
1431                               bdbuf_cache.group_count);
1432  if (!bdbuf_cache.groups)
1433    goto error;
1434
1435  /*
1436   * Allocate memory for buffer memory. The buffer memory will be cache
1437   * aligned. It is possible to free the memory allocated by rtems_memalign()
1438   * with free(). Return 0 if allocated.
1439   *
1440   * The memory allocate allows a
1441   */
1442  if (rtems_memalign ((void **) &bdbuf_cache.buffers,
1443                      cache_aligment,
1444                      bdbuf_cache.buffer_min_count * bdbuf_config.buffer_min) != 0)
1445    goto error;
1446
1447  /*
1448   * The cache is empty after opening so we need to add all the buffers to it
1449   * and initialise the groups.
1450   */
1451  for (b = 0, group = bdbuf_cache.groups,
1452         bd = bdbuf_cache.bds, buffer = bdbuf_cache.buffers;
1453       b < bdbuf_cache.buffer_min_count;
1454       b++, bd++, buffer += bdbuf_config.buffer_min)
1455  {
1456    bd->dd    = BDBUF_INVALID_DEV;
1457    bd->group  = group;
1458    bd->buffer = buffer;
1459
1460    rtems_chain_append_unprotected (&bdbuf_cache.lru, &bd->link);
1461
1462    if ((b % bdbuf_cache.max_bds_per_group) ==
1463        (bdbuf_cache.max_bds_per_group - 1))
1464      group++;
1465  }
1466
1467  for (b = 0,
1468         group = bdbuf_cache.groups,
1469         bd = bdbuf_cache.bds;
1470       b < bdbuf_cache.group_count;
1471       b++,
1472         group++,
1473         bd += bdbuf_cache.max_bds_per_group)
1474  {
1475    group->bds_per_group = bdbuf_cache.max_bds_per_group;
1476    group->bdbuf = bd;
1477  }
1478
1479  /*
1480   * Create and start swapout task. This task will create and manage the worker
1481   * threads.
1482   */
1483  bdbuf_cache.swapout_enabled = true;
1484
1485  sc = rtems_bdbuf_create_task (rtems_build_name('B', 'S', 'W', 'P'),
1486                                bdbuf_config.swapout_priority,
1487                                RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT,
1488                                rtems_bdbuf_swapout_task,
1489                                0,
1490                                &bdbuf_cache.swapout);
1491  if (sc != RTEMS_SUCCESSFUL)
1492    goto error;
1493
1494  if (bdbuf_config.max_read_ahead_blocks > 0)
1495  {
1496    bdbuf_cache.read_ahead_enabled = true;
1497    sc = rtems_bdbuf_create_task (rtems_build_name('B', 'R', 'D', 'A'),
1498                                  bdbuf_config.read_ahead_priority,
1499                                  RTEMS_BDBUF_READ_AHEAD_TASK_PRIORITY_DEFAULT,
1500                                  rtems_bdbuf_read_ahead_task,
1501                                  0,
1502                                  &bdbuf_cache.read_ahead_task);
1503    if (sc != RTEMS_SUCCESSFUL)
1504      goto error;
1505  }
1506
1507  rtems_bdbuf_unlock_cache ();
1508
1509  return RTEMS_SUCCESSFUL;
1510
1511error:
1512
1513  if (bdbuf_cache.read_ahead_task != 0)
1514    rtems_task_delete (bdbuf_cache.read_ahead_task);
1515
1516  if (bdbuf_cache.swapout != 0)
1517    rtems_task_delete (bdbuf_cache.swapout);
1518
1519  free (bdbuf_cache.buffers);
1520  free (bdbuf_cache.groups);
1521  free (bdbuf_cache.bds);
1522
1523  rtems_semaphore_delete (bdbuf_cache.buffer_waiters.sema);
1524  rtems_semaphore_delete (bdbuf_cache.access_waiters.sema);
1525  rtems_semaphore_delete (bdbuf_cache.transfer_waiters.sema);
1526  rtems_semaphore_delete (bdbuf_cache.sync_lock);
1527
1528  if (bdbuf_cache.lock != 0)
1529  {
1530    rtems_bdbuf_unlock_cache ();
1531    rtems_semaphore_delete (bdbuf_cache.lock);
1532  }
1533
1534  bdbuf_cache.initialised = false;
1535
1536  return RTEMS_UNSATISFIED;
1537}
1538
1539static void
1540rtems_bdbuf_wait_for_event (rtems_event_set event)
1541{
1542  rtems_status_code sc = RTEMS_SUCCESSFUL;
1543  rtems_event_set   out = 0;
1544
1545  sc = rtems_event_receive (event,
1546                            RTEMS_EVENT_ALL | RTEMS_WAIT,
1547                            RTEMS_NO_TIMEOUT,
1548                            &out);
1549
1550  if (sc != RTEMS_SUCCESSFUL || out != event)
1551    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_WAIT_EVNT);
1552}
1553
/**
 * Wait until the buffer may be accessed by the caller.  Buffers busy with
 * another access or with a transfer are waited for.  MODIFIED and CACHED
 * buffers are extracted from their current list; the group of a MODIFIED
 * buffer is released as well.
 */
static void
rtems_bdbuf_wait_for_access (rtems_bdbuf_buffer *bd)
{
  while (true)
  {
    switch (bd->state)
    {
      case RTEMS_BDBUF_STATE_MODIFIED:
        rtems_bdbuf_group_release (bd);
        /* Fall through */
      case RTEMS_BDBUF_STATE_CACHED:
        rtems_chain_extract_unprotected (&bd->link);
        /* Fall through */
      case RTEMS_BDBUF_STATE_EMPTY:
        return;
      case RTEMS_BDBUF_STATE_ACCESS_CACHED:
      case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
      case RTEMS_BDBUF_STATE_ACCESS_PURGED:
        rtems_bdbuf_wait (bd, &bdbuf_cache.access_waiters);
        break;
      case RTEMS_BDBUF_STATE_SYNC:
      case RTEMS_BDBUF_STATE_TRANSFER:
      case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
        rtems_bdbuf_wait (bd, &bdbuf_cache.transfer_waiters);
        break;
      default:
        rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_7);
    }
  }
}
1585
/**
 * Move a modified buffer onto the sync list and wake the swapout task so
 * the buffer is written out promptly.
 */
static void
rtems_bdbuf_request_sync_for_modified_buffer (rtems_bdbuf_buffer *bd)
{
  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_SYNC);
  rtems_chain_extract_unprotected (&bd->link);
  rtems_chain_append_unprotected (&bdbuf_cache.sync, &bd->link);
  rtems_bdbuf_wake_swapper ();
}
1594
1595/**
1596 * @brief Waits until the buffer is ready for recycling.
1597 *
1598 * @retval @c true Buffer is valid and may be recycled.
1599 * @retval @c false Buffer is invalid and has to searched again.
1600 */
1601static bool
1602rtems_bdbuf_wait_for_recycle (rtems_bdbuf_buffer *bd)
1603{
1604  while (true)
1605  {
1606    switch (bd->state)
1607    {
1608      case RTEMS_BDBUF_STATE_FREE:
1609        return true;
1610      case RTEMS_BDBUF_STATE_MODIFIED:
1611        rtems_bdbuf_request_sync_for_modified_buffer (bd);
1612        break;
1613      case RTEMS_BDBUF_STATE_CACHED:
1614      case RTEMS_BDBUF_STATE_EMPTY:
1615        if (bd->waiters == 0)
1616          return true;
1617        else
1618        {
1619          /*
1620           * It is essential that we wait here without a special wait count and
1621           * without the group in use.  Otherwise we could trigger a wait ping
1622           * pong with another recycle waiter.  The state of the buffer is
1623           * arbitrary afterwards.
1624           */
1625          rtems_bdbuf_anonymous_wait (&bdbuf_cache.buffer_waiters);
1626          return false;
1627        }
1628      case RTEMS_BDBUF_STATE_ACCESS_CACHED:
1629      case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
1630      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
1631      case RTEMS_BDBUF_STATE_ACCESS_PURGED:
1632        rtems_bdbuf_wait (bd, &bdbuf_cache.access_waiters);
1633        break;
1634      case RTEMS_BDBUF_STATE_SYNC:
1635      case RTEMS_BDBUF_STATE_TRANSFER:
1636      case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
1637        rtems_bdbuf_wait (bd, &bdbuf_cache.transfer_waiters);
1638        break;
1639      default:
1640        rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_8);
1641    }
1642  }
1643}
1644
/**
 * Wait until any sync or transfer in progress for this buffer has finished.
 * Any other valid state means the buffer is available again.
 */
static void
rtems_bdbuf_wait_for_sync_done (rtems_bdbuf_buffer *bd)
{
  while (true)
  {
    switch (bd->state)
    {
      case RTEMS_BDBUF_STATE_CACHED:
      case RTEMS_BDBUF_STATE_EMPTY:
      case RTEMS_BDBUF_STATE_MODIFIED:
      case RTEMS_BDBUF_STATE_ACCESS_CACHED:
      case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
      case RTEMS_BDBUF_STATE_ACCESS_PURGED:
        return;
      case RTEMS_BDBUF_STATE_SYNC:
      case RTEMS_BDBUF_STATE_TRANSFER:
      case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
        rtems_bdbuf_wait (bd, &bdbuf_cache.transfer_waiters);
        break;
      default:
        rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_9);
    }
  }
}
1670
/**
 * Block until a free buffer may be available.  Kick the swapout task first
 * if there are modified buffers which could be written back to free some.
 */
static void
rtems_bdbuf_wait_for_buffer (void)
{
  if (!rtems_chain_is_empty (&bdbuf_cache.modified))
    rtems_bdbuf_wake_swapper ();

  rtems_bdbuf_anonymous_wait (&bdbuf_cache.buffer_waiters);
}
1679
/**
 * Synchronize the buffer after access: queue it on the sync list, wake the
 * swapout task and wait until the write-out has finished.  A resulting
 * cached or empty buffer without waiters is handed back to the free pool.
 */
static void
rtems_bdbuf_sync_after_access (rtems_bdbuf_buffer *bd)
{
  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_SYNC);

  rtems_chain_append_unprotected (&bdbuf_cache.sync, &bd->link);

  if (bd->waiters)
    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);

  rtems_bdbuf_wake_swapper ();
  rtems_bdbuf_wait_for_sync_done (bd);

  /*
   * We may have created a cached or empty buffer which may be recycled.
   */
  if (bd->waiters == 0
        && (bd->state == RTEMS_BDBUF_STATE_CACHED
          || bd->state == RTEMS_BDBUF_STATE_EMPTY))
  {
    if (bd->state == RTEMS_BDBUF_STATE_EMPTY)
    {
      rtems_bdbuf_remove_from_tree (bd);
      rtems_bdbuf_make_free_and_add_to_lru_list (bd);
    }
    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
  }
}
1708
1709static rtems_bdbuf_buffer *
1710rtems_bdbuf_get_buffer_for_read_ahead (rtems_disk_device *dd,
1711                                       rtems_blkdev_bnum  block)
1712{
1713  rtems_bdbuf_buffer *bd = NULL;
1714
1715  bd = rtems_bdbuf_avl_search (&bdbuf_cache.tree, dd, block);
1716
1717  if (bd == NULL)
1718  {
1719    bd = rtems_bdbuf_get_buffer_from_lru_list (dd, block);
1720
1721    if (bd != NULL)
1722      rtems_bdbuf_group_obtain (bd);
1723  }
1724  else
1725    /*
1726     * The buffer is in the cache.  So it is already available or in use, and
1727     * thus no need for a read ahead.
1728     */
1729    bd = NULL;
1730
1731  return bd;
1732}
1733
/**
 * Get the buffer for the device and media block.  Loops until the block is
 * found in the AVL tree or a buffer could be recycled from the LRU list,
 * waiting for a buffer to become available if necessary.  The returned BD is
 * ready for access and its group is obtained.
 */
static rtems_bdbuf_buffer *
rtems_bdbuf_get_buffer_for_access (rtems_disk_device *dd,
                                   rtems_blkdev_bnum  block)
{
  rtems_bdbuf_buffer *bd = NULL;

  do
  {
    bd = rtems_bdbuf_avl_search (&bdbuf_cache.tree, dd, block);

    if (bd != NULL)
    {
      if (bd->group->bds_per_group != dd->bds_per_group)
      {
        /* The buffer exists but its group layout does not match the
         * device's; recycle it and retry the search. */
        if (rtems_bdbuf_wait_for_recycle (bd))
        {
          rtems_bdbuf_remove_from_tree_and_lru_list (bd);
          rtems_bdbuf_make_free_and_add_to_lru_list (bd);
          rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
        }
        bd = NULL;
      }
    }
    else
    {
      bd = rtems_bdbuf_get_buffer_from_lru_list (dd, block);

      if (bd == NULL)
        rtems_bdbuf_wait_for_buffer ();
    }
  }
  while (bd == NULL);

  rtems_bdbuf_wait_for_access (bd);
  rtems_bdbuf_group_obtain (bd);

  return bd;
}
1772
1773static rtems_status_code
1774rtems_bdbuf_get_media_block (const rtems_disk_device *dd,
1775                             rtems_blkdev_bnum        block,
1776                             rtems_blkdev_bnum       *media_block_ptr)
1777{
1778  rtems_status_code sc = RTEMS_SUCCESSFUL;
1779
1780  if (block < dd->block_count)
1781  {
1782    /*
1783     * Compute the media block number. Drivers work with media block number not
1784     * the block number a BD may have as this depends on the block size set by
1785     * the user.
1786     */
1787    *media_block_ptr = rtems_bdbuf_media_block (dd, block) + dd->start;
1788  }
1789  else
1790  {
1791    sc = RTEMS_INVALID_ID;
1792  }
1793
1794  return sc;
1795}
1796
/**
 * Get a buffer for the given block of the device without reading it from the
 * media.  See the public documentation in the header for the full contract.
 */
rtems_status_code
rtems_bdbuf_get (rtems_disk_device   *dd,
                 rtems_blkdev_bnum    block,
                 rtems_bdbuf_buffer **bd_ptr)
{
  rtems_status_code   sc = RTEMS_SUCCESSFUL;
  rtems_bdbuf_buffer *bd = NULL;
  rtems_blkdev_bnum   media_block;

  rtems_bdbuf_lock_cache ();

  sc = rtems_bdbuf_get_media_block (dd, block, &media_block);
  if (sc == RTEMS_SUCCESSFUL)
  {
    /*
     * Print the block index relative to the physical disk.
     */
    if (rtems_bdbuf_tracer)
      printf ("bdbuf:get: %" PRIu32 " (%" PRIu32 ") (dev = %08x)\n",
              media_block, block, (unsigned) dd->dev);

    bd = rtems_bdbuf_get_buffer_for_access (dd, media_block);

    switch (bd->state)
    {
      case RTEMS_BDBUF_STATE_CACHED:
        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_CACHED);
        break;
      case RTEMS_BDBUF_STATE_EMPTY:
        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_EMPTY);
        break;
      case RTEMS_BDBUF_STATE_MODIFIED:
        /*
         * To get a modified buffer could be considered a bug in the caller
         * because you should not be getting an already modified buffer but
         * user may have modified a byte in a block then decided to seek the
         * start and write the whole block and the file system will have no
         * record of this so just gets the block to fill.
         */
        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_MODIFIED);
        break;
      default:
        rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_2);
        break;
    }

    if (rtems_bdbuf_tracer)
    {
      rtems_bdbuf_show_users ("get", bd);
      rtems_bdbuf_show_usage ();
    }
  }

  rtems_bdbuf_unlock_cache ();

  *bd_ptr = bd;

  return sc;
}
1856
1857/**
1858 * Call back handler called by the low level driver when the transfer has
1859 * completed. This function may be invoked from interrupt handler.
1860 *
1861 * @param arg Arbitrary argument specified in block device request
1862 *            structure (in this case - pointer to the appropriate
1863 *            block device request structure).
1864 * @param status I/O completion status
1865 */
1866static void
1867rtems_bdbuf_transfer_done (void* arg, rtems_status_code status)
1868{
1869  rtems_blkdev_request* req = (rtems_blkdev_request*) arg;
1870
1871  req->status = status;
1872
1873  rtems_event_send (req->io_task, RTEMS_BDBUF_TRANSFER_SYNC);
1874}
1875
/**
 * Execute a prepared transfer request and update the BDs and the device
 * statistics according to the outcome.  The cache lock is released around
 * the actual I/O.
 *
 * @param dd The disk device.
 * @param req The prepared read or write request.
 * @param cache_locked If true the cache is locked on entry and remains
 *                     locked on return; if false it is unlocked on return.
 * @retval RTEMS_SUCCESSFUL The transfer succeeded.
 */
static rtems_status_code
rtems_bdbuf_execute_transfer_request (rtems_disk_device    *dd,
                                      rtems_blkdev_request *req,
                                      bool                  cache_locked)
{
  rtems_status_code sc = RTEMS_SUCCESSFUL;
  int result = 0;
  uint32_t transfer_index = 0;
  bool wake_transfer_waiters = false;
  bool wake_buffer_waiters = false;

  if (cache_locked)
    rtems_bdbuf_unlock_cache ();

  result = dd->ioctl (dd->phys_dev, RTEMS_BLKIO_REQUEST, req);

  if (result == 0)
  {
    /* Wait for the driver to signal completion via the done callback. */
    rtems_bdbuf_wait_for_event (RTEMS_BDBUF_TRANSFER_SYNC);
    sc = req->status;
  }
  else
    sc = RTEMS_IO_ERROR;

  rtems_bdbuf_lock_cache ();

  /* Statistics */
  if (req->req == RTEMS_BLKDEV_REQ_READ)
  {
    dd->stats.read_blocks += req->bufnum;
    if (sc != RTEMS_SUCCESSFUL)
      ++dd->stats.read_errors;
  }
  else
  {
    dd->stats.write_blocks += req->bufnum;
    ++dd->stats.write_transfers;
    if (sc != RTEMS_SUCCESSFUL)
      ++dd->stats.write_errors;
  }

  for (transfer_index = 0; transfer_index < req->bufnum; ++transfer_index)
  {
    rtems_bdbuf_buffer *bd = req->bufs [transfer_index].user;
    bool waiters = bd->waiters;

    if (waiters)
      wake_transfer_waiters = true;
    else
      wake_buffer_waiters = true;

    rtems_bdbuf_group_release (bd);

    /* On success a transferred buffer becomes cached; otherwise discard. */
    if (sc == RTEMS_SUCCESSFUL && bd->state == RTEMS_BDBUF_STATE_TRANSFER)
      rtems_bdbuf_make_cached_and_add_to_lru_list (bd);
    else
      rtems_bdbuf_discard_buffer (bd);

    if (rtems_bdbuf_tracer)
      rtems_bdbuf_show_users ("transfer", bd);
  }

  if (wake_transfer_waiters)
    rtems_bdbuf_wake (&bdbuf_cache.transfer_waiters);

  if (wake_buffer_waiters)
    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);

  if (!cache_locked)
    rtems_bdbuf_unlock_cache ();

  if (sc == RTEMS_SUCCESSFUL || sc == RTEMS_UNSATISFIED)
    return sc;
  else
    return RTEMS_IO_ERROR;
}
1952
/**
 * Execute a read request for the given buffer plus up to transfer_count - 1
 * consecutive blocks obtained opportunistically for read ahead.
 *
 * @param dd The disk device.
 * @param bd The first buffer of the transfer, already set to the media block
 *           to read.
 * @param transfer_count Maximum number of blocks for this request.
 */
static rtems_status_code
rtems_bdbuf_execute_read_request (rtems_disk_device  *dd,
                                  rtems_bdbuf_buffer *bd,
                                  uint32_t            transfer_count)
{
  rtems_blkdev_request *req = NULL;
  rtems_blkdev_bnum media_block = bd->block;
  uint32_t media_blocks_per_block = dd->media_blocks_per_block;
  uint32_t block_size = dd->block_size;
  uint32_t transfer_index = 1;

  /*
   * TODO: This type of request structure is wrong and should be removed.
   */
#define bdbuf_alloc(size) __builtin_alloca (size)

  req = bdbuf_alloc (sizeof (rtems_blkdev_request) +
                     sizeof (rtems_blkdev_sg_buffer) * transfer_count);

  req->req = RTEMS_BLKDEV_REQ_READ;
  req->req_done = rtems_bdbuf_transfer_done;
  req->done_arg = req;
  req->io_task = rtems_task_self ();
  req->status = RTEMS_RESOURCE_IN_USE;
  req->bufnum = 0;

  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_TRANSFER);

  req->bufs [0].user   = bd;
  req->bufs [0].block  = media_block;
  req->bufs [0].length = block_size;
  req->bufs [0].buffer = bd->buffer;

  if (rtems_bdbuf_tracer)
    rtems_bdbuf_show_users ("read", bd);

  /* Append further consecutive blocks while read-ahead buffers are
   * available. */
  while (transfer_index < transfer_count)
  {
    media_block += media_blocks_per_block;

    bd = rtems_bdbuf_get_buffer_for_read_ahead (dd, media_block);

    if (bd == NULL)
      break;

    rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_TRANSFER);

    req->bufs [transfer_index].user   = bd;
    req->bufs [transfer_index].block  = media_block;
    req->bufs [transfer_index].length = block_size;
    req->bufs [transfer_index].buffer = bd->buffer;

    if (rtems_bdbuf_tracer)
      rtems_bdbuf_show_users ("read", bd);

    ++transfer_index;
  }

  req->bufnum = transfer_index;

  return rtems_bdbuf_execute_transfer_request (dd, req, true);
}
2015
2016static bool
2017rtems_bdbuf_is_read_ahead_active (const rtems_disk_device *dd)
2018{
2019  return !rtems_chain_is_node_off_chain (&dd->read_ahead.node);
2020}
2021
2022static void
2023rtems_bdbuf_read_ahead_cancel (rtems_disk_device *dd)
2024{
2025  if (rtems_bdbuf_is_read_ahead_active (dd))
2026  {
2027    rtems_chain_extract_unprotected (&dd->read_ahead.node);
2028    rtems_chain_set_off_chain (&dd->read_ahead.node);
2029  }
2030}
2031
/**
 * Cancel any pending read ahead for the disk and clear its trigger.
 */
static void
rtems_bdbuf_read_ahead_reset (rtems_disk_device *dd)
{
  rtems_bdbuf_read_ahead_cancel (dd);
  dd->read_ahead.trigger = RTEMS_DISK_READ_AHEAD_NO_TRIGGER;
}
2038
/**
 * If the read block hits the read-ahead trigger and no read ahead is active
 * for the disk, queue the disk on the read-ahead chain and wake the
 * read-ahead task.  A wake-up failure is fatal.
 */
static void
rtems_bdbuf_check_read_ahead_trigger (rtems_disk_device *dd,
                                      rtems_blkdev_bnum  block)
{
  if (dd->read_ahead.trigger == block
      && !rtems_bdbuf_is_read_ahead_active (dd))
  {
    rtems_status_code sc;
    rtems_chain_control *chain = &bdbuf_cache.read_ahead_chain;

    rtems_chain_append_unprotected (chain, &dd->read_ahead.node);
    sc = rtems_event_send (bdbuf_cache.read_ahead_task,
                           RTEMS_BDBUF_READ_AHEAD_WAKE_UP);
    if (sc != RTEMS_SUCCESSFUL)
      rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_RA_WAKE_UP);
  }
}
2056
/**
 * Arm the read-ahead trigger after a read of this block: the trigger fires
 * on the next sequential block, and the read ahead itself starts two blocks
 * on.  An already matching trigger is left untouched.
 */
static void
rtems_bdbuf_set_read_ahead_trigger (rtems_disk_device *dd,
                                    rtems_blkdev_bnum  block)
{
  if (dd->read_ahead.trigger != block)
  {
    rtems_bdbuf_read_ahead_cancel (dd);
    dd->read_ahead.trigger = block + 1;
    dd->read_ahead.next = block + 2;
  }
}
2068
/**
 * Get a buffer for the given block, reading it from the media if it is not
 * already in the cache.  Also maintains the sequential read-ahead trigger
 * and the device read statistics.
 */
rtems_status_code
rtems_bdbuf_read (rtems_disk_device   *dd,
                  rtems_blkdev_bnum    block,
                  rtems_bdbuf_buffer **bd_ptr)
{
  rtems_status_code     sc = RTEMS_SUCCESSFUL;
  rtems_bdbuf_buffer   *bd = NULL;
  rtems_blkdev_bnum     media_block;

  rtems_bdbuf_lock_cache ();

  sc = rtems_bdbuf_get_media_block (dd, block, &media_block);
  if (sc == RTEMS_SUCCESSFUL)
  {
    if (rtems_bdbuf_tracer)
      printf ("bdbuf:read: %" PRIu32 " (%" PRIu32 ") (dev = %08x)\n",
              media_block + dd->start, block, (unsigned) dd->dev);

    rtems_bdbuf_check_read_ahead_trigger (dd, block);
    bd = rtems_bdbuf_get_buffer_for_access (dd, media_block);
    switch (bd->state)
    {
      case RTEMS_BDBUF_STATE_CACHED:
        ++dd->stats.read_hits;
        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_CACHED);
        break;
      case RTEMS_BDBUF_STATE_MODIFIED:
        ++dd->stats.read_hits;
        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_MODIFIED);
        break;
      case RTEMS_BDBUF_STATE_EMPTY:
        /* Not in the cache: read it from the media and arm read ahead. */
        ++dd->stats.read_misses;
        rtems_bdbuf_set_read_ahead_trigger (dd, block);
        sc = rtems_bdbuf_execute_read_request (dd, bd, 1);
        if (sc == RTEMS_SUCCESSFUL)
        {
          rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_CACHED);
          rtems_chain_extract_unprotected (&bd->link);
          rtems_bdbuf_group_obtain (bd);
        }
        else
        {
          bd = NULL;
        }
        break;
      default:
        rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_4);
        break;
    }
  }

  rtems_bdbuf_unlock_cache ();

  *bd_ptr = bd;

  return sc;
}
2126
2127static rtems_status_code
2128rtems_bdbuf_check_bd_and_lock_cache (rtems_bdbuf_buffer *bd, const char *kind)
2129{
2130  if (bd == NULL)
2131    return RTEMS_INVALID_ADDRESS;
2132  if (rtems_bdbuf_tracer)
2133  {
2134    printf ("bdbuf:%s: %" PRIu32 "\n", kind, bd->block);
2135    rtems_bdbuf_show_users (kind, bd);
2136  }
2137  rtems_bdbuf_lock_cache();
2138
2139  return RTEMS_SUCCESSFUL;
2140}
2141
2142rtems_status_code
2143rtems_bdbuf_release (rtems_bdbuf_buffer *bd)
2144{
2145  rtems_status_code sc = RTEMS_SUCCESSFUL;
2146
2147  sc = rtems_bdbuf_check_bd_and_lock_cache (bd, "release");
2148  if (sc != RTEMS_SUCCESSFUL)
2149    return sc;
2150
2151  switch (bd->state)
2152  {
2153    case RTEMS_BDBUF_STATE_ACCESS_CACHED:
2154      rtems_bdbuf_add_to_lru_list_after_access (bd);
2155      break;
2156    case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
2157    case RTEMS_BDBUF_STATE_ACCESS_PURGED:
2158      rtems_bdbuf_discard_buffer_after_access (bd);
2159      break;
2160    case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
2161      rtems_bdbuf_add_to_modified_list_after_access (bd);
2162      break;
2163    default:
2164      rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_0);
2165      break;
2166  }
2167
2168  if (rtems_bdbuf_tracer)
2169    rtems_bdbuf_show_usage ();
2170
2171  rtems_bdbuf_unlock_cache ();
2172
2173  return RTEMS_SUCCESSFUL;
2174}
2175
2176rtems_status_code
2177rtems_bdbuf_release_modified (rtems_bdbuf_buffer *bd)
2178{
2179  rtems_status_code sc = RTEMS_SUCCESSFUL;
2180
2181  sc = rtems_bdbuf_check_bd_and_lock_cache (bd, "release modified");
2182  if (sc != RTEMS_SUCCESSFUL)
2183    return sc;
2184
2185  switch (bd->state)
2186  {
2187    case RTEMS_BDBUF_STATE_ACCESS_CACHED:
2188    case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
2189    case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
2190      rtems_bdbuf_add_to_modified_list_after_access (bd);
2191      break;
2192    case RTEMS_BDBUF_STATE_ACCESS_PURGED:
2193      rtems_bdbuf_discard_buffer_after_access (bd);
2194      break;
2195    default:
2196      rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_6);
2197      break;
2198  }
2199
2200  if (rtems_bdbuf_tracer)
2201    rtems_bdbuf_show_usage ();
2202
2203  rtems_bdbuf_unlock_cache ();
2204
2205  return RTEMS_SUCCESSFUL;
2206}
2207
2208rtems_status_code
2209rtems_bdbuf_sync (rtems_bdbuf_buffer *bd)
2210{
2211  rtems_status_code sc = RTEMS_SUCCESSFUL;
2212
2213  sc = rtems_bdbuf_check_bd_and_lock_cache (bd, "sync");
2214  if (sc != RTEMS_SUCCESSFUL)
2215    return sc;
2216
2217  switch (bd->state)
2218  {
2219    case RTEMS_BDBUF_STATE_ACCESS_CACHED:
2220    case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
2221    case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
2222      rtems_bdbuf_sync_after_access (bd);
2223      break;
2224    case RTEMS_BDBUF_STATE_ACCESS_PURGED:
2225      rtems_bdbuf_discard_buffer_after_access (bd);
2226      break;
2227    default:
2228      rtems_bdbuf_fatal (bd->state, RTEMS_BLKDEV_FATAL_BDBUF_STATE_5);
2229      break;
2230  }
2231
2232  if (rtems_bdbuf_tracer)
2233    rtems_bdbuf_show_usage ();
2234
2235  rtems_bdbuf_unlock_cache ();
2236
2237  return RTEMS_SUCCESSFUL;
2238}
2239
/*
 * Write all modified buffers of the given device to disk and wait until the
 * swapout task reports completion. Serialised against other device syncs by
 * the sync lock.
 */
rtems_status_code
rtems_bdbuf_syncdev (rtems_disk_device *dd)
{
  if (rtems_bdbuf_tracer)
    printf ("bdbuf:syncdev: %08x\n", (unsigned) dd->dev);

  /*
   * Take the sync lock before locking the cache. Once we have the sync lock we
   * can lock the cache. If another thread has the sync lock it will cause this
   * thread to block until it owns the sync lock then it can own the cache. The
   * sync lock can only be obtained with the cache unlocked.
   */
  rtems_bdbuf_lock_sync ();
  rtems_bdbuf_lock_cache ();

  /*
   * Set the cache to have a sync active for a specific device and let the swap
   * out task know the id of the requester to wake when done.
   *
   * The swap out task will negate the sync active flag when no more buffers
   * for the device are held on the "modified for sync" queues.
   */
  bdbuf_cache.sync_active    = true;
  bdbuf_cache.sync_requester = rtems_task_self ();
  bdbuf_cache.sync_device    = dd;

  /* Kick the swapout task, then sleep until it signals the transfer done. */
  rtems_bdbuf_wake_swapper ();
  rtems_bdbuf_unlock_cache ();
  rtems_bdbuf_wait_for_event (RTEMS_BDBUF_TRANSFER_SYNC);
  rtems_bdbuf_unlock_sync ();

  return RTEMS_SUCCESSFUL;
}
2273
2274/**
2275 * Swapout transfer to the driver. The driver will break this I/O into groups
2276 * of consecutive write requests is multiple consecutive buffers are required
2277 * by the driver. The cache is not locked.
2278 *
2279 * @param transfer The transfer transaction.
2280 */
2281static void
2282rtems_bdbuf_swapout_write (rtems_bdbuf_swapout_transfer* transfer)
2283{
2284  rtems_chain_node *node;
2285
2286  if (rtems_bdbuf_tracer)
2287    printf ("bdbuf:swapout transfer: %08x\n", (unsigned) transfer->dd->dev);
2288
2289  /*
2290   * If there are buffers to transfer to the media transfer them.
2291   */
2292  if (!rtems_chain_is_empty (&transfer->bds))
2293  {
2294    /*
2295     * The last block number used when the driver only supports
2296     * continuous blocks in a single request.
2297     */
2298    uint32_t last_block = 0;
2299
2300    /*
2301     * Number of buffers per bd. This is used to detect the next
2302     * block.
2303     */
2304    uint32_t bufs_per_bd = 0;
2305
2306    rtems_disk_device *dd = transfer->dd;
2307
2308    bufs_per_bd = dd->block_size / bdbuf_config.buffer_min;
2309
2310    /*
2311     * Take as many buffers as configured and pass to the driver. Note, the
2312     * API to the drivers has an array of buffers and if a chain was passed
2313     * we could have just passed the list. If the driver API is updated it
2314     * should be possible to make this change with little effect in this
2315     * code. The array that is passed is broken in design and should be
2316     * removed. Merging members of a struct into the first member is
2317     * trouble waiting to happen.
2318     */
2319    transfer->write_req->status = RTEMS_RESOURCE_IN_USE;
2320    transfer->write_req->bufnum = 0;
2321
2322    while ((node = rtems_chain_get_unprotected(&transfer->bds)) != NULL)
2323    {
2324      rtems_bdbuf_buffer* bd = (rtems_bdbuf_buffer*) node;
2325      bool                write = false;
2326
2327      /*
2328       * If the device only accepts sequential buffers and this is not the
2329       * first buffer (the first is always sequential, and the buffer is not
2330       * sequential then put the buffer back on the transfer chain and write
2331       * the committed buffers.
2332       */
2333
2334      if (rtems_bdbuf_tracer)
2335        printf ("bdbuf:swapout write: bd:%" PRIu32 ", bufnum:%" PRIu32 " mode:%s\n",
2336                bd->block, transfer->write_req->bufnum,
2337                dd->phys_dev->capabilities &
2338                RTEMS_BLKDEV_CAP_MULTISECTOR_CONT ? "MULIT" : "SCAT");
2339
2340      if ((dd->phys_dev->capabilities & RTEMS_BLKDEV_CAP_MULTISECTOR_CONT) &&
2341          transfer->write_req->bufnum &&
2342          (bd->block != (last_block + bufs_per_bd)))
2343      {
2344        rtems_chain_prepend_unprotected (&transfer->bds, &bd->link);
2345        write = true;
2346      }
2347      else
2348      {
2349        rtems_blkdev_sg_buffer* buf;
2350        buf = &transfer->write_req->bufs[transfer->write_req->bufnum];
2351        transfer->write_req->bufnum++;
2352        buf->user   = bd;
2353        buf->block  = bd->block;
2354        buf->length = dd->block_size;
2355        buf->buffer = bd->buffer;
2356        last_block  = bd->block;
2357      }
2358
2359      /*
2360       * Perform the transfer if there are no more buffers, or the transfer
2361       * size has reached the configured max. value.
2362       */
2363
2364      if (rtems_chain_is_empty (&transfer->bds) ||
2365          (transfer->write_req->bufnum >= bdbuf_config.max_write_blocks))
2366        write = true;
2367
2368      if (write)
2369      {
2370        rtems_bdbuf_execute_transfer_request (dd, transfer->write_req, false);
2371
2372        transfer->write_req->status = RTEMS_RESOURCE_IN_USE;
2373        transfer->write_req->bufnum = 0;
2374      }
2375    }
2376
2377    /*
2378     * If sync'ing and the deivce is capability of handling a sync IO control
2379     * call perform the call.
2380     */
2381    if (transfer->syncing &&
2382        (dd->phys_dev->capabilities & RTEMS_BLKDEV_CAP_SYNC))
2383    {
2384      /* int result = */ dd->ioctl (dd->phys_dev, RTEMS_BLKDEV_REQ_SYNC, NULL);
2385      /* How should the error be handled ? */
2386    }
2387  }
2388}
2389
2390/**
2391 * Process the modified list of buffers. There is a sync or modified list that
2392 * needs to be handled so we have a common function to do the work.
2393 *
2394 * @param dd_ptr Pointer to the device to handle. If BDBUF_INVALID_DEV no
2395 * device is selected so select the device of the first buffer to be written to
2396 * disk.
2397 * @param chain The modified chain to process.
2398 * @param transfer The chain to append buffers to be written too.
2399 * @param sync_active If true this is a sync operation so expire all timers.
2400 * @param update_timers If true update the timers.
2401 * @param timer_delta It update_timers is true update the timers by this
2402 *                    amount.
2403 */
2404static void
2405rtems_bdbuf_swapout_modified_processing (rtems_disk_device  **dd_ptr,
2406                                         rtems_chain_control* chain,
2407                                         rtems_chain_control* transfer,
2408                                         bool                 sync_active,
2409                                         bool                 update_timers,
2410                                         uint32_t             timer_delta)
2411{
2412  if (!rtems_chain_is_empty (chain))
2413  {
2414    rtems_chain_node* node = rtems_chain_head (chain);
2415    bool              sync_all;
2416   
2417    node = node->next;
2418
2419    /*
2420     * A sync active with no valid dev means sync all.
2421     */
2422    if (sync_active && (*dd_ptr == BDBUF_INVALID_DEV))
2423      sync_all = true;
2424    else
2425      sync_all = false;
2426   
2427    while (!rtems_chain_is_tail (chain, node))
2428    {
2429      rtems_bdbuf_buffer* bd = (rtems_bdbuf_buffer*) node;
2430
2431      /*
2432       * Check if the buffer's hold timer has reached 0. If a sync is active
2433       * or someone waits for a buffer written force all the timers to 0.
2434       *
2435       * @note Lots of sync requests will skew this timer. It should be based
2436       *       on TOD to be accurate. Does it matter ?
2437       */
2438      if (sync_all || (sync_active && (*dd_ptr == bd->dd))
2439          || rtems_bdbuf_has_buffer_waiters ())
2440        bd->hold_timer = 0;
2441
2442      if (bd->hold_timer)
2443      {
2444        if (update_timers)
2445        {
2446          if (bd->hold_timer > timer_delta)
2447            bd->hold_timer -= timer_delta;
2448          else
2449            bd->hold_timer = 0;
2450        }
2451
2452        if (bd->hold_timer)
2453        {
2454          node = node->next;
2455          continue;
2456        }
2457      }
2458
2459      /*
2460       * This assumes we can set it to BDBUF_INVALID_DEV which is just an
2461       * assumption. Cannot use the transfer list being empty the sync dev
2462       * calls sets the dev to use.
2463       */
2464      if (*dd_ptr == BDBUF_INVALID_DEV)
2465        *dd_ptr = bd->dd;
2466
2467      if (bd->dd == *dd_ptr)
2468      {
2469        rtems_chain_node* next_node = node->next;
2470        rtems_chain_node* tnode = rtems_chain_tail (transfer);
2471
2472        /*
2473         * The blocks on the transfer list are sorted in block order. This
2474         * means multi-block transfers for drivers that require consecutive
2475         * blocks perform better with sorted blocks and for real disks it may
2476         * help lower head movement.
2477         */
2478
2479        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_TRANSFER);
2480
2481        rtems_chain_extract_unprotected (node);
2482
2483        tnode = tnode->previous;
2484
2485        while (node && !rtems_chain_is_head (transfer, tnode))
2486        {
2487          rtems_bdbuf_buffer* tbd = (rtems_bdbuf_buffer*) tnode;
2488
2489          if (bd->block > tbd->block)
2490          {
2491            rtems_chain_insert_unprotected (tnode, node);
2492            node = NULL;
2493          }
2494          else
2495            tnode = tnode->previous;
2496        }
2497
2498        if (node)
2499          rtems_chain_prepend_unprotected (transfer, node);
2500
2501        node = next_node;
2502      }
2503      else
2504      {
2505        node = node->next;
2506      }
2507    }
2508  }
2509}
2510
2511/**
2512 * Process the cache's modified buffers. Check the sync list first then the
2513 * modified list extracting the buffers suitable to be written to disk. We have
2514 * a device at a time. The task level loop will repeat this operation while
2515 * there are buffers to be written. If the transfer fails place the buffers
2516 * back on the modified list and try again later. The cache is unlocked while
2517 * the buffers are being written to disk.
2518 *
2519 * @param timer_delta It update_timers is true update the timers by this
2520 *                    amount.
2521 * @param update_timers If true update the timers.
2522 * @param transfer The transfer transaction data.
2523 *
2524 * @retval true Buffers where written to disk so scan again.
2525 * @retval false No buffers where written to disk.
2526 */
2527static bool
2528rtems_bdbuf_swapout_processing (unsigned long                 timer_delta,
2529                                bool                          update_timers,
2530                                rtems_bdbuf_swapout_transfer* transfer)
2531{
2532  rtems_bdbuf_swapout_worker* worker;
2533  bool                        transfered_buffers = false;
2534
2535  rtems_bdbuf_lock_cache ();
2536
2537  /*
2538   * If a sync is active do not use a worker because the current code does not
2539   * cleaning up after. We need to know the buffers have been written when
2540   * syncing to release sync lock and currently worker threads do not return to
2541   * here. We do not know the worker is the last in a sequence of sync writes
2542   * until after we have it running so we do not know to tell it to release the
2543   * lock. The simplest solution is to get the main swap out task perform all
2544   * sync operations.
2545   */
2546  if (bdbuf_cache.sync_active)
2547    worker = NULL;
2548  else
2549  {
2550    worker = (rtems_bdbuf_swapout_worker*)
2551      rtems_chain_get_unprotected (&bdbuf_cache.swapout_workers);
2552    if (worker)
2553      transfer = &worker->transfer;
2554  }
2555
2556  rtems_chain_initialize_empty (&transfer->bds);
2557  transfer->dd = BDBUF_INVALID_DEV;
2558  transfer->syncing = bdbuf_cache.sync_active;
2559 
2560  /*
2561   * When the sync is for a device limit the sync to that device. If the sync
2562   * is for a buffer handle process the devices in the order on the sync
2563   * list. This means the dev is BDBUF_INVALID_DEV.
2564   */
2565  if (bdbuf_cache.sync_active)
2566    transfer->dd = bdbuf_cache.sync_device;
2567   
2568  /*
2569   * If we have any buffers in the sync queue move them to the modified
2570   * list. The first sync buffer will select the device we use.
2571   */
2572  rtems_bdbuf_swapout_modified_processing (&transfer->dd,
2573                                           &bdbuf_cache.sync,
2574                                           &transfer->bds,
2575                                           true, false,
2576                                           timer_delta);
2577
2578  /*
2579   * Process the cache's modified list.
2580   */
2581  rtems_bdbuf_swapout_modified_processing (&transfer->dd,
2582                                           &bdbuf_cache.modified,
2583                                           &transfer->bds,
2584                                           bdbuf_cache.sync_active,
2585                                           update_timers,
2586                                           timer_delta);
2587
2588  /*
2589   * We have all the buffers that have been modified for this device so the
2590   * cache can be unlocked because the state of each buffer has been set to
2591   * TRANSFER.
2592   */
2593  rtems_bdbuf_unlock_cache ();
2594
2595  /*
2596   * If there are buffers to transfer to the media transfer them.
2597   */
2598  if (!rtems_chain_is_empty (&transfer->bds))
2599  {
2600    if (worker)
2601    {
2602      rtems_status_code sc = rtems_event_send (worker->id,
2603                                               RTEMS_BDBUF_SWAPOUT_SYNC);
2604      if (sc != RTEMS_SUCCESSFUL)
2605        rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SO_WAKE);
2606    }
2607    else
2608    {
2609      rtems_bdbuf_swapout_write (transfer);
2610    }
2611
2612    transfered_buffers = true;
2613  }
2614
2615  if (bdbuf_cache.sync_active && !transfered_buffers)
2616  {
2617    rtems_id sync_requester;
2618    rtems_bdbuf_lock_cache ();
2619    sync_requester = bdbuf_cache.sync_requester;
2620    bdbuf_cache.sync_active = false;
2621    bdbuf_cache.sync_requester = 0;
2622    rtems_bdbuf_unlock_cache ();
2623    if (sync_requester)
2624      rtems_event_send (sync_requester, RTEMS_BDBUF_TRANSFER_SYNC);
2625  }
2626
2627  return transfered_buffers;
2628}
2629
2630/**
2631 * Allocate the write request and initialise it for good measure.
2632 *
2633 * @return rtems_blkdev_request* The write reference memory.
2634 */
2635static rtems_blkdev_request*
2636rtems_bdbuf_swapout_writereq_alloc (void)
2637{
2638  /*
2639   * @note chrisj The rtems_blkdev_request and the array at the end is a hack.
2640   * I am disappointment at finding code like this in RTEMS. The request should
2641   * have been a rtems_chain_control. Simple, fast and less storage as the node
2642   * is already part of the buffer structure.
2643   */
2644  rtems_blkdev_request* write_req =
2645    malloc (sizeof (rtems_blkdev_request) +
2646            (bdbuf_config.max_write_blocks * sizeof (rtems_blkdev_sg_buffer)));
2647
2648  if (!write_req)
2649    rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SO_NOMEM);
2650
2651  write_req->req = RTEMS_BLKDEV_REQ_WRITE;
2652  write_req->req_done = rtems_bdbuf_transfer_done;
2653  write_req->done_arg = write_req;
2654  write_req->io_task = rtems_task_self ();
2655
2656  return write_req;
2657}
2658
2659/**
2660 * The swapout worker thread body.
2661 *
2662 * @param arg A pointer to the worker thread's private data.
2663 * @return rtems_task Not used.
2664 */
2665static rtems_task
2666rtems_bdbuf_swapout_worker_task (rtems_task_argument arg)
2667{
2668  rtems_bdbuf_swapout_worker* worker = (rtems_bdbuf_swapout_worker*) arg;
2669
2670  while (worker->enabled)
2671  {
2672    rtems_bdbuf_wait_for_event (RTEMS_BDBUF_SWAPOUT_SYNC);
2673
2674    rtems_bdbuf_swapout_write (&worker->transfer);
2675
2676    rtems_bdbuf_lock_cache ();
2677
2678    rtems_chain_initialize_empty (&worker->transfer.bds);
2679    worker->transfer.dd = BDBUF_INVALID_DEV;
2680
2681    rtems_chain_append_unprotected (&bdbuf_cache.swapout_workers, &worker->link);
2682
2683    rtems_bdbuf_unlock_cache ();
2684  }
2685
2686  free (worker->transfer.write_req);
2687  free (worker);
2688
2689  rtems_task_delete (RTEMS_SELF);
2690}
2691
2692/**
2693 * Open the swapout worker threads.
2694 */
2695static void
2696rtems_bdbuf_swapout_workers_open (void)
2697{
2698  rtems_status_code sc;
2699  size_t            w;
2700
2701  rtems_bdbuf_lock_cache ();
2702
2703  for (w = 0; w < bdbuf_config.swapout_workers; w++)
2704  {
2705    rtems_bdbuf_swapout_worker* worker;
2706
2707    worker = malloc (sizeof (rtems_bdbuf_swapout_worker));
2708    if (!worker)
2709      rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SO_NOMEM);
2710
2711    rtems_chain_append_unprotected (&bdbuf_cache.swapout_workers, &worker->link);
2712    worker->enabled = true;
2713    worker->transfer.write_req = rtems_bdbuf_swapout_writereq_alloc ();
2714
2715    rtems_chain_initialize_empty (&worker->transfer.bds);
2716    worker->transfer.dd = BDBUF_INVALID_DEV;
2717
2718    sc = rtems_bdbuf_create_task (rtems_build_name('B', 'D', 'o', 'a' + w),
2719                                  bdbuf_config.swapout_worker_priority,
2720                                  RTEMS_BDBUF_SWAPOUT_WORKER_TASK_PRIORITY_DEFAULT,
2721                                  rtems_bdbuf_swapout_worker_task,
2722                                  (rtems_task_argument) worker,
2723                                  &worker->id);
2724    if (sc != RTEMS_SUCCESSFUL)
2725      rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_SO_WK_CREATE);
2726  }
2727
2728  rtems_bdbuf_unlock_cache ();
2729}
2730
2731/**
2732 * Close the swapout worker threads.
2733 */
2734static void
2735rtems_bdbuf_swapout_workers_close (void)
2736{
2737  rtems_chain_node* node;
2738
2739  rtems_bdbuf_lock_cache ();
2740
2741  node = rtems_chain_first (&bdbuf_cache.swapout_workers);
2742  while (!rtems_chain_is_tail (&bdbuf_cache.swapout_workers, node))
2743  {
2744    rtems_bdbuf_swapout_worker* worker = (rtems_bdbuf_swapout_worker*) node;
2745    worker->enabled = false;
2746    rtems_event_send (worker->id, RTEMS_BDBUF_SWAPOUT_SYNC);
2747    node = rtems_chain_next (node);
2748  }
2749
2750  rtems_bdbuf_unlock_cache ();
2751}
2752
2753/**
2754 * Body of task which takes care on flushing modified buffers to the disk.
2755 *
2756 * @param arg A pointer to the global cache data. Use the global variable and
2757 *            not this.
2758 * @return rtems_task Not used.
2759 */
2760static rtems_task
2761rtems_bdbuf_swapout_task (rtems_task_argument arg)
2762{
2763  rtems_bdbuf_swapout_transfer transfer;
2764  uint32_t                     period_in_ticks;
2765  const uint32_t               period_in_msecs = bdbuf_config.swapout_period;
2766  uint32_t                     timer_delta;
2767
2768  transfer.write_req = rtems_bdbuf_swapout_writereq_alloc ();
2769  rtems_chain_initialize_empty (&transfer.bds);
2770  transfer.dd = BDBUF_INVALID_DEV;
2771  transfer.syncing = false;
2772
2773  /*
2774   * Localise the period.
2775   */
2776  period_in_ticks = RTEMS_MICROSECONDS_TO_TICKS (period_in_msecs * 1000);
2777
2778  /*
2779   * This is temporary. Needs to be changed to use the real time clock.
2780   */
2781  timer_delta = period_in_msecs;
2782
2783  /*
2784   * Create the worker threads.
2785   */
2786  rtems_bdbuf_swapout_workers_open ();
2787
2788  while (bdbuf_cache.swapout_enabled)
2789  {
2790    rtems_event_set   out;
2791    rtems_status_code sc;
2792
2793    /*
2794     * Only update the timers once in the processing cycle.
2795     */
2796    bool update_timers = true;
2797
2798    /*
2799     * If we write buffers to any disk perform a check again. We only write a
2800     * single device at a time and the cache may have more than one device's
2801     * buffers modified waiting to be written.
2802     */
2803    bool transfered_buffers;
2804
2805    do
2806    {
2807      transfered_buffers = false;
2808
2809      /*
2810       * Extact all the buffers we find for a specific device. The device is
2811       * the first one we find on a modified list. Process the sync queue of
2812       * buffers first.
2813       */
2814      if (rtems_bdbuf_swapout_processing (timer_delta,
2815                                          update_timers,
2816                                          &transfer))
2817      {
2818        transfered_buffers = true;
2819      }
2820
2821      /*
2822       * Only update the timers once.
2823       */
2824      update_timers = false;
2825    }
2826    while (transfered_buffers);
2827
2828    sc = rtems_event_receive (RTEMS_BDBUF_SWAPOUT_SYNC,
2829                              RTEMS_EVENT_ALL | RTEMS_WAIT,
2830                              period_in_ticks,
2831                              &out);
2832
2833    if ((sc != RTEMS_SUCCESSFUL) && (sc != RTEMS_TIMEOUT))
2834      rtems_fatal_error_occurred (BLKDEV_FATAL_BDBUF_SWAPOUT_RE);
2835  }
2836
2837  rtems_bdbuf_swapout_workers_close ();
2838
2839  free (transfer.write_req);
2840
2841  rtems_task_delete (RTEMS_SELF);
2842}
2843
2844static void
2845rtems_bdbuf_purge_list (rtems_chain_control *purge_list)
2846{
2847  bool wake_buffer_waiters = false;
2848  rtems_chain_node *node = NULL;
2849
2850  while ((node = rtems_chain_get_unprotected (purge_list)) != NULL)
2851  {
2852    rtems_bdbuf_buffer *bd = (rtems_bdbuf_buffer *) node;
2853
2854    if (bd->waiters == 0)
2855      wake_buffer_waiters = true;
2856
2857    rtems_bdbuf_discard_buffer (bd);
2858  }
2859
2860  if (wake_buffer_waiters)
2861    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
2862}
2863
/*
 * Walk the whole AVL tree of cached buffers (pre-order, using an explicit
 * parent stack instead of recursion) and collect every buffer belonging to
 * the given device onto the purge list, or mark in-flight buffers as purged
 * so they are dropped when their current operation finishes. Caller holds
 * the cache lock.
 */
static void
rtems_bdbuf_gather_for_purge (rtems_chain_control *purge_list,
                              const rtems_disk_device *dd)
{
  /* Explicit stack of ancestors; bounded by the maximum AVL height. */
  rtems_bdbuf_buffer *stack [RTEMS_BDBUF_AVL_MAX_HEIGHT];
  rtems_bdbuf_buffer **prev = stack;
  rtems_bdbuf_buffer *cur = bdbuf_cache.tree;

  /* Sentinel marking the bottom of the stack. */
  *prev = NULL;

  while (cur != NULL)
  {
    if (cur->dd == dd)
    {
      switch (cur->state)
      {
        case RTEMS_BDBUF_STATE_FREE:
        case RTEMS_BDBUF_STATE_EMPTY:
        case RTEMS_BDBUF_STATE_ACCESS_PURGED:
        case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
          /* Nothing to do: no valid data or already marked purged. */
          break;
        case RTEMS_BDBUF_STATE_SYNC:
          /* A sync waiter may be blocked on this buffer: wake it. */
          rtems_bdbuf_wake (&bdbuf_cache.transfer_waiters);
          /* Fall through */
        case RTEMS_BDBUF_STATE_MODIFIED:
          rtems_bdbuf_group_release (cur);
          /* Fall through */
        case RTEMS_BDBUF_STATE_CACHED:
          /* Idle buffer with data: move it straight onto the purge list. */
          rtems_chain_extract_unprotected (&cur->link);
          rtems_chain_append_unprotected (purge_list, &cur->link);
          break;
        case RTEMS_BDBUF_STATE_TRANSFER:
          /* In-flight I/O: flag it so it is discarded on completion. */
          rtems_bdbuf_set_state (cur, RTEMS_BDBUF_STATE_TRANSFER_PURGED);
          break;
        case RTEMS_BDBUF_STATE_ACCESS_CACHED:
        case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
        case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
          /* Held by a user: flag it so the release path discards it. */
          rtems_bdbuf_set_state (cur, RTEMS_BDBUF_STATE_ACCESS_PURGED);
          break;
        default:
          rtems_fatal_error_occurred (RTEMS_BLKDEV_FATAL_BDBUF_STATE_11);
      }
    }

    if (cur->avl.left != NULL)
    {
      /* Left */
      ++prev;
      *prev = cur;
      cur = cur->avl.left;
    }
    else if (cur->avl.right != NULL)
    {
      /* Right */
      ++prev;
      *prev = cur;
      cur = cur->avl.right;
    }
    else
    {
      /* Leaf: climb back up past fully visited subtrees. */
      while (*prev != NULL
             && (cur == (*prev)->avl.right || (*prev)->avl.right == NULL))
      {
        /* Up */
        cur = *prev;
        --prev;
      }
      if (*prev != NULL)
        /* Right */
        cur = (*prev)->avl.right;
      else
        /* Finished */
        cur = NULL;
    }
  }
}
2940
/*
 * Purge all buffers of a device from the cache: cancel its read-ahead,
 * gather its buffers and discard them. Data in modified buffers is lost.
 */
void
rtems_bdbuf_purge_dev (rtems_disk_device *dd)
{
  rtems_chain_control purge_list;

  rtems_chain_initialize_empty (&purge_list);
  rtems_bdbuf_lock_cache ();
  rtems_bdbuf_read_ahead_reset (dd);
  rtems_bdbuf_gather_for_purge (&purge_list, dd);
  rtems_bdbuf_purge_list (&purge_list);
  rtems_bdbuf_unlock_cache ();
}
2953
2954rtems_status_code
2955rtems_bdbuf_set_block_size (rtems_disk_device *dd, uint32_t block_size)
2956{
2957  rtems_status_code sc = RTEMS_SUCCESSFUL;
2958
2959  rtems_bdbuf_lock_cache ();
2960
2961  if (block_size > 0)
2962  {
2963    size_t bds_per_group = rtems_bdbuf_bds_per_group (block_size);
2964
2965    if (bds_per_group != 0)
2966    {
2967      int block_to_media_block_shift = 0;
2968      uint32_t media_blocks_per_block = block_size / dd->media_block_size;
2969      uint32_t one = 1;
2970
2971      while ((one << block_to_media_block_shift) < media_blocks_per_block)
2972      {
2973        ++block_to_media_block_shift;
2974      }
2975
2976      if ((dd->media_block_size << block_to_media_block_shift) != block_size)
2977        block_to_media_block_shift = -1;
2978
2979      dd->block_size = block_size;
2980      dd->block_count = dd->size / media_blocks_per_block;
2981      dd->media_blocks_per_block = media_blocks_per_block;
2982      dd->block_to_media_block_shift = block_to_media_block_shift;
2983      dd->bds_per_group = bds_per_group;
2984
2985      rtems_bdbuf_read_ahead_reset (dd);
2986    }
2987    else
2988    {
2989      sc = RTEMS_INVALID_NUMBER;
2990    }
2991  }
2992  else
2993  {
2994    sc = RTEMS_INVALID_NUMBER;
2995  }
2996
2997  rtems_bdbuf_unlock_cache ();
2998
2999  return sc;
3000}
3001
/*
 * Body of the read-ahead task. Waits to be woken, then services each device
 * queued on the read-ahead chain by issuing a multi-block read starting at
 * the device's next read-ahead block. Holds the cache lock while scheduling;
 * the read request itself unlocks the cache during the transfer.
 */
static rtems_task
rtems_bdbuf_read_ahead_task (rtems_task_argument arg)
{
  rtems_chain_control *chain = &bdbuf_cache.read_ahead_chain;

  while (bdbuf_cache.read_ahead_enabled)
  {
    rtems_chain_node *node;

    rtems_bdbuf_wait_for_event (RTEMS_BDBUF_READ_AHEAD_WAKE_UP);
    rtems_bdbuf_lock_cache ();

    while ((node = rtems_chain_get_unprotected (chain)) != NULL)
    {
      /* Recover the device from its embedded read-ahead chain node. */
      rtems_disk_device *dd = (rtems_disk_device *)
        ((char *) node - offsetof (rtems_disk_device, read_ahead.node));
      rtems_blkdev_bnum block = dd->read_ahead.next;
      rtems_blkdev_bnum media_block = 0;
      rtems_status_code sc =
        rtems_bdbuf_get_media_block (dd, block, &media_block);

      /* The request has been taken off the chain: mark it inactive. */
      rtems_chain_set_off_chain (&dd->read_ahead.node);

      if (sc == RTEMS_SUCCESSFUL)
      {
        rtems_bdbuf_buffer *bd =
          rtems_bdbuf_get_buffer_for_read_ahead (dd, media_block);

        /* bd is NULL when the block is already cached; then there is
           nothing to read ahead. */
        if (bd != NULL)
        {
          uint32_t transfer_count = dd->block_count - block;
          uint32_t max_transfer_count = bdbuf_config.max_read_ahead_blocks;

          if (transfer_count >= max_transfer_count)
          {
            /* Full-sized read-ahead: advance the trigger into the middle of
               the newly read region and keep the sequence armed. */
            transfer_count = max_transfer_count;
            dd->read_ahead.trigger += max_transfer_count / 2 + 1;
            dd->read_ahead.next += max_transfer_count;
          }
          else
          {
            /* End of device reached: stop the read-ahead sequence. */
            dd->read_ahead.trigger = RTEMS_DISK_READ_AHEAD_NO_TRIGGER;
          }

          ++dd->stats.read_ahead_transfers;
          rtems_bdbuf_execute_read_request (dd, bd, transfer_count);
        }
      }
      else
      {
        /* Next block is out of range: disable further read-ahead. */
        dd->read_ahead.trigger = RTEMS_DISK_READ_AHEAD_NO_TRIGGER;
      }
    }

    rtems_bdbuf_unlock_cache ();
  }

  rtems_task_delete (RTEMS_SELF);
}
3061
/*
 * Copy the device's I/O statistics into *stats under the cache lock so the
 * snapshot is consistent.
 */
void rtems_bdbuf_get_device_stats (const rtems_disk_device *dd,
                                   rtems_blkdev_stats      *stats)
{
  rtems_bdbuf_lock_cache ();
  *stats = dd->stats;
  rtems_bdbuf_unlock_cache ();
}
3069
/*
 * Zero the device's I/O statistics under the cache lock.
 */
void rtems_bdbuf_reset_device_stats (rtems_disk_device *dd)
{
  rtems_bdbuf_lock_cache ();
  memset (&dd->stats, 0, sizeof(dd->stats));
  rtems_bdbuf_unlock_cache ();
}
Note: See TracBrowser for help on using the repository browser.