source: rtems/cpukit/libblock/src/bdbuf.c @ 3b4ca3a

4.11
Last change on this file since 3b4ca3a was 3b4ca3a, checked in by Sebastian Huber <sebastian.huber@…>, on Nov 27, 2014 at 1:41:17 PM

bdbuf: Fix race condition with sync active flag

Bug report by Oleg Kravtsov:

In the rtems_bdbuf_swapout_processing() function there are the following
lines:

if (bdbuf_cache.sync_active && !transfered_buffers)
{

rtems_id sync_requester;
rtems_bdbuf_lock_cache ();
...

}

Here access to bdbuf_cache.sync_active is not protected with anything.
Imagine the following test case:

  1. Task1 releases buffer(s) with bdbuf_release_modified() calls;
  2. After a while the swapout task starts and flushes all buffers;
  3. At the end of that swapout flush we are just before that part of the
     code, and assume a task switch happens there (just before
     "if (bdbuf_cache.sync_active && !transfered_buffers)");
  4. Some other task (with higher priority) calls bdbuf_release_modified()
     and rtems_bdbuf_syncdev().

     This task successfully gets both locks, sync and pool (in the
     rtems_bdbuf_syncdev() function), sets sync_active to true and starts
     waiting for the RTEMS_BDBUF_TRANSFER_SYNC event while holding only the
     sync lock.

  5. A task switch happens again and we are again just before
     "if (bdbuf_cache.sync_active && !transfered_buffers)".

     As a result we check sync_active and enter that "if" statement.

  6. The result is that we send the RTEMS_BDBUF_TRANSFER_SYNC event, even
     though not ALL modified messages of that task have been flushed yet!

close #1485

  • Property mode set to 100644
File size: 85.6 KB
Line 
1/**
2 * @file
3 *
4 * @ingroup rtems_bdbuf
5 *
6 * Block device buffer management.
7 */
8
9/*
10 * Disk I/O buffering
11 * Buffer managment
12 *
13 * Copyright (C) 2001 OKTET Ltd., St.-Peterburg, Russia
14 * Author: Andrey G. Ivanov <Andrey.Ivanov@oktet.ru>
15 *         Victor V. Vengerov <vvv@oktet.ru>
16 *         Alexander Kukuta <kam@oktet.ru>
17 *
18 * Copyright (C) 2008,2009 Chris Johns <chrisj@rtems.org>
19 *    Rewritten to remove score mutex access. Fixes many performance
20 *    issues.
21 *
22 * Copyright (c) 2009-2014 embedded brains GmbH.
23 */
24
/**
 * Debug tracing switch. Define as 1 to compile the trace helpers of this file
 * in; with 0 they are replaced by no-op macros.
 */
#define RTEMS_BDBUF_TRACE 0
29
30#if HAVE_CONFIG_H
31#include "config.h"
32#endif
33#include <limits.h>
34#include <errno.h>
35#include <stdio.h>
36#include <string.h>
37#include <inttypes.h>
38#include <pthread.h>
39
40#include <rtems.h>
41#include <rtems/error.h>
42
43#include "rtems/bdbuf.h"
44
/*
 * Device pointer value that stands for "no device".
 */
#define BDBUF_INVALID_DEV NULL

/*
 * Short alias for the bdbuf configuration object used throughout this file.
 */
#define bdbuf_config rtems_bdbuf_configuration
51
52/**
53 * A swapout transfer transaction data. This data is passed to a worked thread
54 * to handle the write phase of the transfer.
55 */
56typedef struct rtems_bdbuf_swapout_transfer
57{
58  rtems_chain_control   bds;         /**< The transfer list of BDs. */
59  rtems_disk_device    *dd;          /**< The device the transfer is for. */
60  bool                  syncing;     /**< The data is a sync'ing. */
61  rtems_blkdev_request  write_req;   /**< The write request. */
62} rtems_bdbuf_swapout_transfer;
63
64/**
65 * Swapout worker thread. These are available to take processing from the
66 * main swapout thread and handle the I/O operation.
67 */
68typedef struct rtems_bdbuf_swapout_worker
69{
70  rtems_chain_node             link;     /**< The threads sit on a chain when
71                                          * idle. */
72  rtems_id                     id;       /**< The id of the task so we can wake
73                                          * it. */
74  bool                         enabled;  /**< The worker is enabled. */
75  rtems_bdbuf_swapout_transfer transfer; /**< The transfer data for this
76                                          * thread. */
77} rtems_bdbuf_swapout_worker;
78
79#if defined(RTEMS_BDBUF_USE_PTHREAD)
80typedef pthread_mutex_t rtems_bdbuf_lock_type;
81#else
82typedef rtems_id rtems_bdbuf_lock_type;
83#endif
84
85/**
86 * Buffer waiters synchronization.
87 */
88typedef struct rtems_bdbuf_waiters {
89  unsigned       count;
90#if defined(RTEMS_BDBUF_USE_PTHREAD)
91  pthread_cond_t cond_var;
92#else
93  rtems_id       sema;
94#endif
95} rtems_bdbuf_waiters;
96
97/**
98 * The BD buffer cache.
99 */
100typedef struct rtems_bdbuf_cache
101{
102  rtems_id            swapout;           /**< Swapout task ID */
103  bool                swapout_enabled;   /**< Swapout is only running if
104                                          * enabled. Set to false to kill the
105                                          * swap out task. It deletes itself. */
106  rtems_chain_control swapout_free_workers; /**< The work threads for the swapout
107                                             * task. */
108
109  rtems_bdbuf_buffer* bds;               /**< Pointer to table of buffer
110                                          * descriptors. */
111  void*               buffers;           /**< The buffer's memory. */
112  size_t              buffer_min_count;  /**< Number of minimum size buffers
113                                          * that fit the buffer memory. */
114  size_t              max_bds_per_group; /**< The number of BDs of minimum
115                                          * buffer size that fit in a group. */
116  uint32_t            flags;             /**< Configuration flags. */
117
118  rtems_bdbuf_lock_type lock;            /**< The cache lock. It locks all
119                                          * cache data, BD and lists. */
120  rtems_bdbuf_lock_type sync_lock;       /**< Sync calls block writes. */
121  bool                sync_active;       /**< True if a sync is active. */
122  rtems_id            sync_requester;    /**< The sync requester. */
123  rtems_disk_device  *sync_device;       /**< The device to sync and
124                                          * BDBUF_INVALID_DEV not a device
125                                          * sync. */
126
127  rtems_bdbuf_buffer* tree;              /**< Buffer descriptor lookup AVL tree
128                                          * root. There is only one. */
129  rtems_chain_control lru;               /**< Least recently used list */
130  rtems_chain_control modified;          /**< Modified buffers list */
131  rtems_chain_control sync;              /**< Buffers to sync list */
132
133  rtems_bdbuf_waiters access_waiters;    /**< Wait for a buffer in
134                                          * ACCESS_CACHED, ACCESS_MODIFIED or
135                                          * ACCESS_EMPTY
136                                          * state. */
137  rtems_bdbuf_waiters transfer_waiters;  /**< Wait for a buffer in TRANSFER
138                                          * state. */
139  rtems_bdbuf_waiters buffer_waiters;    /**< Wait for a buffer and no one is
140                                          * available. */
141
142  rtems_bdbuf_swapout_transfer *swapout_transfer;
143  rtems_bdbuf_swapout_worker *swapout_workers;
144
145  size_t              group_count;       /**< The number of groups. */
146  rtems_bdbuf_group*  groups;            /**< The groups. */
147  rtems_id            read_ahead_task;   /**< Read-ahead task */
148  rtems_chain_control read_ahead_chain;  /**< Read-ahead request chain */
149  bool                read_ahead_enabled; /**< Read-ahead enabled */
150  rtems_status_code   init_status;       /**< The initialization status */
151} rtems_bdbuf_cache;
152
/**
 * Fatal error codes raised by this file.
 *
 * The enumerators carry no explicit values, so each code is defined by its
 * position in the list: keep the order and append new codes at the end.
 */
typedef enum {
  RTEMS_BDBUF_FATAL_CACHE_LOCK,
  RTEMS_BDBUF_FATAL_CACHE_UNLOCK,
  RTEMS_BDBUF_FATAL_CACHE_WAIT_2,
  RTEMS_BDBUF_FATAL_CACHE_WAIT_TO,
  RTEMS_BDBUF_FATAL_CACHE_WAKE,
  RTEMS_BDBUF_FATAL_PREEMPT_DIS,
  RTEMS_BDBUF_FATAL_PREEMPT_RST,
  RTEMS_BDBUF_FATAL_RA_WAKE_UP,
  RTEMS_BDBUF_FATAL_RECYCLE,
  RTEMS_BDBUF_FATAL_SO_WAKE_1,
  RTEMS_BDBUF_FATAL_SO_WAKE_2,
  RTEMS_BDBUF_FATAL_STATE_0,
  RTEMS_BDBUF_FATAL_STATE_2,
  RTEMS_BDBUF_FATAL_STATE_4,
  RTEMS_BDBUF_FATAL_STATE_5,
  RTEMS_BDBUF_FATAL_STATE_6,
  RTEMS_BDBUF_FATAL_STATE_7,
  RTEMS_BDBUF_FATAL_STATE_8,
  RTEMS_BDBUF_FATAL_STATE_9,
  RTEMS_BDBUF_FATAL_STATE_10,
  RTEMS_BDBUF_FATAL_STATE_11,
  RTEMS_BDBUF_FATAL_SWAPOUT_RE,
  RTEMS_BDBUF_FATAL_SYNC_LOCK,
  RTEMS_BDBUF_FATAL_SYNC_UNLOCK,
  RTEMS_BDBUF_FATAL_TREE_RM,
  RTEMS_BDBUF_FATAL_WAIT_EVNT,
  RTEMS_BDBUF_FATAL_WAIT_TRANS_EVNT,
  RTEMS_BDBUF_FATAL_ONCE,
  RTEMS_BDBUF_FATAL_MTX_ATTR_INIT,
  RTEMS_BDBUF_FATAL_MTX_ATTR_SETPROTO,
  RTEMS_BDBUF_FATAL_CV_WAIT,
  RTEMS_BDBUF_FATAL_CV_BROADCAST
} rtems_bdbuf_fatal_code;
187
/**
 * Events used by this code. Ideally these would be system events rather than
 * application events.
 */
#define RTEMS_BDBUF_SWAPOUT_SYNC       RTEMS_EVENT_2 /* sent to the swapout task */
#define RTEMS_BDBUF_READ_AHEAD_WAKE_UP RTEMS_EVENT_1
194
/**
 * Attributes of the semaphores that serve as mutex-type locks.
 *
 * @warning Priority inheritance is switched on.
 */
#define RTEMS_BDBUF_CACHE_LOCK_ATTRIBS \
  (RTEMS_LOCAL | RTEMS_BINARY_SEMAPHORE | RTEMS_PRIORITY | \
   RTEMS_INHERIT_PRIORITY | RTEMS_NO_PRIORITY_CEILING)
203
/**
 * Attributes of the semaphores that waiters block on.
 *
 * @warning Do not configure these as inherit priority. If a driver is in the
 *          driver initialisation table, this locked semaphore has the IDLE
 *          task as its holder, and a blocking task would raise the priority
 *          of the IDLE task, which can cause unusual side effects.
 */
#define RTEMS_BDBUF_CACHE_WAITER_ATTRIBS \
  (RTEMS_LOCAL | RTEMS_SIMPLE_BINARY_SEMAPHORE | RTEMS_PRIORITY | \
   RTEMS_NO_INHERIT_PRIORITY | RTEMS_NO_PRIORITY_CEILING)
215
/**
 * Waiter timeout, passed to the semaphore obtain call of a waiter. The
 * default is RTEMS_NO_TIMEOUT.
 *
 * To find some info on a waiter that is waiting too long, define
 * RTEMS_BDBUF_WAIT_TIMEOUT at build time to a non-zero tick count, for
 * example (RTEMS_MICROSECONDS_TO_TICKS (20000000)); a wait that times out is
 * then reported as a fatal error.
 *
 * The default is only applied when the macro is not already defined. The
 * previous form defined it unconditionally first, which made the fallback
 * below it unreachable and ruled out a build-time override.
 */
#if !defined (RTEMS_BDBUF_WAIT_TIMEOUT)
#define RTEMS_BDBUF_WAIT_TIMEOUT RTEMS_NO_TIMEOUT
#endif
225
226static rtems_task rtems_bdbuf_swapout_task(rtems_task_argument arg);
227
228static rtems_task rtems_bdbuf_read_ahead_task(rtems_task_argument arg);
229
230/**
231 * The Buffer Descriptor cache.
232 */
233static rtems_bdbuf_cache bdbuf_cache;
234
235static pthread_once_t rtems_bdbuf_once_state = PTHREAD_ONCE_INIT;
236
#if RTEMS_BDBUF_TRACE
/**
 * If true output the trace message.
 */
bool rtems_bdbuf_tracer;

/**
 * Return the number of items on the list.
 *
 * @param list The chain control.
 * @return uint32_t The number of items on the list.
 */
uint32_t
rtems_bdbuf_list_count (rtems_chain_control* list)
{
  rtems_chain_node* node = rtems_chain_first (list);
  uint32_t          count = 0;
  while (!rtems_chain_is_tail (list, node))
  {
    count++;
    node = rtems_chain_next (node);
  }
  return count;
}

/**
 * Show the usage for the bdbuf cache: the sum of the group user counts, the
 * lengths of the LRU, modified and sync lists, and the sum of those three
 * lengths.
 */
void
rtems_bdbuf_show_usage (void)
{
  uint32_t group;
  uint32_t total = 0;
  uint32_t val;

  /*
   * All printed values are uint32_t, so the PRIu32 conversion is used. "%lu"
   * expects an unsigned long, which is not guaranteed to be the same type.
   */
  for (group = 0; group < bdbuf_cache.group_count; group++)
    total += bdbuf_cache.groups[group].users;
  printf ("bdbuf:group users=%" PRIu32, total);
  val = rtems_bdbuf_list_count (&bdbuf_cache.lru);
  printf (", lru=%" PRIu32, val);
  total = val;
  val = rtems_bdbuf_list_count (&bdbuf_cache.modified);
  printf (", mod=%" PRIu32, val);
  total += val;
  val = rtems_bdbuf_list_count (&bdbuf_cache.sync);
  printf (", sync=%" PRIu32, val);
  total += val;
  printf (", total=%" PRIu32 "\n", total);
}

/**
 * Show the users for a group of a bd.
 *
 * @param where A label to show the context of output.
 * @param bd The bd to show the users of.
 */
void
rtems_bdbuf_show_users (const char* where, rtems_bdbuf_buffer* bd)
{
  /* Two-letter labels, indexed by the numeric value of the buffer state. */
  const char* states[] =
    { "FR", "EM", "CH", "AC", "AM", "AE", "AP", "MD", "SY", "TR", "TP" };

  printf ("bdbuf:users: %15s: [%" PRIu32 " (%s)] %td:%td = %" PRIu32 " %s\n",
          where,
          bd->block, states[bd->state],
          bd->group - bdbuf_cache.groups,
          bd - bdbuf_cache.bds,
          bd->group->users,
          bd->group->users > 8 ? "<<<<<<<" : "");
}
#else
/* Tracing disabled: the tracer flag and the helpers compile to nothing. */
#define rtems_bdbuf_tracer (0)
#define rtems_bdbuf_show_usage() ((void) 0)
#define rtems_bdbuf_show_users(_w, _b) ((void) 0)
#endif
312
/**
 * Maximum height of the AVL tree. It sizes the path stacks of the insert and
 * remove functions. With the default of 32 the tree can hold between
 * 5,704,880 and 4,294,967,295 nodes, depending on the order of insertion.
 * The constant may be overridden at compile time.
 */
#if !defined(RTEMS_BDBUF_AVL_MAX_HEIGHT)
#define RTEMS_BDBUF_AVL_MAX_HEIGHT (32)
#endif
321
322static void
323rtems_bdbuf_fatal (rtems_fatal_code error)
324{
325  rtems_fatal (RTEMS_FATAL_SOURCE_BDBUF, error);
326}
327
328static void
329rtems_bdbuf_fatal_with_state (rtems_bdbuf_buf_state state,
330                              rtems_bdbuf_fatal_code error)
331{
332  rtems_bdbuf_fatal ((((uint32_t) state) << 16) | error);
333}
334
335static rtems_status_code
336rtems_bdbuf_lock_create (rtems_name name, rtems_bdbuf_lock_type *lock)
337{
338#if defined(RTEMS_BDBUF_USE_PTHREAD)
339  int                 eno;
340  pthread_mutexattr_t attr;
341
342  (void) name;
343
344  eno = pthread_mutexattr_init (&attr);
345  if (eno != 0)
346    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_MTX_ATTR_INIT);
347
348  eno = pthread_mutexattr_setprotocol (&attr, PTHREAD_PRIO_INHERIT);
349  if (eno != 0)
350    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_MTX_ATTR_SETPROTO);
351
352  eno = pthread_mutex_init (lock, &attr);
353
354  pthread_mutexattr_destroy (&attr);
355
356  if (eno != 0)
357    return RTEMS_UNSATISFIED;
358
359  return RTEMS_SUCCESSFUL;
360#else
361  return rtems_semaphore_create(
362    name,
363    1,
364    RTEMS_BDBUF_CACHE_LOCK_ATTRIBS,
365    0,
366    lock
367  );
368#endif
369}
370
371static void
372rtems_bdbuf_lock_delete (rtems_bdbuf_lock_type *lock)
373{
374#if defined(RTEMS_BDBUF_USE_PTHREAD)
375  pthread_mutex_destroy (lock);
376#else
377  rtems_semaphore_delete (*lock);
378#endif
379}
380
381static rtems_status_code
382rtems_bdbuf_waiter_create (rtems_name name, rtems_bdbuf_waiters *waiter)
383{
384#if defined(RTEMS_BDBUF_USE_PTHREAD)
385  int eno = pthread_cond_init (&waiter->cond_var, NULL);
386  if (eno != 0)
387    return RTEMS_UNSATISFIED;
388
389  return RTEMS_SUCCESSFUL;
390#else
391  return rtems_semaphore_create(
392    name,
393    0,
394    RTEMS_BDBUF_CACHE_WAITER_ATTRIBS,
395    0,
396    &waiter->sema
397  );
398#endif
399}
400
401static void
402rtems_bdbuf_waiter_delete (rtems_bdbuf_waiters *waiter)
403{
404#if defined(RTEMS_BDBUF_USE_PTHREAD)
405  pthread_cond_destroy (&waiter->cond_var);
406#else
407  rtems_semaphore_delete (waiter->sema);
408#endif
409}
410
411/**
412 * Searches for the node with specified dd/block.
413 *
414 * @param root pointer to the root node of the AVL-Tree
415 * @param dd disk device search key
416 * @param block block search key
417 * @retval NULL node with the specified dd/block is not found
418 * @return pointer to the node with specified dd/block
419 */
420static rtems_bdbuf_buffer *
421rtems_bdbuf_avl_search (rtems_bdbuf_buffer** root,
422                        const rtems_disk_device *dd,
423                        rtems_blkdev_bnum    block)
424{
425  rtems_bdbuf_buffer* p = *root;
426
427  while ((p != NULL) && ((p->dd != dd) || (p->block != block)))
428  {
429    if (((uintptr_t) p->dd < (uintptr_t) dd)
430        || ((p->dd == dd) && (p->block < block)))
431    {
432      p = p->avl.right;
433    }
434    else
435    {
436      p = p->avl.left;
437    }
438  }
439
440  return p;
441}
442
443/**
444 * Inserts the specified node to the AVl-Tree.
445 *
446 * @param root pointer to the root node of the AVL-Tree
447 * @param node Pointer to the node to add.
448 * @retval 0 The node added successfully
449 * @retval -1 An error occured
450 */
451static int
452rtems_bdbuf_avl_insert(rtems_bdbuf_buffer** root,
453                       rtems_bdbuf_buffer*  node)
454{
455  const rtems_disk_device *dd = node->dd;
456  rtems_blkdev_bnum block = node->block;
457
458  rtems_bdbuf_buffer*  p = *root;
459  rtems_bdbuf_buffer*  q;
460  rtems_bdbuf_buffer*  p1;
461  rtems_bdbuf_buffer*  p2;
462  rtems_bdbuf_buffer*  buf_stack[RTEMS_BDBUF_AVL_MAX_HEIGHT];
463  rtems_bdbuf_buffer** buf_prev = buf_stack;
464
465  bool modified = false;
466
467  if (p == NULL)
468  {
469    *root = node;
470    node->avl.left = NULL;
471    node->avl.right = NULL;
472    node->avl.bal = 0;
473    return 0;
474  }
475
476  while (p != NULL)
477  {
478    *buf_prev++ = p;
479
480    if (((uintptr_t) p->dd < (uintptr_t) dd)
481        || ((p->dd == dd) && (p->block < block)))
482    {
483      p->avl.cache = 1;
484      q = p->avl.right;
485      if (q == NULL)
486      {
487        q = node;
488        p->avl.right = q = node;
489        break;
490      }
491    }
492    else if ((p->dd != dd) || (p->block != block))
493    {
494      p->avl.cache = -1;
495      q = p->avl.left;
496      if (q == NULL)
497      {
498        q = node;
499        p->avl.left = q;
500        break;
501      }
502    }
503    else
504    {
505      return -1;
506    }
507
508    p = q;
509  }
510
511  q->avl.left = q->avl.right = NULL;
512  q->avl.bal = 0;
513  modified = true;
514  buf_prev--;
515
516  while (modified)
517  {
518    if (p->avl.cache == -1)
519    {
520      switch (p->avl.bal)
521      {
522        case 1:
523          p->avl.bal = 0;
524          modified = false;
525          break;
526
527        case 0:
528          p->avl.bal = -1;
529          break;
530
531        case -1:
532          p1 = p->avl.left;
533          if (p1->avl.bal == -1) /* simple LL-turn */
534          {
535            p->avl.left = p1->avl.right;
536            p1->avl.right = p;
537            p->avl.bal = 0;
538            p = p1;
539          }
540          else /* double LR-turn */
541          {
542            p2 = p1->avl.right;
543            p1->avl.right = p2->avl.left;
544            p2->avl.left = p1;
545            p->avl.left = p2->avl.right;
546            p2->avl.right = p;
547            if (p2->avl.bal == -1) p->avl.bal = +1; else p->avl.bal = 0;
548            if (p2->avl.bal == +1) p1->avl.bal = -1; else p1->avl.bal = 0;
549            p = p2;
550          }
551          p->avl.bal = 0;
552          modified = false;
553          break;
554
555        default:
556          break;
557      }
558    }
559    else
560    {
561      switch (p->avl.bal)
562      {
563        case -1:
564          p->avl.bal = 0;
565          modified = false;
566          break;
567
568        case 0:
569          p->avl.bal = 1;
570          break;
571
572        case 1:
573          p1 = p->avl.right;
574          if (p1->avl.bal == 1) /* simple RR-turn */
575          {
576            p->avl.right = p1->avl.left;
577            p1->avl.left = p;
578            p->avl.bal = 0;
579            p = p1;
580          }
581          else /* double RL-turn */
582          {
583            p2 = p1->avl.left;
584            p1->avl.left = p2->avl.right;
585            p2->avl.right = p1;
586            p->avl.right = p2->avl.left;
587            p2->avl.left = p;
588            if (p2->avl.bal == +1) p->avl.bal = -1; else p->avl.bal = 0;
589            if (p2->avl.bal == -1) p1->avl.bal = +1; else p1->avl.bal = 0;
590            p = p2;
591          }
592          p->avl.bal = 0;
593          modified = false;
594          break;
595
596        default:
597          break;
598      }
599    }
600    q = p;
601    if (buf_prev > buf_stack)
602    {
603      p = *--buf_prev;
604
605      if (p->avl.cache == -1)
606      {
607        p->avl.left = q;
608      }
609      else
610      {
611        p->avl.right = q;
612      }
613    }
614    else
615    {
616      *root = p;
617      break;
618    }
619  };
620
621  return 0;
622}
623
624
625/**
626 * Removes the node from the tree.
627 *
628 * @param root Pointer to pointer to the root node
629 * @param node Pointer to the node to remove
630 * @retval 0 Item removed
631 * @retval -1 No such item found
632 */
633static int
634rtems_bdbuf_avl_remove(rtems_bdbuf_buffer**      root,
635                       const rtems_bdbuf_buffer* node)
636{
637  const rtems_disk_device *dd = node->dd;
638  rtems_blkdev_bnum block = node->block;
639
640  rtems_bdbuf_buffer*  p = *root;
641  rtems_bdbuf_buffer*  q;
642  rtems_bdbuf_buffer*  r;
643  rtems_bdbuf_buffer*  s;
644  rtems_bdbuf_buffer*  p1;
645  rtems_bdbuf_buffer*  p2;
646  rtems_bdbuf_buffer*  buf_stack[RTEMS_BDBUF_AVL_MAX_HEIGHT];
647  rtems_bdbuf_buffer** buf_prev = buf_stack;
648
649  bool modified = false;
650
651  memset (buf_stack, 0, sizeof(buf_stack));
652
653  while (p != NULL)
654  {
655    *buf_prev++ = p;
656
657    if (((uintptr_t) p->dd < (uintptr_t) dd)
658        || ((p->dd == dd) && (p->block < block)))
659    {
660      p->avl.cache = 1;
661      p = p->avl.right;
662    }
663    else if ((p->dd != dd) || (p->block != block))
664    {
665      p->avl.cache = -1;
666      p = p->avl.left;
667    }
668    else
669    {
670      /* node found */
671      break;
672    }
673  }
674
675  if (p == NULL)
676  {
677    /* there is no such node */
678    return -1;
679  }
680
681  q = p;
682
683  buf_prev--;
684  if (buf_prev > buf_stack)
685  {
686    p = *(buf_prev - 1);
687  }
688  else
689  {
690    p = NULL;
691  }
692
693  /* at this moment q - is a node to delete, p is q's parent */
694  if (q->avl.right == NULL)
695  {
696    r = q->avl.left;
697    if (r != NULL)
698    {
699      r->avl.bal = 0;
700    }
701    q = r;
702  }
703  else
704  {
705    rtems_bdbuf_buffer **t;
706
707    r = q->avl.right;
708
709    if (r->avl.left == NULL)
710    {
711      r->avl.left = q->avl.left;
712      r->avl.bal = q->avl.bal;
713      r->avl.cache = 1;
714      *buf_prev++ = q = r;
715    }
716    else
717    {
718      t = buf_prev++;
719      s = r;
720
721      while (s->avl.left != NULL)
722      {
723        *buf_prev++ = r = s;
724        s = r->avl.left;
725        r->avl.cache = -1;
726      }
727
728      s->avl.left = q->avl.left;
729      r->avl.left = s->avl.right;
730      s->avl.right = q->avl.right;
731      s->avl.bal = q->avl.bal;
732      s->avl.cache = 1;
733
734      *t = q = s;
735    }
736  }
737
738  if (p != NULL)
739  {
740    if (p->avl.cache == -1)
741    {
742      p->avl.left = q;
743    }
744    else
745    {
746      p->avl.right = q;
747    }
748  }
749  else
750  {
751    *root = q;
752  }
753
754  modified = true;
755
756  while (modified)
757  {
758    if (buf_prev > buf_stack)
759    {
760      p = *--buf_prev;
761    }
762    else
763    {
764      break;
765    }
766
767    if (p->avl.cache == -1)
768    {
769      /* rebalance left branch */
770      switch (p->avl.bal)
771      {
772        case -1:
773          p->avl.bal = 0;
774          break;
775        case  0:
776          p->avl.bal = 1;
777          modified = false;
778          break;
779
780        case +1:
781          p1 = p->avl.right;
782
783          if (p1->avl.bal >= 0) /* simple RR-turn */
784          {
785            p->avl.right = p1->avl.left;
786            p1->avl.left = p;
787
788            if (p1->avl.bal == 0)
789            {
790              p1->avl.bal = -1;
791              modified = false;
792            }
793            else
794            {
795              p->avl.bal = 0;
796              p1->avl.bal = 0;
797            }
798            p = p1;
799          }
800          else /* double RL-turn */
801          {
802            p2 = p1->avl.left;
803
804            p1->avl.left = p2->avl.right;
805            p2->avl.right = p1;
806            p->avl.right = p2->avl.left;
807            p2->avl.left = p;
808
809            if (p2->avl.bal == +1) p->avl.bal = -1; else p->avl.bal = 0;
810            if (p2->avl.bal == -1) p1->avl.bal = 1; else p1->avl.bal = 0;
811
812            p = p2;
813            p2->avl.bal = 0;
814          }
815          break;
816
817        default:
818          break;
819      }
820    }
821    else
822    {
823      /* rebalance right branch */
824      switch (p->avl.bal)
825      {
826        case +1:
827          p->avl.bal = 0;
828          break;
829
830        case  0:
831          p->avl.bal = -1;
832          modified = false;
833          break;
834
835        case -1:
836          p1 = p->avl.left;
837
838          if (p1->avl.bal <= 0) /* simple LL-turn */
839          {
840            p->avl.left = p1->avl.right;
841            p1->avl.right = p;
842            if (p1->avl.bal == 0)
843            {
844              p1->avl.bal = 1;
845              modified = false;
846            }
847            else
848            {
849              p->avl.bal = 0;
850              p1->avl.bal = 0;
851            }
852            p = p1;
853          }
854          else /* double LR-turn */
855          {
856            p2 = p1->avl.right;
857
858            p1->avl.right = p2->avl.left;
859            p2->avl.left = p1;
860            p->avl.left = p2->avl.right;
861            p2->avl.right = p;
862
863            if (p2->avl.bal == -1) p->avl.bal = 1; else p->avl.bal = 0;
864            if (p2->avl.bal == +1) p1->avl.bal = -1; else p1->avl.bal = 0;
865
866            p = p2;
867            p2->avl.bal = 0;
868          }
869          break;
870
871        default:
872          break;
873      }
874    }
875
876    if (buf_prev > buf_stack)
877    {
878      q = *(buf_prev - 1);
879
880      if (q->avl.cache == -1)
881      {
882        q->avl.left = p;
883      }
884      else
885      {
886        q->avl.right = p;
887      }
888    }
889    else
890    {
891      *root = p;
892      break;
893    }
894
895  }
896
897  return 0;
898}
899
900static void
901rtems_bdbuf_set_state (rtems_bdbuf_buffer *bd, rtems_bdbuf_buf_state state)
902{
903  bd->state = state;
904}
905
906static rtems_blkdev_bnum
907rtems_bdbuf_media_block (const rtems_disk_device *dd, rtems_blkdev_bnum block)
908{
909  if (dd->block_to_media_block_shift >= 0)
910    return block << dd->block_to_media_block_shift;
911  else
912    /*
913     * Change the block number for the block size to the block number for the media
914     * block size. We have to use 64bit maths. There is no short cut here.
915     */
916    return (rtems_blkdev_bnum)
917      ((((uint64_t) block) * dd->block_size) / dd->media_block_size);
918}
919
920/**
921 * Lock the mutex. A single task can nest calls.
922 *
923 * @param lock The mutex to lock.
924 * @param fatal_error_code The error code if the call fails.
925 */
926static void
927rtems_bdbuf_lock (rtems_bdbuf_lock_type *lock, uint32_t fatal_error_code)
928{
929#if defined(RTEMS_BDBUF_USE_PTHREAD)
930  int eno = pthread_mutex_lock (lock);
931  if (eno != 0)
932    rtems_bdbuf_fatal (fatal_error_code);
933#else
934  rtems_status_code sc = rtems_semaphore_obtain (*lock,
935                                                 RTEMS_WAIT,
936                                                 RTEMS_NO_TIMEOUT);
937  if (sc != RTEMS_SUCCESSFUL)
938    rtems_bdbuf_fatal (fatal_error_code);
939#endif
940}
941
942/**
943 * Unlock the mutex.
944 *
945 * @param lock The mutex to unlock.
946 * @param fatal_error_code The error code if the call fails.
947 */
948static void
949rtems_bdbuf_unlock (rtems_bdbuf_lock_type *lock, uint32_t fatal_error_code)
950{
951#if defined(RTEMS_BDBUF_USE_PTHREAD)
952  int eno = pthread_mutex_unlock (lock);
953  if (eno != 0)
954    rtems_bdbuf_fatal (fatal_error_code);
955#else
956  rtems_status_code sc = rtems_semaphore_release (*lock);
957  if (sc != RTEMS_SUCCESSFUL)
958    rtems_bdbuf_fatal (fatal_error_code);
959#endif
960}
961
962/**
963 * Lock the cache. A single task can nest calls.
964 */
965static void
966rtems_bdbuf_lock_cache (void)
967{
968  rtems_bdbuf_lock (&bdbuf_cache.lock, RTEMS_BDBUF_FATAL_CACHE_LOCK);
969}
970
971/**
972 * Unlock the cache.
973 */
974static void
975rtems_bdbuf_unlock_cache (void)
976{
977  rtems_bdbuf_unlock (&bdbuf_cache.lock, RTEMS_BDBUF_FATAL_CACHE_UNLOCK);
978}
979
980/**
981 * Lock the cache's sync. A single task can nest calls.
982 */
983static void
984rtems_bdbuf_lock_sync (void)
985{
986  rtems_bdbuf_lock (&bdbuf_cache.sync_lock, RTEMS_BDBUF_FATAL_SYNC_LOCK);
987}
988
989/**
990 * Unlock the cache's sync lock. Any blocked writers are woken.
991 */
992static void
993rtems_bdbuf_unlock_sync (void)
994{
995  rtems_bdbuf_unlock (&bdbuf_cache.sync_lock,
996                      RTEMS_BDBUF_FATAL_SYNC_UNLOCK);
997}
998
999static void
1000rtems_bdbuf_group_obtain (rtems_bdbuf_buffer *bd)
1001{
1002  ++bd->group->users;
1003}
1004
1005static void
1006rtems_bdbuf_group_release (rtems_bdbuf_buffer *bd)
1007{
1008  --bd->group->users;
1009}
1010
1011#if !defined(RTEMS_BDBUF_USE_PTHREAD)
1012static rtems_mode
1013rtems_bdbuf_disable_preemption (void)
1014{
1015  rtems_status_code sc = RTEMS_SUCCESSFUL;
1016  rtems_mode prev_mode = 0;
1017
1018  sc = rtems_task_mode (RTEMS_NO_PREEMPT, RTEMS_PREEMPT_MASK, &prev_mode);
1019  if (sc != RTEMS_SUCCESSFUL)
1020    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_PREEMPT_DIS);
1021
1022  return prev_mode;
1023}
1024
1025static void
1026rtems_bdbuf_restore_preemption (rtems_mode prev_mode)
1027{
1028  rtems_status_code sc = RTEMS_SUCCESSFUL;
1029
1030  sc = rtems_task_mode (prev_mode, RTEMS_ALL_MODE_MASKS, &prev_mode);
1031  if (sc != RTEMS_SUCCESSFUL)
1032    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_PREEMPT_RST);
1033}
1034#endif
1035
1036/**
1037 * Wait until woken. Semaphores are used so a number of tasks can wait and can
1038 * be woken at once. Task events would require we maintain a list of tasks to
1039 * be woken and this would require storage and we do not know the number of
1040 * tasks that could be waiting.
1041 *
1042 * While we have the cache locked we can try and claim the semaphore and
1043 * therefore know when we release the lock to the cache we will block until the
1044 * semaphore is released. This may even happen before we get to block.
1045 *
1046 * A counter is used to save the release call when no one is waiting.
1047 *
1048 * The function assumes the cache is locked on entry and it will be locked on
1049 * exit.
1050 */
1051static void
1052rtems_bdbuf_anonymous_wait (rtems_bdbuf_waiters *waiters)
1053{
1054  /*
1055   * Indicate we are waiting.
1056   */
1057  ++waiters->count;
1058
1059#if defined(RTEMS_BDBUF_USE_PTHREAD)
1060  {
1061    int eno = pthread_cond_wait (&waiters->cond_var, &bdbuf_cache.lock);
1062    if (eno != 0)
1063      rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_CV_WAIT);
1064  }
1065#else
1066  {
1067    rtems_status_code sc;
1068    rtems_mode        prev_mode;
1069
1070    /*
1071     * Disable preemption then unlock the cache and block.  There is no POSIX
1072     * condition variable in the core API so this is a work around.
1073     *
1074     * The issue is a task could preempt after the cache is unlocked because it is
1075     * blocking or just hits that window, and before this task has blocked on the
1076     * semaphore. If the preempting task flushes the queue this task will not see
1077     * the flush and may block for ever or until another transaction flushes this
1078     * semaphore.
1079     */
1080    prev_mode = rtems_bdbuf_disable_preemption();
1081
1082    /*
1083     * Unlock the cache, wait, and lock the cache when we return.
1084     */
1085    rtems_bdbuf_unlock_cache ();
1086
1087    sc = rtems_semaphore_obtain (waiters->sema, RTEMS_WAIT, RTEMS_BDBUF_WAIT_TIMEOUT);
1088
1089    if (sc == RTEMS_TIMEOUT)
1090      rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_CACHE_WAIT_TO);
1091
1092    if (sc != RTEMS_UNSATISFIED)
1093      rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_CACHE_WAIT_2);
1094
1095    rtems_bdbuf_lock_cache ();
1096
1097    rtems_bdbuf_restore_preemption (prev_mode);
1098  }
1099#endif
1100
1101  --waiters->count;
1102}
1103
1104static void
1105rtems_bdbuf_wait (rtems_bdbuf_buffer *bd, rtems_bdbuf_waiters *waiters)
1106{
1107  rtems_bdbuf_group_obtain (bd);
1108  ++bd->waiters;
1109  rtems_bdbuf_anonymous_wait (waiters);
1110  --bd->waiters;
1111  rtems_bdbuf_group_release (bd);
1112}
1113
1114/**
1115 * Wake a blocked resource. The resource has a counter that lets us know if
1116 * there are any waiters.
1117 */
1118static void
1119rtems_bdbuf_wake (rtems_bdbuf_waiters *waiters)
1120{
1121  if (waiters->count > 0)
1122  {
1123#if defined(RTEMS_BDBUF_USE_PTHREAD)
1124    int eno = pthread_cond_broadcast (&waiters->cond_var);
1125    if (eno != 0)
1126      rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_CV_BROADCAST);
1127#else
1128    rtems_status_code sc = rtems_semaphore_flush (waiters->sema);
1129    if (sc != RTEMS_SUCCESSFUL)
1130      rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_CACHE_WAKE);
1131#endif
1132  }
1133}
1134
1135static void
1136rtems_bdbuf_wake_swapper (void)
1137{
1138  rtems_status_code sc = rtems_event_send (bdbuf_cache.swapout,
1139                                           RTEMS_BDBUF_SWAPOUT_SYNC);
1140  if (sc != RTEMS_SUCCESSFUL)
1141    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_SO_WAKE_1);
1142}
1143
1144static bool
1145rtems_bdbuf_has_buffer_waiters (void)
1146{
1147  return bdbuf_cache.buffer_waiters.count;
1148}
1149
1150static void
1151rtems_bdbuf_remove_from_tree (rtems_bdbuf_buffer *bd)
1152{
1153  if (rtems_bdbuf_avl_remove (&bdbuf_cache.tree, bd) != 0)
1154    rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_TREE_RM);
1155}
1156
1157static void
1158rtems_bdbuf_remove_from_tree_and_lru_list (rtems_bdbuf_buffer *bd)
1159{
1160  switch (bd->state)
1161  {
1162    case RTEMS_BDBUF_STATE_FREE:
1163      break;
1164    case RTEMS_BDBUF_STATE_CACHED:
1165      rtems_bdbuf_remove_from_tree (bd);
1166      break;
1167    default:
1168      rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_10);
1169  }
1170
1171  rtems_chain_extract_unprotected (&bd->link);
1172}
1173
1174static void
1175rtems_bdbuf_make_free_and_add_to_lru_list (rtems_bdbuf_buffer *bd)
1176{
1177  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_FREE);
1178  rtems_chain_prepend_unprotected (&bdbuf_cache.lru, &bd->link);
1179}
1180
1181static void
1182rtems_bdbuf_make_empty (rtems_bdbuf_buffer *bd)
1183{
1184  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_EMPTY);
1185}
1186
1187static void
1188rtems_bdbuf_make_cached_and_add_to_lru_list (rtems_bdbuf_buffer *bd)
1189{
1190  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_CACHED);
1191  rtems_chain_append_unprotected (&bdbuf_cache.lru, &bd->link);
1192}
1193
1194static void
1195rtems_bdbuf_discard_buffer (rtems_bdbuf_buffer *bd)
1196{
1197  rtems_bdbuf_make_empty (bd);
1198
1199  if (bd->waiters == 0)
1200  {
1201    rtems_bdbuf_remove_from_tree (bd);
1202    rtems_bdbuf_make_free_and_add_to_lru_list (bd);
1203  }
1204}
1205
1206static void
1207rtems_bdbuf_add_to_modified_list_after_access (rtems_bdbuf_buffer *bd)
1208{
1209  if (bdbuf_cache.sync_active && bdbuf_cache.sync_device == bd->dd)
1210  {
1211    rtems_bdbuf_unlock_cache ();
1212
1213    /*
1214     * Wait for the sync lock.
1215     */
1216    rtems_bdbuf_lock_sync ();
1217
1218    rtems_bdbuf_unlock_sync ();
1219    rtems_bdbuf_lock_cache ();
1220  }
1221
1222  /*
1223   * Only the first modified release sets the timer and any further user
1224   * accesses do not change the timer value which should move down. This
1225   * assumes the user's hold of the buffer is much less than the time on the
1226   * modified list. Resetting the timer on each access which could result in a
1227   * buffer never getting to 0 and never being forced onto disk. This raises a
1228   * difficult question. Is a snapshot of a block that is changing better than
1229   * nothing being written? We have tended to think we should hold changes for
1230   * only a specific period of time even if still changing and get onto disk
1231   * and letting the file system try and recover this position if it can.
1232   */
1233  if (bd->state == RTEMS_BDBUF_STATE_ACCESS_CACHED
1234        || bd->state == RTEMS_BDBUF_STATE_ACCESS_EMPTY)
1235    bd->hold_timer = bdbuf_config.swap_block_hold;
1236
1237  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_MODIFIED);
1238  rtems_chain_append_unprotected (&bdbuf_cache.modified, &bd->link);
1239
1240  if (bd->waiters)
1241    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);
1242  else if (rtems_bdbuf_has_buffer_waiters ())
1243    rtems_bdbuf_wake_swapper ();
1244}
1245
1246static void
1247rtems_bdbuf_add_to_lru_list_after_access (rtems_bdbuf_buffer *bd)
1248{
1249  rtems_bdbuf_group_release (bd);
1250  rtems_bdbuf_make_cached_and_add_to_lru_list (bd);
1251
1252  if (bd->waiters)
1253    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);
1254  else
1255    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1256}
1257
1258/**
1259 * Compute the number of BDs per group for a given buffer size.
1260 *
1261 * @param size The buffer size. It can be any size and we scale up.
1262 */
1263static size_t
1264rtems_bdbuf_bds_per_group (size_t size)
1265{
1266  size_t bufs_per_size;
1267  size_t bds_per_size;
1268
1269  if (size > bdbuf_config.buffer_max)
1270    return 0;
1271
1272  bufs_per_size = ((size - 1) / bdbuf_config.buffer_min) + 1;
1273
1274  for (bds_per_size = 1;
1275       bds_per_size < bufs_per_size;
1276       bds_per_size <<= 1)
1277    ;
1278
1279  return bdbuf_cache.max_bds_per_group / bds_per_size;
1280}
1281
1282static void
1283rtems_bdbuf_discard_buffer_after_access (rtems_bdbuf_buffer *bd)
1284{
1285  rtems_bdbuf_group_release (bd);
1286  rtems_bdbuf_discard_buffer (bd);
1287
1288  if (bd->waiters)
1289    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);
1290  else
1291    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1292}
1293
1294/**
1295 * Reallocate a group. The BDs currently allocated in the group are removed
1296 * from the ALV tree and any lists then the new BD's are prepended to the ready
1297 * list of the cache.
1298 *
1299 * @param group The group to reallocate.
1300 * @param new_bds_per_group The new count of BDs per group.
1301 * @return A buffer of this group.
1302 */
1303static rtems_bdbuf_buffer *
1304rtems_bdbuf_group_realloc (rtems_bdbuf_group* group, size_t new_bds_per_group)
1305{
1306  rtems_bdbuf_buffer* bd;
1307  size_t              b;
1308  size_t              bufs_per_bd;
1309
1310  if (rtems_bdbuf_tracer)
1311    printf ("bdbuf:realloc: %tu: %zd -> %zd\n",
1312            group - bdbuf_cache.groups, group->bds_per_group,
1313            new_bds_per_group);
1314
1315  bufs_per_bd = bdbuf_cache.max_bds_per_group / group->bds_per_group;
1316
1317  for (b = 0, bd = group->bdbuf;
1318       b < group->bds_per_group;
1319       b++, bd += bufs_per_bd)
1320    rtems_bdbuf_remove_from_tree_and_lru_list (bd);
1321
1322  group->bds_per_group = new_bds_per_group;
1323  bufs_per_bd = bdbuf_cache.max_bds_per_group / new_bds_per_group;
1324
1325  for (b = 1, bd = group->bdbuf + bufs_per_bd;
1326       b < group->bds_per_group;
1327       b++, bd += bufs_per_bd)
1328    rtems_bdbuf_make_free_and_add_to_lru_list (bd);
1329
1330  if (b > 1)
1331    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1332
1333  return group->bdbuf;
1334}
1335
1336static void
1337rtems_bdbuf_setup_empty_buffer (rtems_bdbuf_buffer *bd,
1338                                rtems_disk_device  *dd,
1339                                rtems_blkdev_bnum   block)
1340{
1341  bd->dd        = dd ;
1342  bd->block     = block;
1343  bd->avl.left  = NULL;
1344  bd->avl.right = NULL;
1345  bd->waiters   = 0;
1346
1347  if (rtems_bdbuf_avl_insert (&bdbuf_cache.tree, bd) != 0)
1348    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_RECYCLE);
1349
1350  rtems_bdbuf_make_empty (bd);
1351}
1352
1353static rtems_bdbuf_buffer *
1354rtems_bdbuf_get_buffer_from_lru_list (rtems_disk_device *dd,
1355                                      rtems_blkdev_bnum  block)
1356{
1357  rtems_chain_node *node = rtems_chain_first (&bdbuf_cache.lru);
1358
1359  while (!rtems_chain_is_tail (&bdbuf_cache.lru, node))
1360  {
1361    rtems_bdbuf_buffer *bd = (rtems_bdbuf_buffer *) node;
1362    rtems_bdbuf_buffer *empty_bd = NULL;
1363
1364    if (rtems_bdbuf_tracer)
1365      printf ("bdbuf:next-bd: %tu (%td:%" PRId32 ") %zd -> %zd\n",
1366              bd - bdbuf_cache.bds,
1367              bd->group - bdbuf_cache.groups, bd->group->users,
1368              bd->group->bds_per_group, dd->bds_per_group);
1369
1370    /*
1371     * If nobody waits for this BD, we may recycle it.
1372     */
1373    if (bd->waiters == 0)
1374    {
1375      if (bd->group->bds_per_group == dd->bds_per_group)
1376      {
1377        rtems_bdbuf_remove_from_tree_and_lru_list (bd);
1378
1379        empty_bd = bd;
1380      }
1381      else if (bd->group->users == 0)
1382        empty_bd = rtems_bdbuf_group_realloc (bd->group, dd->bds_per_group);
1383    }
1384
1385    if (empty_bd != NULL)
1386    {
1387      rtems_bdbuf_setup_empty_buffer (empty_bd, dd, block);
1388
1389      return empty_bd;
1390    }
1391
1392    node = rtems_chain_next (node);
1393  }
1394
1395  return NULL;
1396}
1397
1398static rtems_status_code
1399rtems_bdbuf_create_task(
1400  rtems_name name,
1401  rtems_task_priority priority,
1402  rtems_task_priority default_priority,
1403  rtems_id *id
1404)
1405{
1406  rtems_status_code sc;
1407  size_t stack_size = bdbuf_config.task_stack_size ?
1408    bdbuf_config.task_stack_size : RTEMS_BDBUF_TASK_STACK_SIZE_DEFAULT;
1409
1410  priority = priority != 0 ? priority : default_priority;
1411
1412  sc = rtems_task_create (name,
1413                          priority,
1414                          stack_size,
1415                          RTEMS_PREEMPT | RTEMS_NO_TIMESLICE | RTEMS_NO_ASR,
1416                          RTEMS_LOCAL | RTEMS_NO_FLOATING_POINT,
1417                          id);
1418
1419  return sc;
1420}
1421
1422static rtems_bdbuf_swapout_transfer*
1423rtems_bdbuf_swapout_transfer_alloc (void)
1424{
1425  /*
1426   * @note chrisj The rtems_blkdev_request and the array at the end is a hack.
1427   * I am disappointment at finding code like this in RTEMS. The request should
1428   * have been a rtems_chain_control. Simple, fast and less storage as the node
1429   * is already part of the buffer structure.
1430   */
1431  size_t transfer_size = sizeof (rtems_bdbuf_swapout_transfer)
1432    + (bdbuf_config.max_write_blocks * sizeof (rtems_blkdev_sg_buffer));
1433  return calloc (1, transfer_size);
1434}
1435
1436static void
1437rtems_bdbuf_transfer_done (rtems_blkdev_request* req, rtems_status_code status);
1438
1439static void
1440rtems_bdbuf_swapout_transfer_init (rtems_bdbuf_swapout_transfer* transfer,
1441                                   rtems_id id)
1442{
1443  rtems_chain_initialize_empty (&transfer->bds);
1444  transfer->dd = BDBUF_INVALID_DEV;
1445  transfer->syncing = false;
1446  transfer->write_req.req = RTEMS_BLKDEV_REQ_WRITE;
1447  transfer->write_req.done = rtems_bdbuf_transfer_done;
1448  transfer->write_req.io_task = id;
1449}
1450
1451static size_t
1452rtems_bdbuf_swapout_worker_size (void)
1453{
1454  return sizeof (rtems_bdbuf_swapout_worker)
1455    + (bdbuf_config.max_write_blocks * sizeof (rtems_blkdev_sg_buffer));
1456}
1457
1458static rtems_task
1459rtems_bdbuf_swapout_worker_task (rtems_task_argument arg);
1460
1461static rtems_status_code
1462rtems_bdbuf_swapout_workers_create (void)
1463{
1464  rtems_status_code  sc;
1465  size_t             w;
1466  size_t             worker_size;
1467  char              *worker_current;
1468
1469  worker_size = rtems_bdbuf_swapout_worker_size ();
1470  worker_current = calloc (1, bdbuf_config.swapout_workers * worker_size);
1471  sc = worker_current != NULL ? RTEMS_SUCCESSFUL : RTEMS_NO_MEMORY;
1472
1473  bdbuf_cache.swapout_workers = (rtems_bdbuf_swapout_worker *) worker_current;
1474
1475  for (w = 0;
1476       sc == RTEMS_SUCCESSFUL && w < bdbuf_config.swapout_workers;
1477       w++, worker_current += worker_size)
1478  {
1479    rtems_bdbuf_swapout_worker *worker = (rtems_bdbuf_swapout_worker *) worker_current;
1480
1481    sc = rtems_bdbuf_create_task (rtems_build_name('B', 'D', 'o', 'a' + w),
1482                                  bdbuf_config.swapout_worker_priority,
1483                                  RTEMS_BDBUF_SWAPOUT_WORKER_TASK_PRIORITY_DEFAULT,
1484                                  &worker->id);
1485    if (sc == RTEMS_SUCCESSFUL)
1486    {
1487      rtems_bdbuf_swapout_transfer_init (&worker->transfer, worker->id);
1488
1489      rtems_chain_append_unprotected (&bdbuf_cache.swapout_free_workers, &worker->link);
1490      worker->enabled = true;
1491
1492      sc = rtems_task_start (worker->id,
1493                             rtems_bdbuf_swapout_worker_task,
1494                             (rtems_task_argument) worker);
1495    }
1496  }
1497
1498  return sc;
1499}
1500
1501static size_t
1502rtems_bdbuf_read_request_size (uint32_t transfer_count)
1503{
1504  return sizeof (rtems_blkdev_request)
1505    + sizeof (rtems_blkdev_sg_buffer) * transfer_count;
1506}
1507
1508static rtems_status_code
1509rtems_bdbuf_do_init (void)
1510{
1511  rtems_bdbuf_group*  group;
1512  rtems_bdbuf_buffer* bd;
1513  uint8_t*            buffer;
1514  size_t              b;
1515  rtems_status_code   sc;
1516
1517  if (rtems_bdbuf_tracer)
1518    printf ("bdbuf:init\n");
1519
1520  if (rtems_interrupt_is_in_progress())
1521    return RTEMS_CALLED_FROM_ISR;
1522
1523  /*
1524   * Check the configuration table values.
1525   */
1526
1527  if ((bdbuf_config.buffer_max % bdbuf_config.buffer_min) != 0)
1528    return RTEMS_INVALID_NUMBER;
1529
1530  if (rtems_bdbuf_read_request_size (bdbuf_config.max_read_ahead_blocks)
1531      > RTEMS_MINIMUM_STACK_SIZE / 8U)
1532    return RTEMS_INVALID_NUMBER;
1533
1534  bdbuf_cache.sync_device = BDBUF_INVALID_DEV;
1535
1536  rtems_chain_initialize_empty (&bdbuf_cache.swapout_free_workers);
1537  rtems_chain_initialize_empty (&bdbuf_cache.lru);
1538  rtems_chain_initialize_empty (&bdbuf_cache.modified);
1539  rtems_chain_initialize_empty (&bdbuf_cache.sync);
1540  rtems_chain_initialize_empty (&bdbuf_cache.read_ahead_chain);
1541
1542  /*
1543   * Create the locks for the cache.
1544   */
1545
1546  sc = rtems_bdbuf_lock_create (rtems_build_name ('B', 'D', 'C', 'l'),
1547                                &bdbuf_cache.lock);
1548  if (sc != RTEMS_SUCCESSFUL)
1549    goto error;
1550
1551  rtems_bdbuf_lock_cache ();
1552
1553  sc = rtems_bdbuf_lock_create (rtems_build_name ('B', 'D', 'C', 's'),
1554                                &bdbuf_cache.sync_lock);
1555  if (sc != RTEMS_SUCCESSFUL)
1556    goto error;
1557
1558  sc = rtems_bdbuf_waiter_create (rtems_build_name ('B', 'D', 'C', 'a'),
1559                                  &bdbuf_cache.access_waiters);
1560  if (sc != RTEMS_SUCCESSFUL)
1561    goto error;
1562
1563  sc = rtems_bdbuf_waiter_create (rtems_build_name ('B', 'D', 'C', 't'),
1564                                  &bdbuf_cache.transfer_waiters);
1565  if (sc != RTEMS_SUCCESSFUL)
1566    goto error;
1567
1568  sc = rtems_bdbuf_waiter_create (rtems_build_name ('B', 'D', 'C', 'b'),
1569                                  &bdbuf_cache.buffer_waiters);
1570  if (sc != RTEMS_SUCCESSFUL)
1571    goto error;
1572
1573  /*
1574   * Compute the various number of elements in the cache.
1575   */
1576  bdbuf_cache.buffer_min_count =
1577    bdbuf_config.size / bdbuf_config.buffer_min;
1578  bdbuf_cache.max_bds_per_group =
1579    bdbuf_config.buffer_max / bdbuf_config.buffer_min;
1580  bdbuf_cache.group_count =
1581    bdbuf_cache.buffer_min_count / bdbuf_cache.max_bds_per_group;
1582
1583  /*
1584   * Allocate the memory for the buffer descriptors.
1585   */
1586  bdbuf_cache.bds = calloc (sizeof (rtems_bdbuf_buffer),
1587                            bdbuf_cache.buffer_min_count);
1588  if (!bdbuf_cache.bds)
1589    goto error;
1590
1591  /*
1592   * Allocate the memory for the buffer descriptors.
1593   */
1594  bdbuf_cache.groups = calloc (sizeof (rtems_bdbuf_group),
1595                               bdbuf_cache.group_count);
1596  if (!bdbuf_cache.groups)
1597    goto error;
1598
1599  /*
1600   * Allocate memory for buffer memory. The buffer memory will be cache
1601   * aligned. It is possible to free the memory allocated by
1602   * rtems_cache_aligned_malloc() with free().
1603   */
1604  bdbuf_cache.buffers = rtems_cache_aligned_malloc(bdbuf_cache.buffer_min_count
1605                                                   * bdbuf_config.buffer_min);
1606  if (bdbuf_cache.buffers == NULL)
1607    goto error;
1608
1609  /*
1610   * The cache is empty after opening so we need to add all the buffers to it
1611   * and initialise the groups.
1612   */
1613  for (b = 0, group = bdbuf_cache.groups,
1614         bd = bdbuf_cache.bds, buffer = bdbuf_cache.buffers;
1615       b < bdbuf_cache.buffer_min_count;
1616       b++, bd++, buffer += bdbuf_config.buffer_min)
1617  {
1618    bd->dd    = BDBUF_INVALID_DEV;
1619    bd->group  = group;
1620    bd->buffer = buffer;
1621
1622    rtems_chain_append_unprotected (&bdbuf_cache.lru, &bd->link);
1623
1624    if ((b % bdbuf_cache.max_bds_per_group) ==
1625        (bdbuf_cache.max_bds_per_group - 1))
1626      group++;
1627  }
1628
1629  for (b = 0,
1630         group = bdbuf_cache.groups,
1631         bd = bdbuf_cache.bds;
1632       b < bdbuf_cache.group_count;
1633       b++,
1634         group++,
1635         bd += bdbuf_cache.max_bds_per_group)
1636  {
1637    group->bds_per_group = bdbuf_cache.max_bds_per_group;
1638    group->bdbuf = bd;
1639  }
1640
1641  /*
1642   * Create and start swapout task.
1643   */
1644
1645  bdbuf_cache.swapout_transfer = rtems_bdbuf_swapout_transfer_alloc ();
1646  if (!bdbuf_cache.swapout_transfer)
1647    goto error;
1648
1649  bdbuf_cache.swapout_enabled = true;
1650
1651  sc = rtems_bdbuf_create_task (rtems_build_name('B', 'S', 'W', 'P'),
1652                                bdbuf_config.swapout_priority,
1653                                RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT,
1654                                &bdbuf_cache.swapout);
1655  if (sc != RTEMS_SUCCESSFUL)
1656    goto error;
1657
1658  rtems_bdbuf_swapout_transfer_init (bdbuf_cache.swapout_transfer, bdbuf_cache.swapout);
1659
1660  sc = rtems_task_start (bdbuf_cache.swapout,
1661                         rtems_bdbuf_swapout_task,
1662                         (rtems_task_argument) bdbuf_cache.swapout_transfer);
1663  if (sc != RTEMS_SUCCESSFUL)
1664    goto error;
1665
1666  if (bdbuf_config.swapout_workers > 0)
1667  {
1668    sc = rtems_bdbuf_swapout_workers_create ();
1669    if (sc != RTEMS_SUCCESSFUL)
1670      goto error;
1671  }
1672
1673  if (bdbuf_config.max_read_ahead_blocks > 0)
1674  {
1675    bdbuf_cache.read_ahead_enabled = true;
1676    sc = rtems_bdbuf_create_task (rtems_build_name('B', 'R', 'D', 'A'),
1677                                  bdbuf_config.read_ahead_priority,
1678                                  RTEMS_BDBUF_READ_AHEAD_TASK_PRIORITY_DEFAULT,
1679                                  &bdbuf_cache.read_ahead_task);
1680    if (sc != RTEMS_SUCCESSFUL)
1681      goto error;
1682
1683    sc = rtems_task_start (bdbuf_cache.read_ahead_task,
1684                           rtems_bdbuf_read_ahead_task,
1685                           0);
1686    if (sc != RTEMS_SUCCESSFUL)
1687      goto error;
1688  }
1689
1690  rtems_bdbuf_unlock_cache ();
1691
1692  return RTEMS_SUCCESSFUL;
1693
1694error:
1695
1696  if (bdbuf_cache.read_ahead_task != 0)
1697    rtems_task_delete (bdbuf_cache.read_ahead_task);
1698
1699  if (bdbuf_cache.swapout != 0)
1700    rtems_task_delete (bdbuf_cache.swapout);
1701
1702  if (bdbuf_cache.swapout_workers)
1703  {
1704    char   *worker_current = (char *) bdbuf_cache.swapout_workers;
1705    size_t  worker_size = rtems_bdbuf_swapout_worker_size ();
1706    size_t  w;
1707
1708    for (w = 0;
1709         w < bdbuf_config.swapout_workers;
1710         w++, worker_current += worker_size)
1711    {
1712      rtems_bdbuf_swapout_worker *worker = (rtems_bdbuf_swapout_worker *) worker_current;
1713
1714      if (worker->id != 0) {
1715        rtems_task_delete (worker->id);
1716      }
1717    }
1718  }
1719
1720  free (bdbuf_cache.buffers);
1721  free (bdbuf_cache.groups);
1722  free (bdbuf_cache.bds);
1723  free (bdbuf_cache.swapout_transfer);
1724  free (bdbuf_cache.swapout_workers);
1725
1726  rtems_bdbuf_waiter_delete (&bdbuf_cache.buffer_waiters);
1727  rtems_bdbuf_waiter_delete (&bdbuf_cache.access_waiters);
1728  rtems_bdbuf_waiter_delete (&bdbuf_cache.transfer_waiters);
1729  rtems_bdbuf_lock_delete (&bdbuf_cache.sync_lock);
1730
1731  if (bdbuf_cache.lock != 0)
1732  {
1733    rtems_bdbuf_unlock_cache ();
1734    rtems_bdbuf_lock_delete (&bdbuf_cache.lock);
1735  }
1736
1737  return RTEMS_UNSATISFIED;
1738}
1739
1740static void
1741rtems_bdbuf_init_once (void)
1742{
1743  bdbuf_cache.init_status = rtems_bdbuf_do_init();
1744}
1745
1746rtems_status_code
1747rtems_bdbuf_init (void)
1748{
1749  int eno = pthread_once (&rtems_bdbuf_once_state, rtems_bdbuf_init_once);
1750
1751  if (eno != 0)
1752    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_ONCE);
1753
1754  return bdbuf_cache.init_status;
1755}
1756
1757static void
1758rtems_bdbuf_wait_for_event (rtems_event_set event)
1759{
1760  rtems_status_code sc = RTEMS_SUCCESSFUL;
1761  rtems_event_set   out = 0;
1762
1763  sc = rtems_event_receive (event,
1764                            RTEMS_EVENT_ALL | RTEMS_WAIT,
1765                            RTEMS_NO_TIMEOUT,
1766                            &out);
1767
1768  if (sc != RTEMS_SUCCESSFUL || out != event)
1769    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_WAIT_EVNT);
1770}
1771
1772static void
1773rtems_bdbuf_wait_for_transient_event (void)
1774{
1775  rtems_status_code sc = RTEMS_SUCCESSFUL;
1776
1777  sc = rtems_event_transient_receive (RTEMS_WAIT, RTEMS_NO_TIMEOUT);
1778  if (sc != RTEMS_SUCCESSFUL)
1779    rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_WAIT_TRANS_EVNT);
1780}
1781
1782static void
1783rtems_bdbuf_wait_for_access (rtems_bdbuf_buffer *bd)
1784{
1785  while (true)
1786  {
1787    switch (bd->state)
1788    {
1789      case RTEMS_BDBUF_STATE_MODIFIED:
1790        rtems_bdbuf_group_release (bd);
1791        /* Fall through */
1792      case RTEMS_BDBUF_STATE_CACHED:
1793        rtems_chain_extract_unprotected (&bd->link);
1794        /* Fall through */
1795      case RTEMS_BDBUF_STATE_EMPTY:
1796        return;
1797      case RTEMS_BDBUF_STATE_ACCESS_CACHED:
1798      case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
1799      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
1800      case RTEMS_BDBUF_STATE_ACCESS_PURGED:
1801        rtems_bdbuf_wait (bd, &bdbuf_cache.access_waiters);
1802        break;
1803      case RTEMS_BDBUF_STATE_SYNC:
1804      case RTEMS_BDBUF_STATE_TRANSFER:
1805      case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
1806        rtems_bdbuf_wait (bd, &bdbuf_cache.transfer_waiters);
1807        break;
1808      default:
1809        rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_7);
1810    }
1811  }
1812}
1813
1814static void
1815rtems_bdbuf_request_sync_for_modified_buffer (rtems_bdbuf_buffer *bd)
1816{
1817  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_SYNC);
1818  rtems_chain_extract_unprotected (&bd->link);
1819  rtems_chain_append_unprotected (&bdbuf_cache.sync, &bd->link);
1820  rtems_bdbuf_wake_swapper ();
1821}
1822
1823/**
1824 * @brief Waits until the buffer is ready for recycling.
1825 *
1826 * @retval @c true Buffer is valid and may be recycled.
1827 * @retval @c false Buffer is invalid and has to searched again.
1828 */
1829static bool
1830rtems_bdbuf_wait_for_recycle (rtems_bdbuf_buffer *bd)
1831{
1832  while (true)
1833  {
1834    switch (bd->state)
1835    {
1836      case RTEMS_BDBUF_STATE_FREE:
1837        return true;
1838      case RTEMS_BDBUF_STATE_MODIFIED:
1839        rtems_bdbuf_request_sync_for_modified_buffer (bd);
1840        break;
1841      case RTEMS_BDBUF_STATE_CACHED:
1842      case RTEMS_BDBUF_STATE_EMPTY:
1843        if (bd->waiters == 0)
1844          return true;
1845        else
1846        {
1847          /*
1848           * It is essential that we wait here without a special wait count and
1849           * without the group in use.  Otherwise we could trigger a wait ping
1850           * pong with another recycle waiter.  The state of the buffer is
1851           * arbitrary afterwards.
1852           */
1853          rtems_bdbuf_anonymous_wait (&bdbuf_cache.buffer_waiters);
1854          return false;
1855        }
1856      case RTEMS_BDBUF_STATE_ACCESS_CACHED:
1857      case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
1858      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
1859      case RTEMS_BDBUF_STATE_ACCESS_PURGED:
1860        rtems_bdbuf_wait (bd, &bdbuf_cache.access_waiters);
1861        break;
1862      case RTEMS_BDBUF_STATE_SYNC:
1863      case RTEMS_BDBUF_STATE_TRANSFER:
1864      case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
1865        rtems_bdbuf_wait (bd, &bdbuf_cache.transfer_waiters);
1866        break;
1867      default:
1868        rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_8);
1869    }
1870  }
1871}
1872
1873static void
1874rtems_bdbuf_wait_for_sync_done (rtems_bdbuf_buffer *bd)
1875{
1876  while (true)
1877  {
1878    switch (bd->state)
1879    {
1880      case RTEMS_BDBUF_STATE_CACHED:
1881      case RTEMS_BDBUF_STATE_EMPTY:
1882      case RTEMS_BDBUF_STATE_MODIFIED:
1883      case RTEMS_BDBUF_STATE_ACCESS_CACHED:
1884      case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
1885      case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
1886      case RTEMS_BDBUF_STATE_ACCESS_PURGED:
1887        return;
1888      case RTEMS_BDBUF_STATE_SYNC:
1889      case RTEMS_BDBUF_STATE_TRANSFER:
1890      case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
1891        rtems_bdbuf_wait (bd, &bdbuf_cache.transfer_waiters);
1892        break;
1893      default:
1894        rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_9);
1895    }
1896  }
1897}
1898
1899static void
1900rtems_bdbuf_wait_for_buffer (void)
1901{
1902  if (!rtems_chain_is_empty (&bdbuf_cache.modified))
1903    rtems_bdbuf_wake_swapper ();
1904
1905  rtems_bdbuf_anonymous_wait (&bdbuf_cache.buffer_waiters);
1906}
1907
1908static void
1909rtems_bdbuf_sync_after_access (rtems_bdbuf_buffer *bd)
1910{
1911  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_SYNC);
1912
1913  rtems_chain_append_unprotected (&bdbuf_cache.sync, &bd->link);
1914
1915  if (bd->waiters)
1916    rtems_bdbuf_wake (&bdbuf_cache.access_waiters);
1917
1918  rtems_bdbuf_wake_swapper ();
1919  rtems_bdbuf_wait_for_sync_done (bd);
1920
1921  /*
1922   * We may have created a cached or empty buffer which may be recycled.
1923   */
1924  if (bd->waiters == 0
1925        && (bd->state == RTEMS_BDBUF_STATE_CACHED
1926          || bd->state == RTEMS_BDBUF_STATE_EMPTY))
1927  {
1928    if (bd->state == RTEMS_BDBUF_STATE_EMPTY)
1929    {
1930      rtems_bdbuf_remove_from_tree (bd);
1931      rtems_bdbuf_make_free_and_add_to_lru_list (bd);
1932    }
1933    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1934  }
1935}
1936
1937static rtems_bdbuf_buffer *
1938rtems_bdbuf_get_buffer_for_read_ahead (rtems_disk_device *dd,
1939                                       rtems_blkdev_bnum  block)
1940{
1941  rtems_bdbuf_buffer *bd = NULL;
1942
1943  bd = rtems_bdbuf_avl_search (&bdbuf_cache.tree, dd, block);
1944
1945  if (bd == NULL)
1946  {
1947    bd = rtems_bdbuf_get_buffer_from_lru_list (dd, block);
1948
1949    if (bd != NULL)
1950      rtems_bdbuf_group_obtain (bd);
1951  }
1952  else
1953    /*
1954     * The buffer is in the cache.  So it is already available or in use, and
1955     * thus no need for a read ahead.
1956     */
1957    bd = NULL;
1958
1959  return bd;
1960}
1961
1962static rtems_bdbuf_buffer *
1963rtems_bdbuf_get_buffer_for_access (rtems_disk_device *dd,
1964                                   rtems_blkdev_bnum  block)
1965{
1966  rtems_bdbuf_buffer *bd = NULL;
1967
1968  do
1969  {
1970    bd = rtems_bdbuf_avl_search (&bdbuf_cache.tree, dd, block);
1971
1972    if (bd != NULL)
1973    {
1974      if (bd->group->bds_per_group != dd->bds_per_group)
1975      {
1976        if (rtems_bdbuf_wait_for_recycle (bd))
1977        {
1978          rtems_bdbuf_remove_from_tree_and_lru_list (bd);
1979          rtems_bdbuf_make_free_and_add_to_lru_list (bd);
1980          rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
1981        }
1982        bd = NULL;
1983      }
1984    }
1985    else
1986    {
1987      bd = rtems_bdbuf_get_buffer_from_lru_list (dd, block);
1988
1989      if (bd == NULL)
1990        rtems_bdbuf_wait_for_buffer ();
1991    }
1992  }
1993  while (bd == NULL);
1994
1995  rtems_bdbuf_wait_for_access (bd);
1996  rtems_bdbuf_group_obtain (bd);
1997
1998  return bd;
1999}
2000
2001static rtems_status_code
2002rtems_bdbuf_get_media_block (const rtems_disk_device *dd,
2003                             rtems_blkdev_bnum        block,
2004                             rtems_blkdev_bnum       *media_block_ptr)
2005{
2006  rtems_status_code sc = RTEMS_SUCCESSFUL;
2007
2008  if (block < dd->block_count)
2009  {
2010    /*
2011     * Compute the media block number. Drivers work with media block number not
2012     * the block number a BD may have as this depends on the block size set by
2013     * the user.
2014     */
2015    *media_block_ptr = rtems_bdbuf_media_block (dd, block) + dd->start;
2016  }
2017  else
2018  {
2019    sc = RTEMS_INVALID_ID;
2020  }
2021
2022  return sc;
2023}
2024
2025rtems_status_code
2026rtems_bdbuf_get (rtems_disk_device   *dd,
2027                 rtems_blkdev_bnum    block,
2028                 rtems_bdbuf_buffer **bd_ptr)
2029{
2030  rtems_status_code   sc = RTEMS_SUCCESSFUL;
2031  rtems_bdbuf_buffer *bd = NULL;
2032  rtems_blkdev_bnum   media_block;
2033
2034  rtems_bdbuf_lock_cache ();
2035
2036  sc = rtems_bdbuf_get_media_block (dd, block, &media_block);
2037  if (sc == RTEMS_SUCCESSFUL)
2038  {
2039    /*
2040     * Print the block index relative to the physical disk.
2041     */
2042    if (rtems_bdbuf_tracer)
2043      printf ("bdbuf:get: %" PRIu32 " (%" PRIu32 ") (dev = %08x)\n",
2044              media_block, block, (unsigned) dd->dev);
2045
2046    bd = rtems_bdbuf_get_buffer_for_access (dd, media_block);
2047
2048    switch (bd->state)
2049    {
2050      case RTEMS_BDBUF_STATE_CACHED:
2051        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_CACHED);
2052        break;
2053      case RTEMS_BDBUF_STATE_EMPTY:
2054        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_EMPTY);
2055        break;
2056      case RTEMS_BDBUF_STATE_MODIFIED:
2057        /*
2058         * To get a modified buffer could be considered a bug in the caller
2059         * because you should not be getting an already modified buffer but
2060         * user may have modified a byte in a block then decided to seek the
2061         * start and write the whole block and the file system will have no
2062         * record of this so just gets the block to fill.
2063         */
2064        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_MODIFIED);
2065        break;
2066      default:
2067        rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_2);
2068        break;
2069    }
2070
2071    if (rtems_bdbuf_tracer)
2072    {
2073      rtems_bdbuf_show_users ("get", bd);
2074      rtems_bdbuf_show_usage ();
2075    }
2076  }
2077
2078  rtems_bdbuf_unlock_cache ();
2079
2080  *bd_ptr = bd;
2081
2082  return sc;
2083}
2084
2085/**
2086 * Call back handler called by the low level driver when the transfer has
2087 * completed. This function may be invoked from interrupt handler.
2088 *
2089 * @param arg Arbitrary argument specified in block device request
2090 *            structure (in this case - pointer to the appropriate
2091 *            block device request structure).
2092 * @param status I/O completion status
2093 */
2094static void
2095rtems_bdbuf_transfer_done (rtems_blkdev_request* req, rtems_status_code status)
2096{
2097  req->status = status;
2098
2099  rtems_event_transient_send (req->io_task);
2100}
2101
2102static rtems_status_code
2103rtems_bdbuf_execute_transfer_request (rtems_disk_device    *dd,
2104                                      rtems_blkdev_request *req,
2105                                      bool                  cache_locked)
2106{
2107  rtems_status_code sc = RTEMS_SUCCESSFUL;
2108  uint32_t transfer_index = 0;
2109  bool wake_transfer_waiters = false;
2110  bool wake_buffer_waiters = false;
2111
2112  if (cache_locked)
2113    rtems_bdbuf_unlock_cache ();
2114
2115  /* The return value will be ignored for transfer requests */
2116  dd->ioctl (dd->phys_dev, RTEMS_BLKIO_REQUEST, req);
2117
2118  /* Wait for transfer request completion */
2119  rtems_bdbuf_wait_for_transient_event ();
2120  sc = req->status;
2121
2122  rtems_bdbuf_lock_cache ();
2123
2124  /* Statistics */
2125  if (req->req == RTEMS_BLKDEV_REQ_READ)
2126  {
2127    dd->stats.read_blocks += req->bufnum;
2128    if (sc != RTEMS_SUCCESSFUL)
2129      ++dd->stats.read_errors;
2130  }
2131  else
2132  {
2133    dd->stats.write_blocks += req->bufnum;
2134    ++dd->stats.write_transfers;
2135    if (sc != RTEMS_SUCCESSFUL)
2136      ++dd->stats.write_errors;
2137  }
2138
2139  for (transfer_index = 0; transfer_index < req->bufnum; ++transfer_index)
2140  {
2141    rtems_bdbuf_buffer *bd = req->bufs [transfer_index].user;
2142    bool waiters = bd->waiters;
2143
2144    if (waiters)
2145      wake_transfer_waiters = true;
2146    else
2147      wake_buffer_waiters = true;
2148
2149    rtems_bdbuf_group_release (bd);
2150
2151    if (sc == RTEMS_SUCCESSFUL && bd->state == RTEMS_BDBUF_STATE_TRANSFER)
2152      rtems_bdbuf_make_cached_and_add_to_lru_list (bd);
2153    else
2154      rtems_bdbuf_discard_buffer (bd);
2155
2156    if (rtems_bdbuf_tracer)
2157      rtems_bdbuf_show_users ("transfer", bd);
2158  }
2159
2160  if (wake_transfer_waiters)
2161    rtems_bdbuf_wake (&bdbuf_cache.transfer_waiters);
2162
2163  if (wake_buffer_waiters)
2164    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
2165
2166  if (!cache_locked)
2167    rtems_bdbuf_unlock_cache ();
2168
2169  if (sc == RTEMS_SUCCESSFUL || sc == RTEMS_UNSATISFIED)
2170    return sc;
2171  else
2172    return RTEMS_IO_ERROR;
2173}
2174
2175static rtems_status_code
2176rtems_bdbuf_execute_read_request (rtems_disk_device  *dd,
2177                                  rtems_bdbuf_buffer *bd,
2178                                  uint32_t            transfer_count)
2179{
2180  rtems_blkdev_request *req = NULL;
2181  rtems_blkdev_bnum media_block = bd->block;
2182  uint32_t media_blocks_per_block = dd->media_blocks_per_block;
2183  uint32_t block_size = dd->block_size;
2184  uint32_t transfer_index = 1;
2185
2186  /*
2187   * TODO: This type of request structure is wrong and should be removed.
2188   */
2189#define bdbuf_alloc(size) __builtin_alloca (size)
2190
2191  req = bdbuf_alloc (rtems_bdbuf_read_request_size (transfer_count));
2192
2193  req->req = RTEMS_BLKDEV_REQ_READ;
2194  req->done = rtems_bdbuf_transfer_done;
2195  req->io_task = rtems_task_self ();
2196  req->bufnum = 0;
2197
2198  rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_TRANSFER);
2199
2200  req->bufs [0].user   = bd;
2201  req->bufs [0].block  = media_block;
2202  req->bufs [0].length = block_size;
2203  req->bufs [0].buffer = bd->buffer;
2204
2205  if (rtems_bdbuf_tracer)
2206    rtems_bdbuf_show_users ("read", bd);
2207
2208  while (transfer_index < transfer_count)
2209  {
2210    media_block += media_blocks_per_block;
2211
2212    bd = rtems_bdbuf_get_buffer_for_read_ahead (dd, media_block);
2213
2214    if (bd == NULL)
2215      break;
2216
2217    rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_TRANSFER);
2218
2219    req->bufs [transfer_index].user   = bd;
2220    req->bufs [transfer_index].block  = media_block;
2221    req->bufs [transfer_index].length = block_size;
2222    req->bufs [transfer_index].buffer = bd->buffer;
2223
2224    if (rtems_bdbuf_tracer)
2225      rtems_bdbuf_show_users ("read", bd);
2226
2227    ++transfer_index;
2228  }
2229
2230  req->bufnum = transfer_index;
2231
2232  return rtems_bdbuf_execute_transfer_request (dd, req, true);
2233}
2234
2235static bool
2236rtems_bdbuf_is_read_ahead_active (const rtems_disk_device *dd)
2237{
2238  return !rtems_chain_is_node_off_chain (&dd->read_ahead.node);
2239}
2240
2241static void
2242rtems_bdbuf_read_ahead_cancel (rtems_disk_device *dd)
2243{
2244  if (rtems_bdbuf_is_read_ahead_active (dd))
2245  {
2246    rtems_chain_extract_unprotected (&dd->read_ahead.node);
2247    rtems_chain_set_off_chain (&dd->read_ahead.node);
2248  }
2249}
2250
2251static void
2252rtems_bdbuf_read_ahead_reset (rtems_disk_device *dd)
2253{
2254  rtems_bdbuf_read_ahead_cancel (dd);
2255  dd->read_ahead.trigger = RTEMS_DISK_READ_AHEAD_NO_TRIGGER;
2256}
2257
2258static void
2259rtems_bdbuf_check_read_ahead_trigger (rtems_disk_device *dd,
2260                                      rtems_blkdev_bnum  block)
2261{
2262  if (bdbuf_cache.read_ahead_task != 0
2263      && dd->read_ahead.trigger == block
2264      && !rtems_bdbuf_is_read_ahead_active (dd))
2265  {
2266    rtems_status_code sc;
2267    rtems_chain_control *chain = &bdbuf_cache.read_ahead_chain;
2268
2269    if (rtems_chain_is_empty (chain))
2270    {
2271      sc = rtems_event_send (bdbuf_cache.read_ahead_task,
2272                             RTEMS_BDBUF_READ_AHEAD_WAKE_UP);
2273      if (sc != RTEMS_SUCCESSFUL)
2274        rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_RA_WAKE_UP);
2275    }
2276
2277    rtems_chain_append_unprotected (chain, &dd->read_ahead.node);
2278  }
2279}
2280
2281static void
2282rtems_bdbuf_set_read_ahead_trigger (rtems_disk_device *dd,
2283                                    rtems_blkdev_bnum  block)
2284{
2285  if (dd->read_ahead.trigger != block)
2286  {
2287    rtems_bdbuf_read_ahead_cancel (dd);
2288    dd->read_ahead.trigger = block + 1;
2289    dd->read_ahead.next = block + 2;
2290  }
2291}
2292
2293rtems_status_code
2294rtems_bdbuf_read (rtems_disk_device   *dd,
2295                  rtems_blkdev_bnum    block,
2296                  rtems_bdbuf_buffer **bd_ptr)
2297{
2298  rtems_status_code     sc = RTEMS_SUCCESSFUL;
2299  rtems_bdbuf_buffer   *bd = NULL;
2300  rtems_blkdev_bnum     media_block;
2301
2302  rtems_bdbuf_lock_cache ();
2303
2304  sc = rtems_bdbuf_get_media_block (dd, block, &media_block);
2305  if (sc == RTEMS_SUCCESSFUL)
2306  {
2307    if (rtems_bdbuf_tracer)
2308      printf ("bdbuf:read: %" PRIu32 " (%" PRIu32 ") (dev = %08x)\n",
2309              media_block, block, (unsigned) dd->dev);
2310
2311    bd = rtems_bdbuf_get_buffer_for_access (dd, media_block);
2312    switch (bd->state)
2313    {
2314      case RTEMS_BDBUF_STATE_CACHED:
2315        ++dd->stats.read_hits;
2316        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_CACHED);
2317        break;
2318      case RTEMS_BDBUF_STATE_MODIFIED:
2319        ++dd->stats.read_hits;
2320        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_MODIFIED);
2321        break;
2322      case RTEMS_BDBUF_STATE_EMPTY:
2323        ++dd->stats.read_misses;
2324        rtems_bdbuf_set_read_ahead_trigger (dd, block);
2325        sc = rtems_bdbuf_execute_read_request (dd, bd, 1);
2326        if (sc == RTEMS_SUCCESSFUL)
2327        {
2328          rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_ACCESS_CACHED);
2329          rtems_chain_extract_unprotected (&bd->link);
2330          rtems_bdbuf_group_obtain (bd);
2331        }
2332        else
2333        {
2334          bd = NULL;
2335        }
2336        break;
2337      default:
2338        rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_4);
2339        break;
2340    }
2341
2342    rtems_bdbuf_check_read_ahead_trigger (dd, block);
2343  }
2344
2345  rtems_bdbuf_unlock_cache ();
2346
2347  *bd_ptr = bd;
2348
2349  return sc;
2350}
2351
2352static rtems_status_code
2353rtems_bdbuf_check_bd_and_lock_cache (rtems_bdbuf_buffer *bd, const char *kind)
2354{
2355  if (bd == NULL)
2356    return RTEMS_INVALID_ADDRESS;
2357  if (rtems_bdbuf_tracer)
2358  {
2359    printf ("bdbuf:%s: %" PRIu32 "\n", kind, bd->block);
2360    rtems_bdbuf_show_users (kind, bd);
2361  }
2362  rtems_bdbuf_lock_cache();
2363
2364  return RTEMS_SUCCESSFUL;
2365}
2366
2367rtems_status_code
2368rtems_bdbuf_release (rtems_bdbuf_buffer *bd)
2369{
2370  rtems_status_code sc = RTEMS_SUCCESSFUL;
2371
2372  sc = rtems_bdbuf_check_bd_and_lock_cache (bd, "release");
2373  if (sc != RTEMS_SUCCESSFUL)
2374    return sc;
2375
2376  switch (bd->state)
2377  {
2378    case RTEMS_BDBUF_STATE_ACCESS_CACHED:
2379      rtems_bdbuf_add_to_lru_list_after_access (bd);
2380      break;
2381    case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
2382    case RTEMS_BDBUF_STATE_ACCESS_PURGED:
2383      rtems_bdbuf_discard_buffer_after_access (bd);
2384      break;
2385    case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
2386      rtems_bdbuf_add_to_modified_list_after_access (bd);
2387      break;
2388    default:
2389      rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_0);
2390      break;
2391  }
2392
2393  if (rtems_bdbuf_tracer)
2394    rtems_bdbuf_show_usage ();
2395
2396  rtems_bdbuf_unlock_cache ();
2397
2398  return RTEMS_SUCCESSFUL;
2399}
2400
2401rtems_status_code
2402rtems_bdbuf_release_modified (rtems_bdbuf_buffer *bd)
2403{
2404  rtems_status_code sc = RTEMS_SUCCESSFUL;
2405
2406  sc = rtems_bdbuf_check_bd_and_lock_cache (bd, "release modified");
2407  if (sc != RTEMS_SUCCESSFUL)
2408    return sc;
2409
2410  switch (bd->state)
2411  {
2412    case RTEMS_BDBUF_STATE_ACCESS_CACHED:
2413    case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
2414    case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
2415      rtems_bdbuf_add_to_modified_list_after_access (bd);
2416      break;
2417    case RTEMS_BDBUF_STATE_ACCESS_PURGED:
2418      rtems_bdbuf_discard_buffer_after_access (bd);
2419      break;
2420    default:
2421      rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_6);
2422      break;
2423  }
2424
2425  if (rtems_bdbuf_tracer)
2426    rtems_bdbuf_show_usage ();
2427
2428  rtems_bdbuf_unlock_cache ();
2429
2430  return RTEMS_SUCCESSFUL;
2431}
2432
2433rtems_status_code
2434rtems_bdbuf_sync (rtems_bdbuf_buffer *bd)
2435{
2436  rtems_status_code sc = RTEMS_SUCCESSFUL;
2437
2438  sc = rtems_bdbuf_check_bd_and_lock_cache (bd, "sync");
2439  if (sc != RTEMS_SUCCESSFUL)
2440    return sc;
2441
2442  switch (bd->state)
2443  {
2444    case RTEMS_BDBUF_STATE_ACCESS_CACHED:
2445    case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
2446    case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
2447      rtems_bdbuf_sync_after_access (bd);
2448      break;
2449    case RTEMS_BDBUF_STATE_ACCESS_PURGED:
2450      rtems_bdbuf_discard_buffer_after_access (bd);
2451      break;
2452    default:
2453      rtems_bdbuf_fatal_with_state (bd->state, RTEMS_BDBUF_FATAL_STATE_5);
2454      break;
2455  }
2456
2457  if (rtems_bdbuf_tracer)
2458    rtems_bdbuf_show_usage ();
2459
2460  rtems_bdbuf_unlock_cache ();
2461
2462  return RTEMS_SUCCESSFUL;
2463}
2464
2465rtems_status_code
2466rtems_bdbuf_syncdev (rtems_disk_device *dd)
2467{
2468  if (rtems_bdbuf_tracer)
2469    printf ("bdbuf:syncdev: %08x\n", (unsigned) dd->dev);
2470
2471  /*
2472   * Take the sync lock before locking the cache. Once we have the sync lock we
2473   * can lock the cache. If another thread has the sync lock it will cause this
2474   * thread to block until it owns the sync lock then it can own the cache. The
2475   * sync lock can only be obtained with the cache unlocked.
2476   */
2477  rtems_bdbuf_lock_sync ();
2478  rtems_bdbuf_lock_cache ();
2479
2480  /*
2481   * Set the cache to have a sync active for a specific device and let the swap
2482   * out task know the id of the requester to wake when done.
2483   *
2484   * The swap out task will negate the sync active flag when no more buffers
2485   * for the device are held on the "modified for sync" queues.
2486   */
2487  bdbuf_cache.sync_active    = true;
2488  bdbuf_cache.sync_requester = rtems_task_self ();
2489  bdbuf_cache.sync_device    = dd;
2490
2491  rtems_bdbuf_wake_swapper ();
2492  rtems_bdbuf_unlock_cache ();
2493  rtems_bdbuf_wait_for_transient_event ();
2494  rtems_bdbuf_unlock_sync ();
2495
2496  return RTEMS_SUCCESSFUL;
2497}
2498
2499/**
2500 * Swapout transfer to the driver. The driver will break this I/O into groups
2501 * of consecutive write requests is multiple consecutive buffers are required
2502 * by the driver. The cache is not locked.
2503 *
2504 * @param transfer The transfer transaction.
2505 */
2506static void
2507rtems_bdbuf_swapout_write (rtems_bdbuf_swapout_transfer* transfer)
2508{
2509  rtems_chain_node *node;
2510
2511  if (rtems_bdbuf_tracer)
2512    printf ("bdbuf:swapout transfer: %08x\n", (unsigned) transfer->dd->dev);
2513
2514  /*
2515   * If there are buffers to transfer to the media transfer them.
2516   */
2517  if (!rtems_chain_is_empty (&transfer->bds))
2518  {
2519    /*
2520     * The last block number used when the driver only supports
2521     * continuous blocks in a single request.
2522     */
2523    uint32_t last_block = 0;
2524
2525    rtems_disk_device *dd = transfer->dd;
2526    uint32_t media_blocks_per_block = dd->media_blocks_per_block;
2527    bool need_continuous_blocks =
2528      (dd->phys_dev->capabilities & RTEMS_BLKDEV_CAP_MULTISECTOR_CONT) != 0;
2529
2530    /*
2531     * Take as many buffers as configured and pass to the driver. Note, the
2532     * API to the drivers has an array of buffers and if a chain was passed
2533     * we could have just passed the list. If the driver API is updated it
2534     * should be possible to make this change with little effect in this
2535     * code. The array that is passed is broken in design and should be
2536     * removed. Merging members of a struct into the first member is
2537     * trouble waiting to happen.
2538     */
2539    transfer->write_req.status = RTEMS_RESOURCE_IN_USE;
2540    transfer->write_req.bufnum = 0;
2541
2542    while ((node = rtems_chain_get_unprotected(&transfer->bds)) != NULL)
2543    {
2544      rtems_bdbuf_buffer* bd = (rtems_bdbuf_buffer*) node;
2545      bool                write = false;
2546
2547      /*
2548       * If the device only accepts sequential buffers and this is not the
2549       * first buffer (the first is always sequential, and the buffer is not
2550       * sequential then put the buffer back on the transfer chain and write
2551       * the committed buffers.
2552       */
2553
2554      if (rtems_bdbuf_tracer)
2555        printf ("bdbuf:swapout write: bd:%" PRIu32 ", bufnum:%" PRIu32 " mode:%s\n",
2556                bd->block, transfer->write_req.bufnum,
2557                need_continuous_blocks ? "MULTI" : "SCAT");
2558
2559      if (need_continuous_blocks && transfer->write_req.bufnum &&
2560          bd->block != last_block + media_blocks_per_block)
2561      {
2562        rtems_chain_prepend_unprotected (&transfer->bds, &bd->link);
2563        write = true;
2564      }
2565      else
2566      {
2567        rtems_blkdev_sg_buffer* buf;
2568        buf = &transfer->write_req.bufs[transfer->write_req.bufnum];
2569        transfer->write_req.bufnum++;
2570        buf->user   = bd;
2571        buf->block  = bd->block;
2572        buf->length = dd->block_size;
2573        buf->buffer = bd->buffer;
2574        last_block  = bd->block;
2575      }
2576
2577      /*
2578       * Perform the transfer if there are no more buffers, or the transfer
2579       * size has reached the configured max. value.
2580       */
2581
2582      if (rtems_chain_is_empty (&transfer->bds) ||
2583          (transfer->write_req.bufnum >= bdbuf_config.max_write_blocks))
2584        write = true;
2585
2586      if (write)
2587      {
2588        rtems_bdbuf_execute_transfer_request (dd, &transfer->write_req, false);
2589
2590        transfer->write_req.status = RTEMS_RESOURCE_IN_USE;
2591        transfer->write_req.bufnum = 0;
2592      }
2593    }
2594
2595    /*
2596     * If sync'ing and the deivce is capability of handling a sync IO control
2597     * call perform the call.
2598     */
2599    if (transfer->syncing &&
2600        (dd->phys_dev->capabilities & RTEMS_BLKDEV_CAP_SYNC))
2601    {
2602      /* int result = */ dd->ioctl (dd->phys_dev, RTEMS_BLKDEV_REQ_SYNC, NULL);
2603      /* How should the error be handled ? */
2604    }
2605  }
2606}
2607
2608/**
2609 * Process the modified list of buffers. There is a sync or modified list that
2610 * needs to be handled so we have a common function to do the work.
2611 *
2612 * @param dd_ptr Pointer to the device to handle. If BDBUF_INVALID_DEV no
2613 * device is selected so select the device of the first buffer to be written to
2614 * disk.
2615 * @param chain The modified chain to process.
2616 * @param transfer The chain to append buffers to be written too.
2617 * @param sync_active If true this is a sync operation so expire all timers.
2618 * @param update_timers If true update the timers.
2619 * @param timer_delta It update_timers is true update the timers by this
2620 *                    amount.
2621 */
2622static void
2623rtems_bdbuf_swapout_modified_processing (rtems_disk_device  **dd_ptr,
2624                                         rtems_chain_control* chain,
2625                                         rtems_chain_control* transfer,
2626                                         bool                 sync_active,
2627                                         bool                 update_timers,
2628                                         uint32_t             timer_delta)
2629{
2630  if (!rtems_chain_is_empty (chain))
2631  {
2632    rtems_chain_node* node = rtems_chain_head (chain);
2633    bool              sync_all;
2634
2635    node = node->next;
2636
2637    /*
2638     * A sync active with no valid dev means sync all.
2639     */
2640    if (sync_active && (*dd_ptr == BDBUF_INVALID_DEV))
2641      sync_all = true;
2642    else
2643      sync_all = false;
2644
2645    while (!rtems_chain_is_tail (chain, node))
2646    {
2647      rtems_bdbuf_buffer* bd = (rtems_bdbuf_buffer*) node;
2648
2649      /*
2650       * Check if the buffer's hold timer has reached 0. If a sync is active
2651       * or someone waits for a buffer written force all the timers to 0.
2652       *
2653       * @note Lots of sync requests will skew this timer. It should be based
2654       *       on TOD to be accurate. Does it matter ?
2655       */
2656      if (sync_all || (sync_active && (*dd_ptr == bd->dd))
2657          || rtems_bdbuf_has_buffer_waiters ())
2658        bd->hold_timer = 0;
2659
2660      if (bd->hold_timer)
2661      {
2662        if (update_timers)
2663        {
2664          if (bd->hold_timer > timer_delta)
2665            bd->hold_timer -= timer_delta;
2666          else
2667            bd->hold_timer = 0;
2668        }
2669
2670        if (bd->hold_timer)
2671        {
2672          node = node->next;
2673          continue;
2674        }
2675      }
2676
2677      /*
2678       * This assumes we can set it to BDBUF_INVALID_DEV which is just an
2679       * assumption. Cannot use the transfer list being empty the sync dev
2680       * calls sets the dev to use.
2681       */
2682      if (*dd_ptr == BDBUF_INVALID_DEV)
2683        *dd_ptr = bd->dd;
2684
2685      if (bd->dd == *dd_ptr)
2686      {
2687        rtems_chain_node* next_node = node->next;
2688        rtems_chain_node* tnode = rtems_chain_tail (transfer);
2689
2690        /*
2691         * The blocks on the transfer list are sorted in block order. This
2692         * means multi-block transfers for drivers that require consecutive
2693         * blocks perform better with sorted blocks and for real disks it may
2694         * help lower head movement.
2695         */
2696
2697        rtems_bdbuf_set_state (bd, RTEMS_BDBUF_STATE_TRANSFER);
2698
2699        rtems_chain_extract_unprotected (node);
2700
2701        tnode = tnode->previous;
2702
2703        while (node && !rtems_chain_is_head (transfer, tnode))
2704        {
2705          rtems_bdbuf_buffer* tbd = (rtems_bdbuf_buffer*) tnode;
2706
2707          if (bd->block > tbd->block)
2708          {
2709            rtems_chain_insert_unprotected (tnode, node);
2710            node = NULL;
2711          }
2712          else
2713            tnode = tnode->previous;
2714        }
2715
2716        if (node)
2717          rtems_chain_prepend_unprotected (transfer, node);
2718
2719        node = next_node;
2720      }
2721      else
2722      {
2723        node = node->next;
2724      }
2725    }
2726  }
2727}
2728
2729/**
2730 * Process the cache's modified buffers. Check the sync list first then the
2731 * modified list extracting the buffers suitable to be written to disk. We have
2732 * a device at a time. The task level loop will repeat this operation while
2733 * there are buffers to be written. If the transfer fails place the buffers
2734 * back on the modified list and try again later. The cache is unlocked while
2735 * the buffers are being written to disk.
2736 *
2737 * @param timer_delta It update_timers is true update the timers by this
2738 *                    amount.
2739 * @param update_timers If true update the timers.
2740 * @param transfer The transfer transaction data.
2741 *
2742 * @retval true Buffers where written to disk so scan again.
2743 * @retval false No buffers where written to disk.
2744 */
2745static bool
2746rtems_bdbuf_swapout_processing (unsigned long                 timer_delta,
2747                                bool                          update_timers,
2748                                rtems_bdbuf_swapout_transfer* transfer)
2749{
2750  rtems_bdbuf_swapout_worker* worker;
2751  bool                        transfered_buffers = false;
2752  bool                        sync_active;
2753
2754  rtems_bdbuf_lock_cache ();
2755
2756  /*
2757   * To set this to true you need the cache and the sync lock.
2758   */
2759  sync_active = bdbuf_cache.sync_active;
2760
2761  /*
2762   * If a sync is active do not use a worker because the current code does not
2763   * cleaning up after. We need to know the buffers have been written when
2764   * syncing to release sync lock and currently worker threads do not return to
2765   * here. We do not know the worker is the last in a sequence of sync writes
2766   * until after we have it running so we do not know to tell it to release the
2767   * lock. The simplest solution is to get the main swap out task perform all
2768   * sync operations.
2769   */
2770  if (sync_active)
2771    worker = NULL;
2772  else
2773  {
2774    worker = (rtems_bdbuf_swapout_worker*)
2775      rtems_chain_get_unprotected (&bdbuf_cache.swapout_free_workers);
2776    if (worker)
2777      transfer = &worker->transfer;
2778  }
2779
2780  rtems_chain_initialize_empty (&transfer->bds);
2781  transfer->dd = BDBUF_INVALID_DEV;
2782  transfer->syncing = sync_active;
2783
2784  /*
2785   * When the sync is for a device limit the sync to that device. If the sync
2786   * is for a buffer handle process the devices in the order on the sync
2787   * list. This means the dev is BDBUF_INVALID_DEV.
2788   */
2789  if (sync_active)
2790    transfer->dd = bdbuf_cache.sync_device;
2791
2792  /*
2793   * If we have any buffers in the sync queue move them to the modified
2794   * list. The first sync buffer will select the device we use.
2795   */
2796  rtems_bdbuf_swapout_modified_processing (&transfer->dd,
2797                                           &bdbuf_cache.sync,
2798                                           &transfer->bds,
2799                                           true, false,
2800                                           timer_delta);
2801
2802  /*
2803   * Process the cache's modified list.
2804   */
2805  rtems_bdbuf_swapout_modified_processing (&transfer->dd,
2806                                           &bdbuf_cache.modified,
2807                                           &transfer->bds,
2808                                           sync_active,
2809                                           update_timers,
2810                                           timer_delta);
2811
2812  /*
2813   * We have all the buffers that have been modified for this device so the
2814   * cache can be unlocked because the state of each buffer has been set to
2815   * TRANSFER.
2816   */
2817  rtems_bdbuf_unlock_cache ();
2818
2819  /*
2820   * If there are buffers to transfer to the media transfer them.
2821   */
2822  if (!rtems_chain_is_empty (&transfer->bds))
2823  {
2824    if (worker)
2825    {
2826      rtems_status_code sc = rtems_event_send (worker->id,
2827                                               RTEMS_BDBUF_SWAPOUT_SYNC);
2828      if (sc != RTEMS_SUCCESSFUL)
2829        rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_SO_WAKE_2);
2830    }
2831    else
2832    {
2833      rtems_bdbuf_swapout_write (transfer);
2834    }
2835
2836    transfered_buffers = true;
2837  }
2838
2839  if (sync_active && !transfered_buffers)
2840  {
2841    rtems_id sync_requester;
2842    rtems_bdbuf_lock_cache ();
2843    sync_requester = bdbuf_cache.sync_requester;
2844    bdbuf_cache.sync_active = false;
2845    bdbuf_cache.sync_requester = 0;
2846    rtems_bdbuf_unlock_cache ();
2847    if (sync_requester)
2848      rtems_event_transient_send (sync_requester);
2849  }
2850
2851  return transfered_buffers;
2852}
2853
2854/**
2855 * The swapout worker thread body.
2856 *
2857 * @param arg A pointer to the worker thread's private data.
2858 * @return rtems_task Not used.
2859 */
2860static rtems_task
2861rtems_bdbuf_swapout_worker_task (rtems_task_argument arg)
2862{
2863  rtems_bdbuf_swapout_worker* worker = (rtems_bdbuf_swapout_worker*) arg;
2864
2865  while (worker->enabled)
2866  {
2867    rtems_bdbuf_wait_for_event (RTEMS_BDBUF_SWAPOUT_SYNC);
2868
2869    rtems_bdbuf_swapout_write (&worker->transfer);
2870
2871    rtems_bdbuf_lock_cache ();
2872
2873    rtems_chain_initialize_empty (&worker->transfer.bds);
2874    worker->transfer.dd = BDBUF_INVALID_DEV;
2875
2876    rtems_chain_append_unprotected (&bdbuf_cache.swapout_free_workers, &worker->link);
2877
2878    rtems_bdbuf_unlock_cache ();
2879  }
2880
2881  free (worker);
2882
2883  rtems_task_delete (RTEMS_SELF);
2884}
2885
2886/**
2887 * Close the swapout worker threads.
2888 */
2889static void
2890rtems_bdbuf_swapout_workers_close (void)
2891{
2892  rtems_chain_node* node;
2893
2894  rtems_bdbuf_lock_cache ();
2895
2896  node = rtems_chain_first (&bdbuf_cache.swapout_free_workers);
2897  while (!rtems_chain_is_tail (&bdbuf_cache.swapout_free_workers, node))
2898  {
2899    rtems_bdbuf_swapout_worker* worker = (rtems_bdbuf_swapout_worker*) node;
2900    worker->enabled = false;
2901    rtems_event_send (worker->id, RTEMS_BDBUF_SWAPOUT_SYNC);
2902    node = rtems_chain_next (node);
2903  }
2904
2905  rtems_bdbuf_unlock_cache ();
2906}
2907
2908/**
2909 * Body of task which takes care on flushing modified buffers to the disk.
2910 *
2911 * @param arg A pointer to the global cache data. Use the global variable and
2912 *            not this.
2913 * @return rtems_task Not used.
2914 */
2915static rtems_task
2916rtems_bdbuf_swapout_task (rtems_task_argument arg)
2917{
2918  rtems_bdbuf_swapout_transfer* transfer = (rtems_bdbuf_swapout_transfer *) arg;
2919  uint32_t                      period_in_ticks;
2920  const uint32_t                period_in_msecs = bdbuf_config.swapout_period;
2921  uint32_t                      timer_delta;
2922
2923  /*
2924   * Localise the period.
2925   */
2926  period_in_ticks = RTEMS_MICROSECONDS_TO_TICKS (period_in_msecs * 1000);
2927
2928  /*
2929   * This is temporary. Needs to be changed to use the real time clock.
2930   */
2931  timer_delta = period_in_msecs;
2932
2933  while (bdbuf_cache.swapout_enabled)
2934  {
2935    rtems_event_set   out;
2936    rtems_status_code sc;
2937
2938    /*
2939     * Only update the timers once in the processing cycle.
2940     */
2941    bool update_timers = true;
2942
2943    /*
2944     * If we write buffers to any disk perform a check again. We only write a
2945     * single device at a time and the cache may have more than one device's
2946     * buffers modified waiting to be written.
2947     */
2948    bool transfered_buffers;
2949
2950    do
2951    {
2952      transfered_buffers = false;
2953
2954      /*
2955       * Extact all the buffers we find for a specific device. The device is
2956       * the first one we find on a modified list. Process the sync queue of
2957       * buffers first.
2958       */
2959      if (rtems_bdbuf_swapout_processing (timer_delta,
2960                                          update_timers,
2961                                          transfer))
2962      {
2963        transfered_buffers = true;
2964      }
2965
2966      /*
2967       * Only update the timers once.
2968       */
2969      update_timers = false;
2970    }
2971    while (transfered_buffers);
2972
2973    sc = rtems_event_receive (RTEMS_BDBUF_SWAPOUT_SYNC,
2974                              RTEMS_EVENT_ALL | RTEMS_WAIT,
2975                              period_in_ticks,
2976                              &out);
2977
2978    if ((sc != RTEMS_SUCCESSFUL) && (sc != RTEMS_TIMEOUT))
2979      rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_SWAPOUT_RE);
2980  }
2981
2982  rtems_bdbuf_swapout_workers_close ();
2983
2984  free (transfer);
2985
2986  rtems_task_delete (RTEMS_SELF);
2987}
2988
2989static void
2990rtems_bdbuf_purge_list (rtems_chain_control *purge_list)
2991{
2992  bool wake_buffer_waiters = false;
2993  rtems_chain_node *node = NULL;
2994
2995  while ((node = rtems_chain_get_unprotected (purge_list)) != NULL)
2996  {
2997    rtems_bdbuf_buffer *bd = (rtems_bdbuf_buffer *) node;
2998
2999    if (bd->waiters == 0)
3000      wake_buffer_waiters = true;
3001
3002    rtems_bdbuf_discard_buffer (bd);
3003  }
3004
3005  if (wake_buffer_waiters)
3006    rtems_bdbuf_wake (&bdbuf_cache.buffer_waiters);
3007}
3008
3009static void
3010rtems_bdbuf_gather_for_purge (rtems_chain_control *purge_list,
3011                              const rtems_disk_device *dd)
3012{
3013  rtems_bdbuf_buffer *stack [RTEMS_BDBUF_AVL_MAX_HEIGHT];
3014  rtems_bdbuf_buffer **prev = stack;
3015  rtems_bdbuf_buffer *cur = bdbuf_cache.tree;
3016
3017  *prev = NULL;
3018
3019  while (cur != NULL)
3020  {
3021    if (cur->dd == dd)
3022    {
3023      switch (cur->state)
3024      {
3025        case RTEMS_BDBUF_STATE_FREE:
3026        case RTEMS_BDBUF_STATE_EMPTY:
3027        case RTEMS_BDBUF_STATE_ACCESS_PURGED:
3028        case RTEMS_BDBUF_STATE_TRANSFER_PURGED:
3029          break;
3030        case RTEMS_BDBUF_STATE_SYNC:
3031          rtems_bdbuf_wake (&bdbuf_cache.transfer_waiters);
3032          /* Fall through */
3033        case RTEMS_BDBUF_STATE_MODIFIED:
3034          rtems_bdbuf_group_release (cur);
3035          /* Fall through */
3036        case RTEMS_BDBUF_STATE_CACHED:
3037          rtems_chain_extract_unprotected (&cur->link);
3038          rtems_chain_append_unprotected (purge_list, &cur->link);
3039          break;
3040        case RTEMS_BDBUF_STATE_TRANSFER:
3041          rtems_bdbuf_set_state (cur, RTEMS_BDBUF_STATE_TRANSFER_PURGED);
3042          break;
3043        case RTEMS_BDBUF_STATE_ACCESS_CACHED:
3044        case RTEMS_BDBUF_STATE_ACCESS_EMPTY:
3045        case RTEMS_BDBUF_STATE_ACCESS_MODIFIED:
3046          rtems_bdbuf_set_state (cur, RTEMS_BDBUF_STATE_ACCESS_PURGED);
3047          break;
3048        default:
3049          rtems_bdbuf_fatal (RTEMS_BDBUF_FATAL_STATE_11);
3050      }
3051    }
3052
3053    if (cur->avl.left != NULL)
3054    {
3055      /* Left */
3056      ++prev;
3057      *prev = cur;
3058      cur = cur->avl.left;
3059    }
3060    else if (cur->avl.right != NULL)
3061    {
3062      /* Right */
3063      ++prev;
3064      *prev = cur;
3065      cur = cur->avl.right;
3066    }
3067    else
3068    {
3069      while (*prev != NULL
3070             && (cur == (*prev)->avl.right || (*prev)->avl.right == NULL))
3071      {
3072        /* Up */
3073        cur = *prev;
3074        --prev;
3075      }
3076      if (*prev != NULL)
3077        /* Right */
3078        cur = (*prev)->avl.right;
3079      else
3080        /* Finished */
3081        cur = NULL;
3082    }
3083  }
3084}
3085
3086static void
3087rtems_bdbuf_do_purge_dev (rtems_disk_device *dd)
3088{
3089  rtems_chain_control purge_list;
3090
3091  rtems_chain_initialize_empty (&purge_list);
3092  rtems_bdbuf_read_ahead_reset (dd);
3093  rtems_bdbuf_gather_for_purge (&purge_list, dd);
3094  rtems_bdbuf_purge_list (&purge_list);
3095}
3096
3097void
3098rtems_bdbuf_purge_dev (rtems_disk_device *dd)
3099{
3100  rtems_bdbuf_lock_cache ();
3101  rtems_bdbuf_do_purge_dev (dd);
3102  rtems_bdbuf_unlock_cache ();
3103}
3104
3105rtems_status_code
3106rtems_bdbuf_set_block_size (rtems_disk_device *dd,
3107                            uint32_t           block_size,
3108                            bool               sync)
3109{
3110  rtems_status_code sc = RTEMS_SUCCESSFUL;
3111
3112  /*
3113   * We do not care about the synchronization status since we will purge the
3114   * device later.
3115   */
3116  if (sync)
3117    rtems_bdbuf_syncdev (dd);
3118
3119  rtems_bdbuf_lock_cache ();
3120
3121  if (block_size > 0)
3122  {
3123    size_t bds_per_group = rtems_bdbuf_bds_per_group (block_size);
3124
3125    if (bds_per_group != 0)
3126    {
3127      int block_to_media_block_shift = 0;
3128      uint32_t media_blocks_per_block = block_size / dd->media_block_size;
3129      uint32_t one = 1;
3130
3131      while ((one << block_to_media_block_shift) < media_blocks_per_block)
3132      {
3133        ++block_to_media_block_shift;
3134      }
3135
3136      if ((dd->media_block_size << block_to_media_block_shift) != block_size)
3137        block_to_media_block_shift = -1;
3138
3139      dd->block_size = block_size;
3140      dd->block_count = dd->size / media_blocks_per_block;
3141      dd->media_blocks_per_block = media_blocks_per_block;
3142      dd->block_to_media_block_shift = block_to_media_block_shift;
3143      dd->bds_per_group = bds_per_group;
3144
3145      rtems_bdbuf_do_purge_dev (dd);
3146    }
3147    else
3148    {
3149      sc = RTEMS_INVALID_NUMBER;
3150    }
3151  }
3152  else
3153  {
3154    sc = RTEMS_INVALID_NUMBER;
3155  }
3156
3157  rtems_bdbuf_unlock_cache ();
3158
3159  return sc;
3160}
3161
3162static rtems_task
3163rtems_bdbuf_read_ahead_task (rtems_task_argument arg)
3164{
3165  rtems_chain_control *chain = &bdbuf_cache.read_ahead_chain;
3166
3167  while (bdbuf_cache.read_ahead_enabled)
3168  {
3169    rtems_chain_node *node;
3170
3171    rtems_bdbuf_wait_for_event (RTEMS_BDBUF_READ_AHEAD_WAKE_UP);
3172    rtems_bdbuf_lock_cache ();
3173
3174    while ((node = rtems_chain_get_unprotected (chain)) != NULL)
3175    {
3176      rtems_disk_device *dd =
3177        RTEMS_CONTAINER_OF (node, rtems_disk_device, read_ahead.node);
3178      rtems_blkdev_bnum block = dd->read_ahead.next;
3179      rtems_blkdev_bnum media_block = 0;
3180      rtems_status_code sc =
3181        rtems_bdbuf_get_media_block (dd, block, &media_block);
3182
3183      rtems_chain_set_off_chain (&dd->read_ahead.node);
3184
3185      if (sc == RTEMS_SUCCESSFUL)
3186      {
3187        rtems_bdbuf_buffer *bd =
3188          rtems_bdbuf_get_buffer_for_read_ahead (dd, media_block);
3189
3190        if (bd != NULL)
3191        {
3192          uint32_t transfer_count = dd->block_count - block;
3193          uint32_t max_transfer_count = bdbuf_config.max_read_ahead_blocks;
3194
3195          if (transfer_count >= max_transfer_count)
3196          {
3197            transfer_count = max_transfer_count;
3198            dd->read_ahead.trigger = block + transfer_count / 2;
3199            dd->read_ahead.next = block + transfer_count;
3200          }
3201          else
3202          {
3203            dd->read_ahead.trigger = RTEMS_DISK_READ_AHEAD_NO_TRIGGER;
3204          }
3205
3206          ++dd->stats.read_ahead_transfers;
3207          rtems_bdbuf_execute_read_request (dd, bd, transfer_count);
3208        }
3209      }
3210      else
3211      {
3212        dd->read_ahead.trigger = RTEMS_DISK_READ_AHEAD_NO_TRIGGER;
3213      }
3214    }
3215
3216    rtems_bdbuf_unlock_cache ();
3217  }
3218
3219  rtems_task_delete (RTEMS_SELF);
3220}
3221
3222void rtems_bdbuf_get_device_stats (const rtems_disk_device *dd,
3223                                   rtems_blkdev_stats      *stats)
3224{
3225  rtems_bdbuf_lock_cache ();
3226  *stats = dd->stats;
3227  rtems_bdbuf_unlock_cache ();
3228}
3229
3230void rtems_bdbuf_reset_device_stats (rtems_disk_device *dd)
3231{
3232  rtems_bdbuf_lock_cache ();
3233  memset (&dd->stats, 0, sizeof(dd->stats));
3234  rtems_bdbuf_unlock_cache ();
3235}
Note: See TracBrowser for help on using the repository browser.