source: rtems/cpukit/libblock/include/rtems/bdbuf.h @ b5b07cad

4.104.115
Last change on this file since b5b07cad was b5b07cad, checked in by Thomas Doerfler <Thomas.Doerfler@…>, on 10/29/09 at 12:50:01
  • - Reorderd AVL node fields to save space
  • Fixed printf() formats. New structure for waiters synchronization. Added BDBUF_INVALID_DEV define. New error handling in rtems_bdbuf_init().
  • Release disk in case of an error.
  • Property mode set to 100644
File size: 18.9 KB
Line 
1/**
2 * @file
3 *
4 * @ingroup rtems_bdbuf
5 *
6 * Block device buffer management.
7 */
8 
9/*
10 * Copyright (C) 2001 OKTET Ltd., St.-Petersburg, Russia
11 * Author: Victor V. Vengerov <vvv@oktet.ru>
12 *
13 * Copyright (C) 2008,2009 Chris Johns <chrisj@rtems.org>
14 *    Rewritten to remove score mutex access. Fixes many performance
15 *    issues.
16 *    Change to support demand driven variable buffer sizes.
17 *
18 * @(#) bdbuf.h,v 1.9 2005/02/02 00:06:18 joel Exp
19 */
20
21#ifndef _RTEMS_BDBUF_H
22#define _RTEMS_BDBUF_H
23
24#include <rtems.h>
25#include <rtems/libio.h>
26#include <rtems/chain.h>
27
28#include <rtems/blkdev.h>
29#include <rtems/diskdevs.h>
30
31#ifdef __cplusplus
32extern "C" {
33#endif
34
35/**
36 * @defgroup rtems_libblock Block Device Library
37 *
38 * Block device modules.
39 */
40
41/**
42 * @defgroup rtems_bdbuf Block Device Buffer Management
43 *
44 * @ingroup rtems_libblock
45 *
46 * The Block Device Buffer Management implements a cache between the disk
47 * devices and file systems. The code provides read ahead and write queuing to
48 * the drivers and fast cache look up using an AVL tree.
49 *
50 * The block size used by a file system can be set at runtime and must be a
51 * multiple of the disk device block size. The disk device's physical block
52 * size is called the media block size. The file system can set the block size
53 * it uses to a larger multiple of the media block size. The driver must be
54 * able to handle buffers sizes larger than one media block.
55 *
56 * The user configures the amount of memory to be used as buffers in the cache,
57 * and the minimum and maximum buffer size. The cache will allocate additional
58 * memory for the buffer descriptors and groups. There are enough buffer
59 * descriptors allocated so all the buffer memory can be used as minimum sized
60 * buffers.
61 *
62 * The cache is a single pool of buffers. The buffer memory is divided into
63 * groups where the size of buffer memory allocated to a group is the maximum
64 * buffer size. A group's memory can be divided down into small buffer sizes
65 * that are a multiple of 2 of the minimum buffer size. A group is the minimum
66 * allocation unit for buffers of a specific size. If a buffer of maximum size
67 * is requested the group will have a single buffer. If a buffer of minimum
68 * size is requested the group is divided into minimum sized buffers and the
69 * remaining buffers are held ready for use. A group keeps track of which
70 * buffers are with a file system or driver and groups that have buffers in use
71 * cannot be reallocated. Groups with no buffers in use can be taken and
72 * reallocated to a new size. This is how buffers of different sizes move
73 * around the cache.
74 *
75 * The buffers are held in various lists in the cache. All buffers follow this
76 * state machine:
77 *                                 
78 * @dot
79 * digraph g {
80 *   ready [label="Ready\nRead Ahead"];
81 *   transfer [label="Transfer"];
82 *   accessed [label="Accessed\nAccessed Modified"];
83 *   modified [label="Modified\nSynchronized"];
84 *   cached [label="Cached"];
85 *   ready -> transfer [label="Read\nRead Ahead"];
86 *   transfer -> ready [label="Read Ahead Complete"];
87 *   ready -> accessed [label="Get"];
88 *   transfer -> accessed [label="Read or Write\nComplete"];
89 *   transfer -> cached [label="Read or Write\nComplete"];
90 *   accessed -> cached [label="Release"];
91 *   cached -> accessed [label="Get"];
92 *   modified -> accessed [label="Get"];
93 *   accessed -> modified [label="Modified"];
94 *   accessed -> transfer [label="Swap"];
95 * }
96 * @enddot
97 *         
98 * Empty buffers are added to the ready list and removed from this queue when a
99 * caller requests a buffer. This is referred to as getting a buffer in the
100 * code and the event get in the state diagram. The buffer is assigned to a
101 * block and inserted to the AVL based on the block/device key. If the block is
102 * to be read by the user and not in the cache (ready) it is transferred from
103 * the disk into memory. If no ready buffers exist the buffer is taken from the
104 * LRU list. If no buffers are on the LRU list the modified list is checked. If
105 * no buffers are on the modified list the request blocks. If buffers are on
106 * the modified list the buffers hold timer is expired and the swap out task
107 * woken.
108 *
109 * A block being accessed is given to the file system layer and not accessible
110 * to another requester until released back to the cache. The same goes for a
111 * buffer in the transfer state. The transfer state means being read or
112 * written. If the file system has modified the block and releases it as
113 * modified it is placed on the cache's modified list and a hold timer
114 * initialised. The buffer is held for the hold time before being written to
115 * disk. Buffers are held for a configurable period of time on the modified
116 * list as a write sets the state to transfer and this locks the buffer out
117 * from the file system until the write completes. Buffers are often accessed
118 * and modified in a series of small updates so if sent to the disk when
119 * released as modified the user would have to block waiting until it had been
120 * written. This would be a performance problem.
121 *
122 * The code performs multiple block reads and writes. Multiple block reads or
123 * read ahead increases performance with hardware that supports it. It also
124 * helps with a large cache as the disk head movement is reduced. It is however
125 * a speculative operation so excessive use can remove valuable and needed
126 * blocks from the cache. The get call knows if a read is for the file system
127 * or if it is a read ahead get. If the get is for a read ahead block and the
128 * block is already in the cache or no ready buffers are available the read
129 * ahead is stopped. The transfer occurs with the blocks so far. If a buffer is
130 * in the read ahead state and released it is placed on the ready list rather
131 * than the LRU list. This means these buffers are used before buffers used by
132 * the file system.
133 *
134 * The cache has the following lists of buffers:
135 *  - @c ready: Empty buffers created when the pool is initialised.
136 *  - @c modified: Buffers waiting to be written to disk.
137 *  - @c sync: Buffers to be synced to disk.
138 *  - @c lru: Accessed buffers released in least recently used order.
139 *
140 * The cache scans the ready list then the LRU list for a suitable buffer in
141 * this order. A suitable buffer is one that matches the same allocation size
142 * as the device the buffer is for. If a buffer's group has no buffers in use
143 * with the file system or driver the group is reallocated. This means the
144 * buffers in the group are invalidated, resized and placed on the ready queue.
145 * There is a performance issue with this design. The reallocation of a group
146 * may force recently accessed buffers out of the cache when they should
147 * not be. The design should be changed to have groups on a LRU list if they
148 * have no buffers in use.
149 *
150 * @{
151 */
152
/**
 * State of a buffer in the cache.
 *
 * The numeric values are assigned explicitly, so the order of the enumerators
 * must not be changed without also updating any code that depends on them.
 */
typedef enum
{
  RTEMS_BDBUF_STATE_EMPTY = 0,            /**< Not in use. */
  RTEMS_BDBUF_STATE_READ_AHEAD = 1,       /**< Holds read ahead data only. */
  RTEMS_BDBUF_STATE_CACHED = 2,           /**< In the cache and available. */
  RTEMS_BDBUF_STATE_ACCESS = 3,           /**< The user has the buffer. */
  RTEMS_BDBUF_STATE_MODIFIED = 4,         /**< In the cache but modified. */
  RTEMS_BDBUF_STATE_ACCESS_MODIFIED = 5,  /**< With the user but modified. */
  RTEMS_BDBUF_STATE_SYNC = 6,             /**< Requested to be sync'ed. */
  RTEMS_BDBUF_STATE_TRANSFER = 7          /**< Being transferred to or from disk. */
} rtems_bdbuf_buf_state;
167
/**
 * Forward reference to the group. The buffer descriptor holds a pointer to its
 * group, so the group type must be nameable before it is defined.
 */
struct rtems_bdbuf_group;
typedef struct rtems_bdbuf_group rtems_bdbuf_group;
173
174/**
175 * To manage buffers we using buffer descriptors (BD). A BD holds a buffer plus
176 * a range of other information related to managing the buffer in the cache. To
177 * speed-up buffer lookup descriptors are organized in AVL-Tree. The fields
178 * 'dev' and 'block' are search keys.
179 */
180typedef struct rtems_bdbuf_buffer
181{
182  rtems_chain_node link;       /**< Link the BD onto a number of lists. */
183
184  struct rtems_bdbuf_avl_node
185  {
186    struct rtems_bdbuf_buffer* left;   /**< Left Child */
187    struct rtems_bdbuf_buffer* right;  /**< Right Child */
188    signed char                cache;  /**< Cache */
189    signed char                bal;    /**< The balance of the sub-tree */
190  } avl;
191
192  dev_t             dev;        /**< device number */
193
194  rtems_blkdev_bnum block;      /**< block number on the device */
195
196  unsigned char*    buffer;     /**< Pointer to the buffer memory area */
197  int               error;      /**< If not 0 indicate an error value (errno)
198                                 * which can be used by user later */
199
200  volatile rtems_bdbuf_buf_state state;  /**< State of the buffer. */
201
202  volatile uint32_t  waiters;    /**< The number of threads waiting on this
203                                  * buffer. */
204  rtems_bdbuf_group* group;      /**< Pointer to the group of BDs this BD is
205                                  * part of. */
206  volatile uint32_t  hold_timer; /**< Timer to indicate how long a buffer
207                                  * has been held in the cache modified. */
208
209  int   references;              /**< Allow reference counting by owner. */
210  void* user;                    /**< User data. */
211} rtems_bdbuf_buffer;
212
213/**
214 * A group is a continuous block of buffer descriptors. A group covers the
215 * maximum configured buffer size and is the allocation size for the buffers to
216 * a specific buffer size. If you allocate a buffer to be a specific size, all
217 * buffers in the group, if there are more than 1 will also be that size. The
218 * number of buffers in a group is a multiple of 2, ie 1, 2, 4, 8, etc.
219 */
220struct rtems_bdbuf_group
221{
222  rtems_chain_node    link;          /**< Link the groups on a LRU list if they
223                                      * have no buffers in use. */
224  size_t              bds_per_group; /**< The number of BD allocated to this
225                                      * group. This value must be a multiple of
226                                      * 2. */
227  uint32_t            users;         /**< How many users the block has. */
228  rtems_bdbuf_buffer* bdbuf;         /**< First BD this block covers. */
229};
230
231/**
232 * Buffering configuration definition. See confdefs.h for support on using this
233 * structure.
234 */
235typedef struct rtems_bdbuf_config {
236  uint32_t            max_read_ahead_blocks;   /**< Number of blocks to read
237                                                * ahead. */
238  uint32_t            max_write_blocks;        /**< Number of blocks to write
239                                                * at once. */
240  rtems_task_priority swapout_priority;        /**< Priority of the swap out
241                                                * task. */
242  uint32_t            swapout_period;          /**< Period swapout checks buf
243                                                * timers. */
244  uint32_t            swap_block_hold;         /**< Period a buffer is held. */
245  size_t              swapout_workers;         /**< The number of worker
246                                                * threads for the swapout
247                                                * task. */
248  rtems_task_priority swapout_worker_priority; /**< Priority of the swap out
249                                                * task. */
250  size_t              size;                    /**< Size of memory in the
251                                                * cache */
252  uint32_t            buffer_min;              /**< Minimum buffer size. */
253  uint32_t            buffer_max;              /**< Maximum buffer size
254                                                * supported. It is also the
255                                                * allocation size. */
256} rtems_bdbuf_config;
257
258/**
259 * External reference to the configuration.
260 *
261 * The configuration is provided by the application.
262 */
263extern const rtems_bdbuf_config rtems_bdbuf_configuration;
264
/**
 * Default maximum number of blocks to read ahead.
 *
 * The max_read_ahead_blocks value is altered if there are fewer buffers
 * than this defined max. This stops thrashing in the cache.
 */
#define RTEMS_BDBUF_MAX_READ_AHEAD_BLOCKS_DEFAULT    32

/**
 * Default maximum number of blocks to write at once.
 */
#define RTEMS_BDBUF_MAX_WRITE_BLOCKS_DEFAULT         16

/**
 * Default swap-out task priority.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT    15

/**
 * Default swap-out task swap period in milliseconds.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_SWAP_PERIOD_DEFAULT 250

/**
 * Default swap-out task block hold time in milliseconds.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_BLOCK_HOLD_DEFAULT  1000

/**
 * Default number of swap-out worker tasks. Currently disabled.
 */
#define RTEMS_BDBUF_SWAPOUT_WORKER_TASKS_DEFAULT     0

/**
 * Default swap-out worker task priority. The same as the swapout task.
 */
#define RTEMS_BDBUF_SWAPOUT_WORKER_TASK_PRIORITY_DEFAULT \
                             RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT

/**
 * Default size of memory allocated to the cache.
 */
#define RTEMS_BDBUF_CACHE_MEMORY_SIZE_DEFAULT (64 * 512)

/**
 * Default minimum size of buffers.
 */
#define RTEMS_BDBUF_BUFFER_MIN_SIZE_DEFAULT (512)

/**
 * Default maximum size of buffers.
 */
#define RTEMS_BDBUF_BUFFER_MAX_SIZE_DEFAULT (4096)
316
317/**
318 * Prepare buffering layer to work - initialize buffer descritors and (if it is
319 * neccessary) buffers. After initialization all blocks is placed into the
320 * ready state.
321 *
322 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
323 *         successfully or error code if error is occured)
324 */
325rtems_status_code
326rtems_bdbuf_init (void);
327
328/**
329 * Get block buffer for data to be written into. The buffers is set to the
330 * access or modifed access state. If the buffer is in the cache and modified
331 * the state is access modified else the state is access. This buffer contents
332 * are not initialised if the buffer is not already in the cache. If the block
333 * is already resident in memory it is returned how-ever if not in memory the
334 * buffer is not read from disk. This call is used when writing the whole block
335 * on a disk rather than just changing a part of it. If there is no buffers
336 * available this call will block. A buffer obtained with this call will not be
337 * involved in a transfer request and will not be returned to another user
338 * until released. If the buffer is already with a user when this call is made
339 * the call is blocked until the buffer is returned. The highest priority
340 * waiter will obtain the buffer first.
341 *
342 * The block number is the linear block number. This is relative to the start
343 * of the partition on the media.
344 *
345 * @param device Device number (constructed of major and minor device number)
346 * @param block  Linear media block number
347 * @param bd     Reference to the buffer descriptor pointer.
348 *
349 * @return       RTEMS status code (RTEMS_SUCCESSFUL if operation completed
350 *               successfully or error code if error is occured)
351 */
352rtems_status_code
353rtems_bdbuf_get (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd);
354
355/**
356 * Get the block buffer and if not already in the cache read from the disk. If
357 * specified block already cached return. The buffer is set to the access or
358 * modifed access state. If the buffer is in the cache and modified the state
359 * is access modified else the state is access. If block is already being read
360 * from disk for being written to disk this call blocks. If the buffer is
361 * waiting to be written it is removed from modified queue and returned to the
362 * user. If the buffer is not in the cache a new buffer is obtained and the
363 * data read from disk. The call may block until these operations complete. A
364 * buffer obtained with this call will not be involved in a transfer request
365 * and will not be returned to another user until released. If the buffer is
366 * already with a user when this call is made the call is blocked until the
367 * buffer is returned. The highest priority waiter will obtain the buffer
368 * first.
369 *
370 * @param device Device number (constructed of major and minor device number)
371 * @param block  Linear media block number
372 * @param bd     Reference to the buffer descriptor pointer.
373 *
374 * @return       RTEMS status code (RTEMS_SUCCESSFUL if operation completed
375 *               successfully or error code if error is occured)
376 */
377rtems_status_code
378rtems_bdbuf_read (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd);
379
380/**
381 * Release the buffer obtained by a read call back to the cache. If the buffer
382 * was obtained by a get call and was not already in the cache the release
383 * modified call should be used. A buffer released with this call obtained by a
384 * get call may not be in sync with the contents on disk. If the buffer was in
385 * the cache and modified before this call it will be returned to the modified
386 * queue. The buffers is returned to the end of the LRU list.
387 *
388 * @param bd Reference to the buffer descriptor.
389 *
390 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
391 *         successfully or error code if error is occured)
392 */
393rtems_status_code
394rtems_bdbuf_release (rtems_bdbuf_buffer* bd);
395
396/**
397 * Release the buffer allocated with a get or read call placing it on the
398 * modidied list.  If the buffer was not released modified before the hold
399 * timer is set to the configuration value. If the buffer had been released
400 * modified before but not written to disk the hold timer is not updated. The
401 * buffer will be written to disk when the hold timer has expired, there are
402 * not more buffers available in the cache and a get or read buffer needs one
403 * or a sync call has been made. If the buffer is obtained with a get or read
404 * before the hold timer has expired the buffer will be returned to the user.
405 *
406 * @param bd Reference to the buffer descriptor.
407 *
408 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
409 *         successfully or error code if error is occured)
410 */
411rtems_status_code
412rtems_bdbuf_release_modified (rtems_bdbuf_buffer* bd);
413
414/**
415 * Release the buffer as modified and wait until it has been synchronized with
416 * the disk by writing it. This buffer will be the first to be transfer to disk
417 * and other buffers may also be written if the maximum number of blocks in a
418 * requests allows it.
419 *
420 * @note This code does not lock the sync mutex and stop additions to the
421 *       modified queue.
422
423 * @param bd Reference to the buffer descriptor.
424 *
425 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
426 *         successfully or error code if error is occured)
427 */
428rtems_status_code
429rtems_bdbuf_sync (rtems_bdbuf_buffer* bd);
430
431/**
432 * Synchronize all modified buffers for this device with the disk and wait
433 * until the transfers have completed. The sync mutex for the cache is locked
434 * stopping the addition of any further modifed buffers. It is only the
435 * currently modified buffers that are written.
436 *
437 * @note Nesting calls to sync multiple devices will be handled sequentially. A
438 * nested call will be blocked until the first sync request has complete.
439 *
440 * @param dev Block device number
441 *
442 * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed
443 *         successfully or error code if error is occured)
444 */
445rtems_status_code
446rtems_bdbuf_syncdev (dev_t dev);
447
448/** @} */
449
450#ifdef __cplusplus
451}
452#endif
453
454#endif
Note: See TracBrowser for help on using the repository browser.