source: rtems/cpukit/libblock/include/rtems/bdbuf.h @ 5c587596

4.104.115
Last change on this file since 5c587596 was 5c587596, checked in by Thomas Doerfler <Thomas.Doerfler@…>, on 01/19/10 at 09:10:03

libblock API update

  • Property mode set to 100644
File size: 21.1 KB
Line 
1/**
2 * @file
3 *
4 * @ingroup rtems_bdbuf
5 *
6 * Block device buffer management.
7 */
8
9/*
10 * Copyright (C) 2001 OKTET Ltd., St.-Petersburg, Russia
11 * Author: Victor V. Vengerov <vvv@oktet.ru>
12 *
13 * Copyright (C) 2008,2009 Chris Johns <chrisj@rtems.org>
14 *    Rewritten to remove score mutex access. Fixes many performance
15 *    issues.
16 *    Change to support demand driven variable buffer sizes.
17 *
18 * Copyright (c) 2009 embedded brains GmbH.
19 *
20 * @(#) bdbuf.h,v 1.9 2005/02/02 00:06:18 joel Exp
21 */
22
23#ifndef _RTEMS_BDBUF_H
24#define _RTEMS_BDBUF_H
25
26#include <rtems.h>
27#include <rtems/libio.h>
28#include <rtems/chain.h>
29
30#include <rtems/blkdev.h>
31#include <rtems/diskdevs.h>
32
33#ifdef __cplusplus
34extern "C" {
35#endif
36
37/**
38 * @defgroup rtems_libblock Block Device Library
39 *
40 * Block device modules.
41 */
42
43/**
44 * @defgroup rtems_bdbuf Block Device Buffer Management
45 *
46 * @ingroup rtems_libblock
47 *
48 * The Block Device Buffer Management implements a cache between the disk
49 * devices and file systems.  The code provides read ahead and write queuing to
50 * the drivers and fast cache look-up using an AVL tree.
51 *
52 * The block size used by a file system can be set at runtime and must be a
53 * multiple of the disk device block size.  The disk device's physical block
54 * size is called the media block size.  The file system can set the block size
55 * it uses to a larger multiple of the media block size.  The driver must be
56 * able to handle buffer sizes larger than one media block.
57 *
58 * The user configures the amount of memory to be used as buffers in the cache,
59 * and the minimum and maximum buffer size.  The cache will allocate additional
60 * memory for the buffer descriptors and groups.  There are enough buffer
61 * descriptors allocated so all the buffer memory can be used as minimum sized
62 * buffers.
63 *
64 * The cache is a single pool of buffers.  The buffer memory is divided into
65 * groups where the size of buffer memory allocated to a group is the maximum
66 * buffer size.  A group's memory can be divided down into smaller buffer sizes
67 * that are a multiple of 2 of the minimum buffer size.  A group is the minimum
68 * allocation unit for buffers of a specific size.  If a buffer of maximum size
69 * is requested the group will have a single buffer.  If a buffer of minimum
70 * size is requested the group is divided into minimum sized buffers and the
71 * remaining buffers are held ready for use.  A group keeps track of which
72 * buffers are with a file system or driver and groups that have buffers in use
73 * cannot be reallocated.  Groups with no buffers in use can be taken and
74 * reallocated to a new size.  This is how buffers of different sizes move
75 * around the cache.
76 *
77 * The buffers are held in various lists in the cache.  All buffers follow this
78 * state machine:
79 *
80 * @dot
81 * digraph state {
82 *   size="16,8";
83 *   f [label="FREE",style="filled",fillcolor="aquamarine"];
84 *   e [label="EMPTY",style="filled",fillcolor="seagreen"];
85 *   c [label="CACHED",style="filled",fillcolor="chartreuse"];
86 *   ac [label="ACCESS CACHED",style="filled",fillcolor="royalblue"];
87 *   am [label="ACCESS MODIFIED",style="filled",fillcolor="royalblue"];
88 *   ae [label="ACCESS EMPTY",style="filled",fillcolor="royalblue"];
89 *   t [label="TRANSFER",style="filled",fillcolor="red"];
90 *   s [label="SYNC",style="filled",fillcolor="red"];
91 *   m [label="MODIFIED",style="filled",fillcolor="gold"];
92 *   i [label="INITIAL"];
93 *
94 *   legend_transfer [label="Transfer Wake-Up",fontcolor="red",shape="none"];
95 *   legend_access [label="Access Wake-Up",fontcolor="royalblue",shape="none"];
96 *
97 *   i -> f [label="Init"];
98 *   f -> e [label="Buffer Recycle"];
99 *   e -> ae [label="Get"];
100 *   e -> t [label="Read\nRead Ahead"];
101 *   c -> f [label="Reallocate\nBlock Size Changed"];
102 *   c -> ac [label="Get\nRead"];
103 *   c -> e [label="Buffer Recycle"];
104 *   t -> c [label="Transfer Done",color="red",fontcolor="red"];
105 *   t -> e [label="Transfer Error With Waiter",color="red",fontcolor="red"];
106 *   t -> f [label="Transfer Error Without Waiter",color="red",fontcolor="red"];
107 *   m -> t [label="Swapout"];
108 *   m -> s [label="Block Size Changed"];
109 *   m -> am [label="Get\nRead"];
110 *   ac -> m [label="Release Modified",color="royalblue",fontcolor="royalblue"];
111 *   ac -> s [label="Sync",color="royalblue",fontcolor="royalblue"];
112 *   ac -> c [label="Release",color="royalblue",fontcolor="royalblue"];
113 *   am -> m [label="Release\nRelease Modified",color="royalblue",fontcolor="royalblue"];
114 *   am -> s [label="Sync",color="royalblue",fontcolor="royalblue"];
115 *   ae -> m [label="Release Modified",color="royalblue",fontcolor="royalblue"];
116 *   ae -> s [label="Sync",color="royalblue",fontcolor="royalblue"];
117 *   ae -> e [label="Release With Waiter",color="royalblue",fontcolor="royalblue"];
118 *   ae -> f [label="Release Without Waiter",color="royalblue",fontcolor="royalblue"];
119 *   s -> t [label="Swapout"];
120 * }
121 * @enddot
122 *
123 * Empty or cached buffers are added to the LRU list and removed from this
124 * queue when a caller requests a buffer.  This is referred to as getting a
125 * buffer in the code and the event get in the state diagram.  The buffer is
126 * assigned to a block and inserted to the AVL based on the block/device key.
127 * If the block is to be read by the user and not in the cache it is
128 * transferred from the disk into memory.  If no buffers are on the LRU list
129 * the modified list is checked.  If buffers are on the modified list the swap
130 * out task will be woken.  The request blocks until a buffer can be recycled.
131 *
132 * A block being accessed is given to the file system layer and not accessible
133 * to another requester until released back to the cache.  The same goes for a
134 * buffer in the transfer state.  The transfer state means being read or
135 * written.  If the file system has modified the block and releases it as
136 * modified it is placed on the cache's modified list and a hold timer is
137 * initialised.  The buffer is held for the hold time before being written to
138 * disk.  Buffers are held for a configurable period of time on the modified
139 * list as a write sets the state to transfer and this locks the buffer out
140 * from the file system until the write completes.  Buffers are often accessed
141 * and modified in a series of small updates so if sent to the disk when
142 * released as modified the user would have to block waiting until it had been
143 * written.  This would be a performance problem.
144 *
145 * The code performs multiple block reads and writes.  Multiple block reads or
146 * read ahead increases performance with hardware that supports it.  It also
147 * helps with a large cache as the disk head movement is reduced.  It however
148 * is a speculative operation so excessive use can remove valuable and needed
149 * blocks from the cache.
150 *
151 * The cache has the following lists of buffers:
152 *  - LRU: Accessed or transferred buffers released in least recently used
153 *  order.  Empty buffers will be placed to the front.
154 *  - Modified: Buffers waiting to be written to disk.
155 *  - Sync: Buffers to be synchronized with the disk.
156 *
157 * A cache look-up will be performed to find a suitable buffer.  A suitable
158 * buffer is one that matches the same allocation size as the device the buffer
159 * is for.  If a buffer's group has no buffers in use with the file system or
160 * driver the group is reallocated.  This means the buffers in the group are
161 * invalidated, resized and placed on the LRU queue.  There is a performance
162 * issue with this design.  The reallocation of a group may force recently
163 * accessed buffers out of the cache when they should not.  The design should be
164 * changed to have groups on a LRU list if they have no buffers in use.
165 *
166 * @{
167 */
168
169/**
170 * @brief State of a buffer of the cache.
171 *
172 * The state has several implications.  Depending on the state a buffer can be
173 * in the AVL tree, in a list, in use by an entity and a group user or not.
174 *
175 * <table>
176 *   <tr>
177 *     <th>State</th><th>Valid Data</th><th>AVL Tree</th>
178 *     <th>LRU List</th><th>Modified List</th><th>Synchronization List</th>
179 *     <th>Group User</th><th>External User</th>
180 *   </tr>
181 *   <tr>
182 *     <td>FREE</td><td></td><td></td>
183 *     <td>X</td><td></td><td></td><td></td><td></td>
184 *   </tr>
185 *   <tr>
186 *     <td>EMPTY</td><td></td><td>X</td>
187 *     <td>X</td><td></td><td></td><td></td><td></td>
188 *   </tr>
189 *   <tr>
190 *     <td>CACHED</td><td>X</td><td>X</td>
191 *     <td>X</td><td></td><td></td><td></td><td></td>
192 *   </tr>
193 *   <tr>
194 *     <td>ACCESS_CACHED</td><td>X</td><td>X</td>
195 *     <td></td><td></td><td></td><td>X</td><td>X</td>
196 *   </tr>
197 *   <tr>
198 *     <td>ACCESS_MODIFIED</td><td>X</td><td>X</td>
199 *     <td></td><td></td><td></td><td>X</td><td>X</td>
200 *   </tr>
201 *   <tr>
202 *     <td>ACCESS_EMPTY</td><td></td><td>X</td>
203 *     <td></td><td></td><td></td><td>X</td><td>X</td>
204 *   </tr>
205 *   <tr>
206 *     <td>MODIFIED</td><td>X</td><td>X</td>
207 *     <td></td><td>X</td><td></td><td>X</td><td></td>
208 *   </tr>
209 *   <tr>
210 *     <td>SYNC</td><td>X</td><td>X</td>
211 *     <td></td><td></td><td>X</td><td>X</td><td></td>
212 *   </tr>
213 *   <tr>
214 *     <td>TRANSFER</td><td>X</td><td>X</td>
215 *     <td></td><td></td><td></td><td>X</td><td>X</td>
216 *   </tr>
217 * </table>
218 */
219typedef enum
220{
221  /**
222   * @brief Free.  On the LRU list, no valid data, not in the AVL tree.
223   */
224  RTEMS_BDBUF_STATE_FREE = 0,
225
226  /**
227   * @brief Empty.  In the AVL tree and on the LRU list, no valid data.
228   */
229  RTEMS_BDBUF_STATE_EMPTY,
230
231  /**
232   * @brief Cached.  Valid data, in the AVL tree and on the LRU list.
233   */
234  RTEMS_BDBUF_STATE_CACHED,
235
236  /**
237   * @brief Accessed by upper layer with cached data.
238   */
239  RTEMS_BDBUF_STATE_ACCESS_CACHED,
240
241  /**
242   * @brief Accessed by upper layer with modified data.
243   */
244  RTEMS_BDBUF_STATE_ACCESS_MODIFIED,
245
246  /**
247   * @brief Accessed by upper layer with invalid data.
248   */
249  RTEMS_BDBUF_STATE_ACCESS_EMPTY,
250
251  /**
252   * @brief Modified by upper layer.  On the modified list.
253   */
254  RTEMS_BDBUF_STATE_MODIFIED,
255
256  /**
257   * @brief Scheduled for synchronization.  On the synchronization list.
258   */
259  RTEMS_BDBUF_STATE_SYNC,
260
261  /**
262   * @brief In transfer by block device driver.
263   */
264  RTEMS_BDBUF_STATE_TRANSFER
265} rtems_bdbuf_buf_state;
266
267/**
268 * Forward reference to the group.
269 */
270struct rtems_bdbuf_group;
271typedef struct rtems_bdbuf_group rtems_bdbuf_group;
272
273/**
274 * To manage buffers we use buffer descriptors (BD). A BD holds a buffer plus
275 * a range of other information related to managing the buffer in the cache. To
276 * speed up buffer lookup the descriptors are organized in an AVL tree. The
277 * fields 'dev' and 'block' are the search keys.
278 */
279typedef struct rtems_bdbuf_buffer
280{
281  rtems_chain_node link;       /**< Link the BD onto a number of lists. */
282
283  struct rtems_bdbuf_avl_node
284  {
285    struct rtems_bdbuf_buffer* left;   /**< Left Child */
286    struct rtems_bdbuf_buffer* right;  /**< Right Child */
287    signed char                cache;  /**< Cache */
288    signed char                bal;    /**< The balance of the sub-tree */
289  } avl;
290
291  dev_t             dev;        /**< Device number (AVL search key) */
292
293  rtems_blkdev_bnum block;      /**< Block number on the device (AVL search key) */
294
295  unsigned char*    buffer;     /**< Pointer to the buffer memory area */
296
297  volatile rtems_bdbuf_buf_state state;  /**< State of the buffer. */
298
299  volatile uint32_t  waiters;    /**< The number of threads waiting on this
300                                  * buffer. */
301  rtems_bdbuf_group* group;      /**< Pointer to the group of BDs this BD is
302                                  * part of. */
303  volatile uint32_t  hold_timer; /**< Timer to indicate how long a buffer
304                                  * has been held in the cache modified. */
305
306  int   references;              /**< Allow reference counting by owner. */
307  void* user;                    /**< User data. */
308} rtems_bdbuf_buffer;
309
310/**
311 * A group is a contiguous block of buffer descriptors. A group covers the
312 * maximum configured buffer size and is the allocation size for the buffers to
313 * a specific buffer size. If you allocate a buffer to be a specific size, all
314 * buffers in the group, if there are more than 1, will also be that size. The
315 * number of buffers in a group is a power of 2, i.e. 1, 2, 4, 8, etc.
316 */
317struct rtems_bdbuf_group
318{
319  rtems_chain_node    link;          /**< Link the groups on a LRU list if they
320                                      * have no buffers in use. */
321  size_t              bds_per_group; /**< The number of BDs allocated to this
322                                      * group. This value must be a power of
323                                      * 2. */
324  uint32_t            users;         /**< How many users the group has. */
325  rtems_bdbuf_buffer* bdbuf;         /**< First BD this group covers. */
326};
327
328/**
329 * Buffering configuration definition. See confdefs.h for support on using this
330 * structure.
331 */
332typedef struct rtems_bdbuf_config {
333  uint32_t            max_read_ahead_blocks;   /**< Number of blocks to read
334                                                * ahead. */
335  uint32_t            max_write_blocks;        /**< Number of blocks to write
336                                                * at once. */
337  rtems_task_priority swapout_priority;        /**< Priority of the swap out
338                                                * task. */
339  uint32_t            swapout_period;          /**< Period the swapout task
340                                                * checks buffer timers. */
341  uint32_t            swap_block_hold;         /**< Period a buffer is held. */
342  size_t              swapout_workers;         /**< The number of worker
343                                                * threads for the swapout
344                                                * task. */
345  rtems_task_priority swapout_worker_priority; /**< Priority of the swap out
346                                                * worker tasks. */
347  size_t              size;                    /**< Size of memory in the
348                                                * cache */
349  uint32_t            buffer_min;              /**< Minimum buffer size. */
350  uint32_t            buffer_max;              /**< Maximum buffer size
351                                                * supported. It is also the
352                                                * allocation size. */
353} rtems_bdbuf_config;
354
355/**
356 * External reference to the buffering configuration.
357 *
358 * The read-only configuration is provided by the application.
359 */
360extern const rtems_bdbuf_config rtems_bdbuf_configuration;
361
362/**
363 * Default read ahead block count.  max_read_ahead_blocks is altered if there
364 * are fewer buffers than this defined max.  This stops thrashing in the cache.
365 */
366#define RTEMS_BDBUF_MAX_READ_AHEAD_BLOCKS_DEFAULT    0
367
368/**
369 * Default maximum number of blocks to write at once.
370 */
371#define RTEMS_BDBUF_MAX_WRITE_BLOCKS_DEFAULT         16
372
373/**
374 * Default swap-out task priority.
375 */
376#define RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT    15
377
378/**
379 * Default swap-out task swap period in milliseconds.
380 */
381#define RTEMS_BDBUF_SWAPOUT_TASK_SWAP_PERIOD_DEFAULT 250
382
383/**
384 * Default swap-out task block hold time in milliseconds.
385 */
386#define RTEMS_BDBUF_SWAPOUT_TASK_BLOCK_HOLD_DEFAULT  1000
387
388/**
389 * Default number of swap-out worker tasks. Currently disabled.
390 */
391#define RTEMS_BDBUF_SWAPOUT_WORKER_TASKS_DEFAULT     0
392
393/**
394 * Default swap-out worker task priority. The same as the swapout task.
395 */
396#define RTEMS_BDBUF_SWAPOUT_WORKER_TASK_PRIORITY_DEFAULT \
397                             RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT
398
399/**
400 * Default size of memory allocated to the cache.
401 */
402#define RTEMS_BDBUF_CACHE_MEMORY_SIZE_DEFAULT (64 * 512)
403
404/**
405 * Default minimum size of buffers.
406 */
407#define RTEMS_BDBUF_BUFFER_MIN_SIZE_DEFAULT (512)
408
409/**
410 * Default maximum size of buffers.
411 */
412#define RTEMS_BDBUF_BUFFER_MAX_SIZE_DEFAULT (4096)
413
414/**
415 * Prepare the buffering layer to work - initialize the buffer descriptors and
416 * (if necessary) the buffers. After initialization all buffers are in the
417 * FREE state.
418 *
419 * @return RTEMS status code (RTEMS_SUCCESSFUL if the operation completed
420 *         successfully or an error code if an error occurred)
421 */
422rtems_status_code
423rtems_bdbuf_init (void);
424
425/**
426 * Get a block buffer for data to be written into. The buffer is set to the
427 * access or access modified state. If the buffer is in the cache and modified
428 * the state is access modified else the state is access. The buffer contents
429 * are not initialised if the buffer is not already in the cache. If the block
430 * is already resident in memory it is returned, however if not in memory the
431 * buffer is not read from disk. This call is used when writing the whole block
432 * on a disk rather than just changing a part of it. If there are no buffers
433 * available this call will block. A buffer obtained with this call will not be
434 * involved in a transfer request and will not be returned to another user
435 * until released. If the buffer is already with a user when this call is made
436 * the call is blocked until the buffer is returned. The highest priority
437 * waiter will obtain the buffer first.
438 *
439 * The block number is the linear block number. This is relative to the start
440 * of the partition on the media.
441 *
442 * @param device Device number (constructed of major and minor device number)
443 * @param block  Linear media block number
444 * @param bd     Reference to the buffer descriptor pointer.
445 *
446 * @return       RTEMS status code (RTEMS_SUCCESSFUL if the operation completed
447 *               successfully or an error code if an error occurred)
448 */
449rtems_status_code
450rtems_bdbuf_get (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd);
451
452/**
453 * Get the block buffer and if not already in the cache read from the disk. If
454 * the specified block is already cached it is returned. The buffer is set to
455 * the access or access modified state. If the buffer is in the cache and
456 * modified the state is access modified else the state is access. If the block
457 * is already being read from disk or being written to disk this call blocks.
458 * If the buffer is waiting to be written it is removed from the modified queue
459 * and returned to the user. If the buffer is not in the cache a new buffer is
460 * obtained and the data read from disk. The call may block until these
461 * operations complete. A buffer obtained with this call will not be involved
462 * in a transfer request and will not be returned to another user until
463 * released. If the buffer is already with a user when this call is made the
464 * call is blocked until the buffer is returned. The highest priority waiter
465 * will obtain the buffer first.
466 *
467 * @param device Device number (constructed of major and minor device number)
468 * @param block  Linear media block number
469 * @param bd     Reference to the buffer descriptor pointer.
470 *
471 * @return       RTEMS status code (RTEMS_SUCCESSFUL if the operation completed
472 *               successfully or an error code if an error occurred)
473 */
474rtems_status_code
475rtems_bdbuf_read (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd);
476
477/**
478 * Release the buffer obtained by a read call back to the cache. If the buffer
479 * was obtained by a get call and was not already in the cache the release
480 * modified call should be used. A buffer released with this call obtained by a
481 * get call may not be in sync with the contents on disk. If the buffer was in
482 * the cache and modified before this call it will be returned to the modified
483 * queue. The buffer is returned to the end of the LRU list.
484 *
485 * @param bd Reference to the buffer descriptor.
486 *
487 * @return RTEMS status code (RTEMS_SUCCESSFUL if the operation completed
488 *         successfully or an error code if an error occurred)
489 */
490rtems_status_code
491rtems_bdbuf_release (rtems_bdbuf_buffer* bd);
492
493/**
494 * Release the buffer allocated with a get or read call placing it on the
495 * modified list.  If the buffer was not released modified before, the hold
496 * timer is set to the configuration value. If the buffer had been released
497 * modified before but not written to disk the hold timer is not updated. The
498 * buffer will be written to disk when the hold timer has expired, there are
499 * no more buffers available in the cache and a get or read buffer needs one
500 * or a sync call has been made. If the buffer is obtained with a get or read
501 * before the hold timer has expired the buffer will be returned to the user.
502 *
503 * @param bd Reference to the buffer descriptor.
504 *
505 * @return RTEMS status code (RTEMS_SUCCESSFUL if the operation completed
506 *         successfully or an error code if an error occurred)
507 */
508rtems_status_code
509rtems_bdbuf_release_modified (rtems_bdbuf_buffer* bd);
510
511/**
512 * Release the buffer as modified and wait until it has been synchronized with
513 * the disk by writing it. This buffer will be the first to be transferred to
514 * disk and other buffers may also be written if the maximum number of blocks
515 * in a request allows it.
516 *
517 * @note This code does not lock the sync mutex and stop additions to the
518 *       modified queue.
519 *
520 * @param bd Reference to the buffer descriptor.
521 *
522 * @return RTEMS status code (RTEMS_SUCCESSFUL if the operation completed
523 *         successfully or an error code if an error occurred)
524 */
525rtems_status_code
526rtems_bdbuf_sync (rtems_bdbuf_buffer* bd);
527
528/**
529 * Synchronize all modified buffers for this device with the disk and wait
530 * until the transfers have completed. The sync mutex for the cache is locked
531 * stopping the addition of any further modified buffers. It is only the
532 * currently modified buffers that are written.
533 *
534 * @note Nesting calls to sync multiple devices will be handled sequentially. A
535 * nested call will be blocked until the first sync request has completed.
536 *
537 * @param dev Block device number
538 *
539 * @return RTEMS status code (RTEMS_SUCCESSFUL if the operation completed
540 *         successfully or an error code if an error occurred)
541 */
542rtems_status_code
543rtems_bdbuf_syncdev (dev_t dev);
544
545/** @} */
546
547#ifdef __cplusplus
548}
549#endif
550
551#endif
Note: See TracBrowser for help on using the repository browser.