1 | /** |
---|
2 | * @file |
---|
3 | * |
---|
4 | * @ingroup rtems_bdbuf |
---|
5 | * |
---|
6 | * Block device buffer management. |
---|
7 | */ |
---|
8 | |
---|
9 | /* |
---|
10 | * Copyright (C) 2001 OKTET Ltd., St.-Petersburg, Russia |
---|
11 | * Author: Victor V. Vengerov <vvv@oktet.ru> |
---|
12 | * |
---|
13 | * Copyright (C) 2008,2009 Chris Johns <chrisj@rtems.org> |
---|
14 | * Rewritten to remove score mutex access. Fixes many performance |
---|
15 | * issues. |
---|
16 | * Change to support demand driven variable buffer sizes. |
---|
17 | * |
---|
18 | * @(#) bdbuf.h,v 1.9 2005/02/02 00:06:18 joel Exp |
---|
19 | */ |
---|
20 | |
---|
21 | #ifndef _RTEMS_BDBUF_H |
---|
22 | #define _RTEMS_BDBUF_H |
---|
23 | |
---|
24 | #include <rtems.h> |
---|
25 | #include <rtems/libio.h> |
---|
26 | #include <rtems/chain.h> |
---|
27 | |
---|
28 | #include <rtems/blkdev.h> |
---|
29 | #include <rtems/diskdevs.h> |
---|
30 | |
---|
31 | #ifdef __cplusplus |
---|
32 | extern "C" { |
---|
33 | #endif |
---|
34 | |
---|
35 | /** |
---|
36 | * @defgroup rtems_libblock Block Device Library |
---|
37 | * |
---|
38 | * Block device modules. |
---|
39 | */ |
---|
40 | |
---|
41 | /** |
---|
42 | * @defgroup rtems_bdbuf Block Device Buffer Management |
---|
43 | * |
---|
44 | * @ingroup rtems_libblock |
---|
45 | * |
---|
46 | * The Block Device Buffer Management implements a cache between the disk |
---|
47 | * devices and file systems. The code provides read ahead and write queuing to |
---|
48 | * the drivers and fast cache look up using an AVL tree. |
---|
49 | * |
---|
50 | * The block size used by a file system can be set at runtime and must be a |
---|
51 | * multiple of the disk device block size. The disk device's physical block |
---|
52 | * size is called the media block size. The file system can set the block size |
---|
53 | * it uses to a larger multiple of the media block size. The driver must be |
---|
54 | * able to handle buffers sizes larger than one media block. |
---|
55 | * |
---|
56 | * The user configures the amount of memory to be used as buffers in the cache, |
---|
57 | * and the minimum and maximum buffer size. The cache will allocate additional |
---|
58 | * memory for the buffer descriptors and groups. There are enough buffer |
---|
59 | * descriptors allocated so all the buffer memory can be used as minimum sized |
---|
60 | * buffers. |
---|
61 | * |
---|
62 | * The cache is a single pool of buffers. The buffer memory is divided into |
---|
63 | * groups where the size of buffer memory allocated to a group is the maximum |
---|
64 | * buffer size. A group's memory can be divided down into small buffer sizes |
---|
65 | * that are a multiple of 2 of the minimum buffer size. A group is the minimum |
---|
66 | * allocation unit for buffers of a specific size. If a buffer of maximum size |
---|
67 | * is requested the group will have a single buffer. If a buffer of minimum size |
---|
68 | * is requested the group is divided into minimum sized buffers and the |
---|
69 | * remaining buffers are held ready for use. A group keeps track of which |
---|
70 | * buffers are with a file system or driver and groups that have buffers in use |
---|
71 | * cannot be reallocated. Groups with no buffers in use can be taken and |
---|
72 | * reallocated to a new size. This is how buffers of different sizes move around |
---|
73 | * the cache. |
---|
74 | * |
---|
75 | * The buffers are held in various lists in the cache. All buffers follow this |
---|
76 | * state machine: |
---|
77 | * |
---|
78 | * @dot |
---|
79 | * digraph g { |
---|
80 | * ready [label="Ready\nRead Ahead"]; |
---|
81 | * transfer [label="Transfer"]; |
---|
82 | * accessed [label="Accessed\nAccessed Modified"]; |
---|
83 | * modified [label="Modified\nSynchronized"]; |
---|
84 | * cached [label="Cached"]; |
---|
85 | * ready -> transfer [label="Read\nRead Ahead"]; |
---|
86 | * transfer -> ready [label="Read Ahead Complete"]; |
---|
87 | * ready -> accessed [label="Get"]; |
---|
88 | * transfer -> accessed [label="Read or Write\nComplete"]; |
---|
89 | * transfer -> cached [label="Read or Write\nComplete"]; |
---|
90 | * accessed -> cached [label="Release"]; |
---|
91 | * cached -> accessed [label="Get"]; |
---|
92 | * modified -> accessed [label="Get"]; |
---|
93 | * accessed -> modified [label="Modified"]; |
---|
94 | * accessed -> transfer [label="Swap"]; |
---|
95 | * } |
---|
96 | * @enddot |
---|
97 | * |
---|
98 | * Empty buffers are added to the ready list and removed from this queue when a |
---|
99 | * caller requests a buffer. This is referred to as getting a buffer in the |
---|
100 | * code and the event get in the state diagram. The buffer is assigned to a |
---|
101 | * block and inserted to the AVL based on the block/device key. If the block is |
---|
102 | * to be read by the user and not in the cache (ready) it is transferred from |
---|
103 | * the disk into memory. If no ready buffers exist the buffer is taken from the |
---|
104 | * LRU list. If no buffers are on the LRU list the modified list is checked. If |
---|
105 | * no buffers are on the modified list the request blocks. If buffers are on |
---|
106 | * the modified list the buffers' hold timer is expired and the swap out task |
---|
107 | * is woken. |
---|
108 | * |
---|
109 | * A block being accessed is given to the file system layer and is not accessible |
---|
110 | * to another requester until released back to the cache. The same goes for a |
---|
111 | * buffer in the transfer state. The transfer state means being read or |
---|
112 | * written. If the file system has modified the block and releases it as |
---|
113 | * modified it is placed on the cache's modified list and a hold timer is |
---|
114 | * initialised. The buffer is held for the hold time before being written to |
---|
115 | * disk. Buffers are held for a configurable period of time on the modified |
---|
116 | * list as a write sets the state to transfer and this locks the buffer out |
---|
117 | * from the file system until the write completes. Buffers are often accessed |
---|
118 | * and modified in a series of small updates so if sent to the disk when |
---|
119 | * released as modified the user would have to block waiting until it had been |
---|
120 | * written. This would be a performance problem. |
---|
121 | * |
---|
122 | * The code performs multiple block reads and writes. Multiple block reads or |
---|
123 | * read ahead increases performance with hardware that supports it. It also |
---|
124 | * helps with a large cache as the disk head movement is reduced. It is, however, |
---|
125 | * a speculative operation so excessive use can remove valuable and needed |
---|
126 | * blocks from the cache. The get call knows if a read is for the file system |
---|
127 | * or if it is a read ahead get. If the get is for a read ahead block and the |
---|
128 | * block is already in the cache or no ready buffers are available the read |
---|
129 | * ahead is stopped. The transfer occurs with the blocks so far. If a buffer is |
---|
130 | * in the read ahead state and released it is placed on the ready list rather |
---|
131 | * than the LRU list. This means these buffers are used before buffers used by |
---|
132 | * the file system. |
---|
133 | * |
---|
134 | * The cache has the following lists of buffers: |
---|
135 | * - @c ready: Empty buffers created when the pool is initialised. |
---|
136 | * - @c modified: Buffers waiting to be written to disk. |
---|
137 | * - @c sync: Buffers to be synced to disk. |
---|
138 | * - @c lru: Accessed buffers released in least recently used order. |
---|
139 | * |
---|
140 | * The cache scans the ready list then the LRU list for a suitable buffer in |
---|
141 | * this order. A suitable buffer is one that matches the same allocation size |
---|
142 | * as the device the buffer is for. If a buffer's group has no buffers in use |
---|
143 | * with the file system or driver the group is reallocated. This means the |
---|
144 | * buffers in the group are invalidated, resized and placed on the ready queue. |
---|
145 | * There is a performance issue with this design. The reallocation of a group |
---|
146 | * may force recently accessed buffers out of the cache when they should |
---|
147 | * not. The design should be changed to have groups on a LRU list if they have |
---|
148 | * no buffers in use. |
---|
149 | * |
---|
150 | * @{ |
---|
151 | */ |
---|
152 | |
---|
/**
 * The states a buffer can be in while it is managed by the cache.
 */
typedef enum {
  /** The buffer is not in use. */
  RTEMS_BDBUF_STATE_EMPTY = 0,
  /** The buffer holds read ahead data only. */
  RTEMS_BDBUF_STATE_READ_AHEAD = 1,
  /** The buffer is in the cache and available. */
  RTEMS_BDBUF_STATE_CACHED = 2,
  /** The user has the buffer. */
  RTEMS_BDBUF_STATE_ACCESS = 3,
  /** The buffer is in the cache but modified. */
  RTEMS_BDBUF_STATE_MODIFIED = 4,
  /** The buffer is with the user but modified. */
  RTEMS_BDBUF_STATE_ACCESS_MODIFIED = 5,
  /** The buffer has been requested to be sync'ed. */
  RTEMS_BDBUF_STATE_SYNC = 6,
  /** The buffer is being transferred to or from disk. */
  RTEMS_BDBUF_STATE_TRANSFER = 7
} rtems_bdbuf_buf_state;
---|
167 | |
---|
/**
 * Forward declaration of the buffer group type. It allows a buffer descriptor
 * to hold a pointer to its group before the group structure is defined.
 */
typedef struct rtems_bdbuf_group rtems_bdbuf_group;
---|
173 | |
---|
174 | /** |
---|
175 | * To manage buffers we using buffer descriptors (BD). A BD holds a buffer plus |
---|
176 | * a range of other information related to managing the buffer in the cache. To |
---|
177 | * speed-up buffer lookup descriptors are organized in AVL-Tree. The fields |
---|
178 | * 'dev' and 'block' are search keys. |
---|
179 | */ |
---|
180 | typedef struct rtems_bdbuf_buffer |
---|
181 | { |
---|
182 | rtems_chain_node link; /**< Link the BD onto a number of lists. */ |
---|
183 | |
---|
184 | struct rtems_bdbuf_avl_node |
---|
185 | { |
---|
186 | struct rtems_bdbuf_buffer* left; /**< Left Child */ |
---|
187 | struct rtems_bdbuf_buffer* right; /**< Right Child */ |
---|
188 | signed char cache; /**< Cache */ |
---|
189 | signed char bal; /**< The balance of the sub-tree */ |
---|
190 | } avl; |
---|
191 | |
---|
192 | dev_t dev; /**< device number */ |
---|
193 | |
---|
194 | rtems_blkdev_bnum block; /**< block number on the device */ |
---|
195 | |
---|
196 | unsigned char* buffer; /**< Pointer to the buffer memory area */ |
---|
197 | int error; /**< If not 0 indicate an error value (errno) |
---|
198 | * which can be used by user later */ |
---|
199 | |
---|
200 | volatile rtems_bdbuf_buf_state state; /**< State of the buffer. */ |
---|
201 | |
---|
202 | volatile uint32_t waiters; /**< The number of threads waiting on this |
---|
203 | * buffer. */ |
---|
204 | rtems_bdbuf_group* group; /**< Pointer to the group of BDs this BD is |
---|
205 | * part of. */ |
---|
206 | volatile uint32_t hold_timer; /**< Timer to indicate how long a buffer |
---|
207 | * has been held in the cache modified. */ |
---|
208 | |
---|
209 | int references; /**< Allow reference counting by owner. */ |
---|
210 | void* user; /**< User data. */ |
---|
211 | } rtems_bdbuf_buffer; |
---|
212 | |
---|
213 | /** |
---|
214 | * A group is a continuous block of buffer descriptors. A group covers the |
---|
215 | * maximum configured buffer size and is the allocation size for the buffers to |
---|
216 | * a specific buffer size. If you allocate a buffer to be a specific size, all |
---|
217 | * buffers in the group, if there are more than 1 will also be that size. The |
---|
218 | * number of buffers in a group is a multiple of 2, ie 1, 2, 4, 8, etc. |
---|
219 | */ |
---|
220 | struct rtems_bdbuf_group |
---|
221 | { |
---|
222 | rtems_chain_node link; /**< Link the groups on a LRU list if they |
---|
223 | * have no buffers in use. */ |
---|
224 | size_t bds_per_group; /**< The number of BD allocated to this |
---|
225 | * group. This value must be a multiple of |
---|
226 | * 2. */ |
---|
227 | uint32_t users; /**< How many users the block has. */ |
---|
228 | rtems_bdbuf_buffer* bdbuf; /**< First BD this block covers. */ |
---|
229 | }; |
---|
230 | |
---|
231 | /** |
---|
232 | * Buffering configuration definition. See confdefs.h for support on using this |
---|
233 | * structure. |
---|
234 | */ |
---|
235 | typedef struct rtems_bdbuf_config { |
---|
236 | uint32_t max_read_ahead_blocks; /**< Number of blocks to read |
---|
237 | * ahead. */ |
---|
238 | uint32_t max_write_blocks; /**< Number of blocks to write |
---|
239 | * at once. */ |
---|
240 | rtems_task_priority swapout_priority; /**< Priority of the swap out |
---|
241 | * task. */ |
---|
242 | uint32_t swapout_period; /**< Period swapout checks buf |
---|
243 | * timers. */ |
---|
244 | uint32_t swap_block_hold; /**< Period a buffer is held. */ |
---|
245 | size_t swapout_workers; /**< The number of worker |
---|
246 | * threads for the swapout |
---|
247 | * task. */ |
---|
248 | rtems_task_priority swapout_worker_priority; /**< Priority of the swap out |
---|
249 | * task. */ |
---|
250 | size_t size; /**< Size of memory in the |
---|
251 | * cache */ |
---|
252 | uint32_t buffer_min; /**< Minimum buffer size. */ |
---|
253 | uint32_t buffer_max; /**< Maximum buffer size |
---|
254 | * supported. It is also the |
---|
255 | * allocation size. */ |
---|
256 | } rtems_bdbuf_config; |
---|
257 | |
---|
258 | /** |
---|
259 | * External reference to the configuration. |
---|
260 | * |
---|
261 | * The configuration is provided by the application. |
---|
262 | */ |
---|
263 | extern const rtems_bdbuf_config rtems_bdbuf_configuration; |
---|
264 | |
---|
/**
 * Default maximum number of blocks to read ahead. The max_read_ahead_blocks
 * value is altered if there are fewer buffers than this defined maximum. This
 * stops thrashing in the cache.
 */
#define RTEMS_BDBUF_MAX_READ_AHEAD_BLOCKS_DEFAULT 32

/**
 * Default maximum number of blocks written at once.
 */
#define RTEMS_BDBUF_MAX_WRITE_BLOCKS_DEFAULT 16

/**
 * Default priority of the swap-out task.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT 15

/**
 * Default swap period of the swap-out task in milliseconds.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_SWAP_PERIOD_DEFAULT 250

/**
 * Default block hold time of the swap-out task in milliseconds.
 */
#define RTEMS_BDBUF_SWAPOUT_TASK_BLOCK_HOLD_DEFAULT 1000

/**
 * Default number of swap-out worker tasks. Zero: currently disabled.
 */
#define RTEMS_BDBUF_SWAPOUT_WORKER_TASKS_DEFAULT 0

/**
 * Default priority of the swap-out worker tasks. It is the same as the
 * priority of the swap-out task.
 */
#define RTEMS_BDBUF_SWAPOUT_WORKER_TASK_PRIORITY_DEFAULT \
  RTEMS_BDBUF_SWAPOUT_TASK_PRIORITY_DEFAULT

/**
 * Default size of the memory allocated to the cache.
 */
#define RTEMS_BDBUF_CACHE_MEMORY_SIZE_DEFAULT (64 * 512)

/**
 * Default minimum buffer size.
 */
#define RTEMS_BDBUF_BUFFER_MIN_SIZE_DEFAULT (512)

/**
 * Default maximum buffer size.
 */
#define RTEMS_BDBUF_BUFFER_MAX_SIZE_DEFAULT (4096)
---|
316 | |
---|
317 | /** |
---|
318 | * Prepare buffering layer to work - initialize buffer descritors and (if it is |
---|
319 | * neccessary) buffers. After initialization all blocks is placed into the |
---|
320 | * ready state. |
---|
321 | * |
---|
322 | * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed |
---|
323 | * successfully or error code if error is occured) |
---|
324 | */ |
---|
325 | rtems_status_code |
---|
326 | rtems_bdbuf_init (void); |
---|
327 | |
---|
328 | /** |
---|
329 | * Get block buffer for data to be written into. The buffers is set to the |
---|
330 | * access or modifed access state. If the buffer is in the cache and modified |
---|
331 | * the state is access modified else the state is access. This buffer contents |
---|
332 | * are not initialised if the buffer is not already in the cache. If the block |
---|
333 | * is already resident in memory it is returned how-ever if not in memory the |
---|
334 | * buffer is not read from disk. This call is used when writing the whole block |
---|
335 | * on a disk rather than just changing a part of it. If there is no buffers |
---|
336 | * available this call will block. A buffer obtained with this call will not be |
---|
337 | * involved in a transfer request and will not be returned to another user |
---|
338 | * until released. If the buffer is already with a user when this call is made |
---|
339 | * the call is blocked until the buffer is returned. The highest priority |
---|
340 | * waiter will obtain the buffer first. |
---|
341 | * |
---|
342 | * The block number is the linear block number. This is relative to the start |
---|
343 | * of the partition on the media. |
---|
344 | * |
---|
345 | * @param device Device number (constructed of major and minor device number) |
---|
346 | * @param block Linear media block number |
---|
347 | * @param bd Reference to the buffer descriptor pointer. |
---|
348 | * |
---|
349 | * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed |
---|
350 | * successfully or error code if error is occured) |
---|
351 | */ |
---|
352 | rtems_status_code |
---|
353 | rtems_bdbuf_get (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd); |
---|
354 | |
---|
355 | /** |
---|
356 | * Get the block buffer and if not already in the cache read from the disk. If |
---|
357 | * specified block already cached return. The buffer is set to the access or |
---|
358 | * modifed access state. If the buffer is in the cache and modified the state |
---|
359 | * is access modified else the state is access. If block is already being read |
---|
360 | * from disk for being written to disk this call blocks. If the buffer is |
---|
361 | * waiting to be written it is removed from modified queue and returned to the |
---|
362 | * user. If the buffer is not in the cache a new buffer is obtained and the |
---|
363 | * data read from disk. The call may block until these operations complete. A |
---|
364 | * buffer obtained with this call will not be involved in a transfer request |
---|
365 | * and will not be returned to another user until released. If the buffer is |
---|
366 | * already with a user when this call is made the call is blocked until the |
---|
367 | * buffer is returned. The highest priority waiter will obtain the buffer |
---|
368 | * first. |
---|
369 | * |
---|
370 | * @param device Device number (constructed of major and minor device number) |
---|
371 | * @param block Linear media block number |
---|
372 | * @param bd Reference to the buffer descriptor pointer. |
---|
373 | * |
---|
374 | * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed |
---|
375 | * successfully or error code if error is occured) |
---|
376 | */ |
---|
377 | rtems_status_code |
---|
378 | rtems_bdbuf_read (dev_t device, rtems_blkdev_bnum block, rtems_bdbuf_buffer** bd); |
---|
379 | |
---|
380 | /** |
---|
381 | * Release the buffer obtained by a read call back to the cache. If the buffer |
---|
382 | * was obtained by a get call and was not already in the cache the release |
---|
383 | * modified call should be used. A buffer released with this call obtained by a |
---|
384 | * get call may not be in sync with the contents on disk. If the buffer was in |
---|
385 | * the cache and modified before this call it will be returned to the modified |
---|
386 | * queue. The buffers is returned to the end of the LRU list. |
---|
387 | * |
---|
388 | * @param bd Reference to the buffer descriptor. |
---|
389 | * |
---|
390 | * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed |
---|
391 | * successfully or error code if error is occured) |
---|
392 | */ |
---|
393 | rtems_status_code |
---|
394 | rtems_bdbuf_release (rtems_bdbuf_buffer* bd); |
---|
395 | |
---|
396 | /** |
---|
397 | * Release the buffer allocated with a get or read call placing it on the |
---|
398 | * modidied list. If the buffer was not released modified before the hold |
---|
399 | * timer is set to the configuration value. If the buffer had been released |
---|
400 | * modified before but not written to disk the hold timer is not updated. The |
---|
401 | * buffer will be written to disk when the hold timer has expired, there are |
---|
402 | * not more buffers available in the cache and a get or read buffer needs one |
---|
403 | * or a sync call has been made. If the buffer is obtained with a get or read |
---|
404 | * before the hold timer has expired the buffer will be returned to the user. |
---|
405 | * |
---|
406 | * @param bd Reference to the buffer descriptor. |
---|
407 | * |
---|
408 | * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed |
---|
409 | * successfully or error code if error is occured) |
---|
410 | */ |
---|
411 | rtems_status_code |
---|
412 | rtems_bdbuf_release_modified (rtems_bdbuf_buffer* bd); |
---|
413 | |
---|
414 | /** |
---|
415 | * Release the buffer as modified and wait until it has been synchronized with |
---|
416 | * the disk by writing it. This buffer will be the first to be transfer to disk |
---|
417 | * and other buffers may also be written if the maximum number of blocks in a |
---|
418 | * requests allows it. |
---|
419 | * |
---|
420 | * @note This code does not lock the sync mutex and stop additions to the |
---|
421 | * modified queue. |
---|
422 | |
---|
423 | * @param bd Reference to the buffer descriptor. |
---|
424 | * |
---|
425 | * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed |
---|
426 | * successfully or error code if error is occured) |
---|
427 | */ |
---|
428 | rtems_status_code |
---|
429 | rtems_bdbuf_sync (rtems_bdbuf_buffer* bd); |
---|
430 | |
---|
431 | /** |
---|
432 | * Synchronize all modified buffers for this device with the disk and wait |
---|
433 | * until the transfers have completed. The sync mutex for the cache is locked |
---|
434 | * stopping the addition of any further modifed buffers. It is only the |
---|
435 | * currently modified buffers that are written. |
---|
436 | * |
---|
437 | * @note Nesting calls to sync multiple devices will be handled sequentially. A |
---|
438 | * nested call will be blocked until the first sync request has complete. |
---|
439 | * |
---|
440 | * @param dev Block device number |
---|
441 | * |
---|
442 | * @return RTEMS status code (RTEMS_SUCCESSFUL if operation completed |
---|
443 | * successfully or error code if error is occured) |
---|
444 | */ |
---|
445 | rtems_status_code |
---|
446 | rtems_bdbuf_syncdev (dev_t dev); |
---|
447 | |
---|
448 | /** @} */ |
---|
449 | |
---|
450 | #ifdef __cplusplus |
---|
451 | } |
---|
452 | #endif |
---|
453 | |
---|
454 | #endif |
---|