1 | /*- |
---|
2 | * Copyright (c) 1990, 1993, 1994 |
---|
3 | * The Regents of the University of California. All rights reserved. |
---|
4 | * |
---|
5 | * This code is derived from software contributed to Berkeley by |
---|
6 | * Mike Olson. |
---|
7 | * |
---|
8 | * Redistribution and use in source and binary forms, with or without |
---|
9 | * modification, are permitted provided that the following conditions |
---|
10 | * are met: |
---|
11 | * 1. Redistributions of source code must retain the above copyright |
---|
12 | * notice, this list of conditions and the following disclaimer. |
---|
13 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
14 | * notice, this list of conditions and the following disclaimer in the |
---|
15 | * documentation and/or other materials provided with the distribution. |
---|
16 | * 4. Neither the name of the University nor the names of its contributors |
---|
17 | * may be used to endorse or promote products derived from this software |
---|
18 | * without specific prior written permission. |
---|
19 | * |
---|
20 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
---|
21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
---|
24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
---|
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
---|
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
---|
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
---|
28 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
---|
29 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
---|
30 | * SUCH DAMAGE. |
---|
31 | */ |
---|
32 | |
---|
33 | #if defined(LIBC_SCCS) && !defined(lint) |
---|
34 | static char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94"; |
---|
35 | #endif /* LIBC_SCCS and not lint */ |
---|
36 | #include <sys/cdefs.h> |
---|
37 | __FBSDID("$FreeBSD$"); |
---|
38 | |
---|
39 | /* |
---|
40 | * Implementation of btree access method for 4.4BSD. |
---|
41 | * |
---|
42 | * The design here was originally based on that of the btree access method |
---|
43 | * used in the Postgres database system at UC Berkeley. This implementation |
---|
44 | * is wholly independent of the Postgres code. |
---|
45 | */ |
---|
46 | |
---|
47 | #include "namespace.h" |
---|
48 | #include <rtems/bsd/sys/param.h> |
---|
49 | #include <sys/stat.h> |
---|
50 | |
---|
51 | #include <errno.h> |
---|
52 | #include <fcntl.h> |
---|
53 | #include <limits.h> |
---|
54 | #include <signal.h> |
---|
55 | #include <stdio.h> |
---|
56 | #include <stdlib.h> |
---|
57 | #include <string.h> |
---|
58 | #include <unistd.h> |
---|
59 | #include "un-namespace.h" |
---|
60 | |
---|
61 | #include <db.h> |
---|
62 | #include "btree.h" |
---|
63 | |
---|
64 | #ifdef DEBUG |
---|
65 | #undef MINPSIZE |
---|
66 | #define MINPSIZE 128 |
---|
67 | #endif |
---|
68 | |
---|
69 | static int byteorder(void); |
---|
70 | static int nroot(BTREE *); |
---|
71 | static int tmp(void); |
---|
72 | |
---|
73 | /* |
---|
74 | * __BT_OPEN -- Open a btree. |
---|
75 | * |
---|
76 | * Creates and fills a DB struct, and calls the routine that actually |
---|
77 | * opens the btree. |
---|
78 | * |
---|
79 | * Parameters: |
---|
80 | * fname: filename (NULL for in-memory trees) |
---|
81 | * flags: open flag bits |
---|
82 | * mode: open permission bits |
---|
83 | * b: BTREEINFO pointer |
---|
84 | * |
---|
85 | * Returns: |
---|
86 | * NULL on failure, pointer to DB on success. |
---|
87 | * |
---|
88 | */ |
---|
89 | DB * |
---|
90 | __bt_open(const char *fname, int flags, int mode, const BTREEINFO *openinfo, int dflags) |
---|
91 | { |
---|
92 | struct stat sb; |
---|
93 | BTMETA m; |
---|
94 | BTREE *t; |
---|
95 | BTREEINFO b; |
---|
96 | DB *dbp; |
---|
97 | pgno_t ncache; |
---|
98 | ssize_t nr; |
---|
99 | int machine_lorder, saved_errno; |
---|
100 | |
---|
101 | t = NULL; |
---|
102 | |
---|
103 | /* |
---|
104 | * Intention is to make sure all of the user's selections are okay |
---|
105 | * here and then use them without checking. Can't be complete, since |
---|
106 | * we don't know the right page size, lorder or flags until the backing |
---|
107 | * file is opened. Also, the file's page size can cause the cachesize |
---|
108 | * to change. |
---|
109 | */ |
---|
110 | machine_lorder = byteorder(); |
---|
111 | if (openinfo) { |
---|
112 | b = *openinfo; |
---|
113 | |
---|
114 | /* Flags: R_DUP. */ |
---|
115 | if (b.flags & ~(R_DUP)) |
---|
116 | goto einval; |
---|
117 | |
---|
118 | /* |
---|
119 | * Page size must be indx_t aligned and >= MINPSIZE. Default |
---|
120 | * page size is set farther on, based on the underlying file |
---|
121 | * transfer size. |
---|
122 | */ |
---|
123 | if (b.psize && |
---|
124 | (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || |
---|
125 | b.psize & (sizeof(indx_t) - 1) )) |
---|
126 | goto einval; |
---|
127 | |
---|
128 | /* Minimum number of keys per page; absolute minimum is 2. */ |
---|
129 | if (b.minkeypage) { |
---|
130 | if (b.minkeypage < 2) |
---|
131 | goto einval; |
---|
132 | } else |
---|
133 | b.minkeypage = DEFMINKEYPAGE; |
---|
134 | |
---|
135 | /* If no comparison, use default comparison and prefix. */ |
---|
136 | if (b.compare == NULL) { |
---|
137 | b.compare = __bt_defcmp; |
---|
138 | if (b.prefix == NULL) |
---|
139 | b.prefix = __bt_defpfx; |
---|
140 | } |
---|
141 | |
---|
142 | if (b.lorder == 0) |
---|
143 | b.lorder = machine_lorder; |
---|
144 | } else { |
---|
145 | b.compare = __bt_defcmp; |
---|
146 | b.cachesize = 0; |
---|
147 | b.flags = 0; |
---|
148 | b.lorder = machine_lorder; |
---|
149 | b.minkeypage = DEFMINKEYPAGE; |
---|
150 | b.prefix = __bt_defpfx; |
---|
151 | b.psize = 0; |
---|
152 | } |
---|
153 | |
---|
154 | /* Check for the ubiquitous PDP-11. */ |
---|
155 | if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) |
---|
156 | goto einval; |
---|
157 | |
---|
158 | /* Allocate and initialize DB and BTREE structures. */ |
---|
159 | if ((t = (BTREE *)calloc(1, sizeof(BTREE))) == NULL) |
---|
160 | goto err; |
---|
161 | t->bt_fd = -1; /* Don't close unopened fd on error. */ |
---|
162 | t->bt_lorder = b.lorder; |
---|
163 | t->bt_order = NOT; |
---|
164 | t->bt_cmp = b.compare; |
---|
165 | t->bt_pfx = b.prefix; |
---|
166 | t->bt_rfd = -1; |
---|
167 | |
---|
168 | if ((t->bt_dbp = dbp = (DB *)calloc(1, sizeof(DB))) == NULL) |
---|
169 | goto err; |
---|
170 | if (t->bt_lorder != machine_lorder) |
---|
171 | F_SET(t, B_NEEDSWAP); |
---|
172 | |
---|
173 | dbp->type = DB_BTREE; |
---|
174 | dbp->internal = t; |
---|
175 | dbp->close = __bt_close; |
---|
176 | dbp->del = __bt_delete; |
---|
177 | dbp->fd = __bt_fd; |
---|
178 | dbp->get = __bt_get; |
---|
179 | dbp->put = __bt_put; |
---|
180 | dbp->seq = __bt_seq; |
---|
181 | dbp->sync = __bt_sync; |
---|
182 | |
---|
183 | /* |
---|
184 | * If no file name was supplied, this is an in-memory btree and we |
---|
185 | * open a backing temporary file. Otherwise, it's a disk-based tree. |
---|
186 | */ |
---|
187 | if (fname) { |
---|
188 | switch (flags & O_ACCMODE) { |
---|
189 | case O_RDONLY: |
---|
190 | F_SET(t, B_RDONLY); |
---|
191 | break; |
---|
192 | case O_RDWR: |
---|
193 | break; |
---|
194 | case O_WRONLY: |
---|
195 | default: |
---|
196 | goto einval; |
---|
197 | } |
---|
198 | |
---|
199 | if ((t->bt_fd = _open(fname, flags, mode)) < 0) |
---|
200 | goto err; |
---|
201 | |
---|
202 | } else { |
---|
203 | if ((flags & O_ACCMODE) != O_RDWR) |
---|
204 | goto einval; |
---|
205 | if ((t->bt_fd = tmp()) == -1) |
---|
206 | goto err; |
---|
207 | F_SET(t, B_INMEM); |
---|
208 | } |
---|
209 | |
---|
210 | if (_fcntl(t->bt_fd, F_SETFD, 1) == -1) |
---|
211 | goto err; |
---|
212 | |
---|
213 | if (_fstat(t->bt_fd, &sb)) |
---|
214 | goto err; |
---|
215 | if (sb.st_size) { |
---|
216 | if ((nr = _read(t->bt_fd, &m, sizeof(BTMETA))) < 0) |
---|
217 | goto err; |
---|
218 | if (nr != sizeof(BTMETA)) |
---|
219 | goto eftype; |
---|
220 | |
---|
221 | /* |
---|
222 | * Read in the meta-data. This can change the notion of what |
---|
223 | * the lorder, page size and flags are, and, when the page size |
---|
224 | * changes, the cachesize value can change too. If the user |
---|
225 | * specified the wrong byte order for an existing database, we |
---|
226 | * don't bother to return an error, we just clear the NEEDSWAP |
---|
227 | * bit. |
---|
228 | */ |
---|
229 | if (m.magic == BTREEMAGIC) |
---|
230 | F_CLR(t, B_NEEDSWAP); |
---|
231 | else { |
---|
232 | F_SET(t, B_NEEDSWAP); |
---|
233 | M_32_SWAP(m.magic); |
---|
234 | M_32_SWAP(m.version); |
---|
235 | M_32_SWAP(m.psize); |
---|
236 | M_32_SWAP(m.free); |
---|
237 | M_32_SWAP(m.nrecs); |
---|
238 | M_32_SWAP(m.flags); |
---|
239 | } |
---|
240 | if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) |
---|
241 | goto eftype; |
---|
242 | if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || |
---|
243 | m.psize & (sizeof(indx_t) - 1) ) |
---|
244 | goto eftype; |
---|
245 | if (m.flags & ~SAVEMETA) |
---|
246 | goto eftype; |
---|
247 | b.psize = m.psize; |
---|
248 | F_SET(t, m.flags); |
---|
249 | t->bt_free = m.free; |
---|
250 | t->bt_nrecs = m.nrecs; |
---|
251 | } else { |
---|
252 | /* |
---|
253 | * Set the page size to the best value for I/O to this file. |
---|
254 | * Don't overflow the page offset type. |
---|
255 | */ |
---|
256 | if (b.psize == 0) { |
---|
257 | b.psize = sb.st_blksize; |
---|
258 | if (b.psize < MINPSIZE) |
---|
259 | b.psize = MINPSIZE; |
---|
260 | if (b.psize > MAX_PAGE_OFFSET + 1) |
---|
261 | b.psize = MAX_PAGE_OFFSET + 1; |
---|
262 | } |
---|
263 | |
---|
264 | /* Set flag if duplicates permitted. */ |
---|
265 | if (!(b.flags & R_DUP)) |
---|
266 | F_SET(t, B_NODUPS); |
---|
267 | |
---|
268 | t->bt_free = P_INVALID; |
---|
269 | t->bt_nrecs = 0; |
---|
270 | F_SET(t, B_METADIRTY); |
---|
271 | } |
---|
272 | |
---|
273 | t->bt_psize = b.psize; |
---|
274 | |
---|
275 | /* Set the cache size; must be a multiple of the page size. */ |
---|
276 | if (b.cachesize && b.cachesize & (b.psize - 1) ) |
---|
277 | b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1; |
---|
278 | if (b.cachesize < b.psize * MINCACHE) |
---|
279 | b.cachesize = b.psize * MINCACHE; |
---|
280 | |
---|
281 | /* Calculate number of pages to cache. */ |
---|
282 | ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; |
---|
283 | |
---|
284 | /* |
---|
285 | * The btree data structure requires that at least two keys can fit on |
---|
286 | * a page, but other than that there's no fixed requirement. The user |
---|
287 | * specified a minimum number per page, and we translated that into the |
---|
288 | * number of bytes a key/data pair can use before being placed on an |
---|
289 | * overflow page. This calculation includes the page header, the size |
---|
290 | * of the index referencing the leaf item and the size of the leaf item |
---|
291 | * structure. Also, don't let the user specify a minkeypage such that |
---|
292 | * a key/data pair won't fit even if both key and data are on overflow |
---|
293 | * pages. |
---|
294 | */ |
---|
295 | t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - |
---|
296 | (sizeof(indx_t) + NBLEAFDBT(0, 0)); |
---|
297 | if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) |
---|
298 | t->bt_ovflsize = |
---|
299 | NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); |
---|
300 | |
---|
301 | /* Initialize the buffer pool. */ |
---|
302 | if ((t->bt_mp = |
---|
303 | mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) |
---|
304 | goto err; |
---|
305 | if (!F_ISSET(t, B_INMEM)) |
---|
306 | mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); |
---|
307 | |
---|
308 | /* Create a root page if new tree. */ |
---|
309 | if (nroot(t) == RET_ERROR) |
---|
310 | goto err; |
---|
311 | |
---|
312 | /* Global flags. */ |
---|
313 | if (dflags & DB_LOCK) |
---|
314 | F_SET(t, B_DB_LOCK); |
---|
315 | if (dflags & DB_SHMEM) |
---|
316 | F_SET(t, B_DB_SHMEM); |
---|
317 | if (dflags & DB_TXN) |
---|
318 | F_SET(t, B_DB_TXN); |
---|
319 | |
---|
320 | return (dbp); |
---|
321 | |
---|
322 | einval: errno = EINVAL; |
---|
323 | goto err; |
---|
324 | |
---|
325 | eftype: errno = EFTYPE; |
---|
326 | goto err; |
---|
327 | |
---|
328 | err: saved_errno = errno; |
---|
329 | if (t) { |
---|
330 | if (t->bt_dbp) |
---|
331 | free(t->bt_dbp); |
---|
332 | if (t->bt_fd != -1) |
---|
333 | (void)_close(t->bt_fd); |
---|
334 | free(t); |
---|
335 | } |
---|
336 | errno = saved_errno; |
---|
337 | return (NULL); |
---|
338 | } |
---|
339 | |
---|
340 | /* |
---|
341 | * NROOT -- Create the root of a new tree. |
---|
342 | * |
---|
343 | * Parameters: |
---|
344 | * t: tree |
---|
345 | * |
---|
346 | * Returns: |
---|
347 | * RET_ERROR, RET_SUCCESS |
---|
348 | */ |
---|
349 | static int |
---|
350 | nroot(BTREE *t) |
---|
351 | { |
---|
352 | PAGE *meta, *root; |
---|
353 | pgno_t npg; |
---|
354 | |
---|
355 | if ((root = mpool_get(t->bt_mp, 1, 0)) != NULL) { |
---|
356 | if (root->lower == 0 && |
---|
357 | root->pgno == 0 && |
---|
358 | root->linp[0] == 0) { |
---|
359 | mpool_delete(t->bt_mp, root); |
---|
360 | errno = EINVAL; |
---|
361 | } else { |
---|
362 | mpool_put(t->bt_mp, root, 0); |
---|
363 | return (RET_SUCCESS); |
---|
364 | } |
---|
365 | } |
---|
366 | if (errno != EINVAL) /* It's OK to not exist. */ |
---|
367 | return (RET_ERROR); |
---|
368 | errno = 0; |
---|
369 | |
---|
370 | if ((meta = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) |
---|
371 | return (RET_ERROR); |
---|
372 | |
---|
373 | if ((root = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) |
---|
374 | return (RET_ERROR); |
---|
375 | |
---|
376 | if (npg != P_ROOT) |
---|
377 | return (RET_ERROR); |
---|
378 | root->pgno = npg; |
---|
379 | root->prevpg = root->nextpg = P_INVALID; |
---|
380 | root->lower = BTDATAOFF; |
---|
381 | root->upper = t->bt_psize; |
---|
382 | root->flags = P_BLEAF; |
---|
383 | memset(meta, 0, t->bt_psize); |
---|
384 | mpool_put(t->bt_mp, meta, MPOOL_DIRTY); |
---|
385 | mpool_put(t->bt_mp, root, MPOOL_DIRTY); |
---|
386 | return (RET_SUCCESS); |
---|
387 | } |
---|
388 | |
---|
389 | static int |
---|
390 | tmp(void) |
---|
391 | { |
---|
392 | sigset_t set, oset; |
---|
393 | int fd, len; |
---|
394 | char *envtmp = NULL; |
---|
395 | char path[MAXPATHLEN]; |
---|
396 | |
---|
397 | if (issetugid() == 0) |
---|
398 | envtmp = getenv("TMPDIR"); |
---|
399 | len = snprintf(path, |
---|
400 | sizeof(path), "%s/bt.XXXXXXXXXX", envtmp ? envtmp : "/tmp"); |
---|
401 | if (len < 0 || len >= (int)sizeof(path)) { |
---|
402 | errno = ENAMETOOLONG; |
---|
403 | return(-1); |
---|
404 | } |
---|
405 | |
---|
406 | (void)sigfillset(&set); |
---|
407 | (void)_sigprocmask(SIG_BLOCK, &set, &oset); |
---|
408 | if ((fd = mkstemp(path)) != -1) |
---|
409 | (void)unlink(path); |
---|
410 | (void)_sigprocmask(SIG_SETMASK, &oset, NULL); |
---|
411 | return(fd); |
---|
412 | } |
---|
413 | |
---|
414 | static int |
---|
415 | byteorder(void) |
---|
416 | { |
---|
417 | u_int32_t x; |
---|
418 | u_char *p; |
---|
419 | |
---|
420 | x = 0x01020304; |
---|
421 | p = (u_char *)&x; |
---|
422 | switch (*p) { |
---|
423 | case 1: |
---|
424 | return (BIG_ENDIAN); |
---|
425 | case 4: |
---|
426 | return (LITTLE_ENDIAN); |
---|
427 | default: |
---|
428 | return (0); |
---|
429 | } |
---|
430 | } |
---|
431 | |
---|
432 | int |
---|
433 | __bt_fd(const DB *dbp) |
---|
434 | { |
---|
435 | BTREE *t; |
---|
436 | |
---|
437 | t = dbp->internal; |
---|
438 | |
---|
439 | /* Toss any page pinned across calls. */ |
---|
440 | if (t->bt_pinned != NULL) { |
---|
441 | mpool_put(t->bt_mp, t->bt_pinned, 0); |
---|
442 | t->bt_pinned = NULL; |
---|
443 | } |
---|
444 | |
---|
445 | /* In-memory database can't have a file descriptor. */ |
---|
446 | if (F_ISSET(t, B_INMEM)) { |
---|
447 | errno = ENOENT; |
---|
448 | return (-1); |
---|
449 | } |
---|
450 | return (t->bt_fd); |
---|
451 | } |
---|