#include <machine/rtems-bsd-kernel-space.h>

/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2013, 2014 The FreeBSD Foundation
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <rtems/bsd/local/opt_hwpmc_hooks.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/disk.h>
#include <sys/fail.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <rtems/bsd/sys/unistd.h>
#include <sys/user.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_rdwr_t	vn_io_fault;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;
static fo_mmap_t	vn_mmap;

struct fileops vnops = {
	.fo_read = vn_io_fault,
	.fo_write = vn_io_fault,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_chmod = vn_chmod,
	.fo_chown = vn_chown,
#ifndef __rtems__
	.fo_sendfile = vn_sendfile,
#endif /* __rtems__ */
	.fo_seek = vn_seek,
#ifndef __rtems__
	.fo_fill_kinfo = vn_fill_kinfo,
	.fo_mmap = vn_mmap,
#endif /* __rtems__ */
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

static const int io_hold_cnt = 16;
static int vn_io_fault_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
static int vn_io_fault_prefault = 0;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
    &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
static u_long vn_io_faults_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");

/*
 * Returns true if vn_io_fault mode of handling the i/o request should
 * be used.
 */
static bool
do_vn_io_fault(struct vnode *vp, struct uio *uio)
{
	struct mount *mp;

	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
	    (mp = vp->v_mount) != NULL &&
	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
}

/*
 * Structure used to pass arguments to vn_io_fault1(), to do either
 * file- or vnode-based I/O calls.
 */
struct vn_io_fault_args {
	enum {
		VN_IO_FAULT_FOP,
		VN_IO_FAULT_VOP
	} kind;
	struct ucred *cred;
	int flags;
	union {
		struct fop_args_tag {
			struct file *fp;
			fo_rdwr_t *doio;
		} fop_args;
		struct vop_args_tag {
			struct vnode *vp;
		} vop_args;
	} args;
};

static int vn_io_fault1(struct vnode *vp, struct uio *uio,
    struct vn_io_fault_args *args, struct thread *td);

int
vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}

/*
 * Common code for vnode open operations via a name lookup.
 * Lookup the vnode and invoke VOP_CREATE if needed.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
    struct ucred *cred, struct file *fp)
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;

restart:
	fmode = *flagp;
	if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
	    O_EXCL | O_DIRECTORY))
		return (EINVAL);
	else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		/*
		 * Set NOCACHE to avoid flushing the cache when
		 * rolling in many files at once.
		 */
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
		if ((vn_open_flags & VN_OPEN_INVFS) == 0)
			bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
				ndp->ni_cnd.cn_flags |= MAKEENTRY;
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0)
#endif
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
		if (!(fmode & FWRITE))
			ndp->ni_cnd.cn_flags |= LOCKSHARED;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
		if ((error = namei(ndp)) != 0)
			return (error);
		vp = ndp->ni_vp;
	}
	error = vn_open_vnode(vp, fmode, cred, td, fp);
	if (error)
		goto bad;
	*flagp = fmode;
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

static int
vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
{
	struct flock lf;
	int error, lock_flags, type;

	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
	if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
		return (0);
	KASSERT(fp != NULL, ("open with flock requires fp"));
	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
		return (EOPNOTSUPP);

	lock_flags = VOP_ISLOCKED(vp);
	VOP_UNLOCK(vp, 0);

	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
	type = F_FLOCK;
	if ((fmode & FNONBLOCK) == 0)
		type |= F_WAIT;
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
	if (error == 0)
		fp->f_flag |= FHASLOCK;

	vn_lock(vp, lock_flags | LK_RETRY);
	if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
		error = ENOENT;
	return (error);
}

/*
 * Common code for vnode open operations once a vnode is located.
 * Check permissions, and call the VOP_OPEN routine.
 */
int
vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
    struct thread *td, struct file *fp)
{
	accmode_t accmode;
	int error;

	if (vp->v_type == VLNK)
		return (EMLINK);
	if (vp->v_type == VSOCK)
		return (EOPNOTSUPP);
	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
		return (ENOTDIR);
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR)
			return (EISDIR);
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
#ifndef __rtems__
	if (fmode & FEXEC)
		accmode |= VEXEC;
#endif /* __rtems__ */
	if ((fmode & O_APPEND) && (fmode & FWRITE))
		accmode |= VAPPEND;
#ifdef MAC
	if (fmode & O_CREAT)
		accmode |= VCREAT;
	if (fmode & O_VERIFY)
		accmode |= VVERIFY;
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error)
		return (error);

	accmode &= ~(VCREAT | VVERIFY);
#endif
	if ((fmode & O_CREAT) == 0 && accmode != 0) {
		error = VOP_ACCESS(vp, accmode, cred, td);
		if (error != 0)
			return (error);
	}
	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
		vn_lock(vp, LK_UPGRADE | LK_RETRY);
	error = VOP_OPEN(vp, fmode, cred, td, fp);
	if (error != 0)
		return (error);

	error = vn_open_vnode_advlock(vp, fmode, fp);
	if (error == 0 && (fmode & FWRITE) != 0) {
		error = VOP_ADD_WRITECOUNT(vp, 1);
		if (error == 0) {
			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
			    __func__, vp, vp->v_writecount);
		}
	}

	/*
	 * Error from advlock or VOP_ADD_WRITECOUNT() still requires
	 * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
	 * Arrange for that by having fdrop() to use vn_closefile().
	 */
	if (error != 0) {
#ifndef __rtems__
		fp->f_flag |= FOPENFAILED;
#endif /* __rtems__ */
		fp->f_vnode = vp;
		if (fp->f_ops == &badfileops) {
			fp->f_type = DTYPE_VNODE;
			fp->f_ops = &vnops;
		}
		vref(vp);
	}

	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
	return (error);

}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 * It is racy.
 */
int
vn_writechk(struct vnode *vp)
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (VOP_IS_TEXT(vp))
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
static int
vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
    struct thread *td, bool keep_ref)
{
	struct mount *mp;
	int error, lock_flags;

	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
	    MNT_EXTENDED_SHARED(vp->v_mount))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, lock_flags | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
#ifndef __rtems__
	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
#else /* __rtems__ */
	if ((flags & FWRITE) == FWRITE) {
#endif /* __rtems__ */
		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
		    __func__, vp, vp->v_writecount);
	}
	error = VOP_CLOSE(vp, flags, file_cred, td);
	if (keep_ref)
		VOP_UNLOCK(vp, 0);
	else
		vput(vp);
	vn_finished_write(mp);
	return (error);
}

int
vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
    struct thread *td)
{

	return (vn_close1(vp, flags, file_cred, td, false));
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
#ifndef __rtems__
	if (fp->f_flag & FRDAHEAD)
		return (fp->f_seqcount << IO_SEQSHIFT);
#endif /* __rtems__ */

	/*
	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
	 * that the first I/O is normally considered to be slightly
	 * sequential.  Seeking to offset 0 doesn't change sequentiality
	 * unless previous seeks have reduced f_seqcount to 0, in which
	 * case offset 0 is not special.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * f_seqcount is in units of fixed-size blocks so that it
		 * depends mainly on the amount of sequential I/O and not
		 * much on the number of sequential I/O's.  The fixed size
		 * of 16384 is hard-coded here since it is (not quite) just
		 * a magic size that works well here.  This size is more
		 * closely related to the best I/O size for real disks than
		 * to any block size used by software.
		 */
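		/*
		 * Worked example (editorial illustration, not part of the
		 * original sources): a single sequential 64 KiB read adds
		 * howmany(65536, 16384) = 4 to f_seqcount, and the value
		 * returned below, f_seqcount << IO_SEQSHIFT, is merged
		 * into ioflag by the callers as their read-ahead hint.
		 */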
		if (uio->uio_resid >= IO_SEQMAX * 16384)
			fp->f_seqcount = IO_SEQMAX;
		else {
			fp->f_seqcount += howmany(uio->uio_resid, 16384);
			if (fp->f_seqcount > IO_SEQMAX)
				fp->f_seqcount = IO_SEQMAX;
		}
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/* Not sequential.  Quickly draw-down sequentiality. */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	void *rl_cookie;
	struct vn_io_fault_args args;
	int error, lock_flags;

	if (offset < 0 && vp->v_type != VCHR)
		return (EINVAL);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((ioflg & IO_RANGELOCKED) == 0) {
			if (rw == UIO_READ) {
				rl_cookie = vn_rangelock_rlock(vp, offset,
				    offset + len);
			} else {
				rl_cookie = vn_rangelock_wlock(vp, offset,
				    offset + len);
			}
		} else
			rl_cookie = NULL;
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				goto out;
			if (MNT_SHARED_WRITES(mp) ||
			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
				lock_flags = LK_SHARED;
			else
				lock_flags = LK_EXCLUSIVE;
		} else
			lock_flags = LK_SHARED;
		vn_lock(vp, lock_flags | LK_RETRY);
	} else
		rl_cookie = NULL;

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred != NULL)
			cred = file_cred;
		else
			cred = active_cred;
		if (do_vn_io_fault(vp, &auio)) {
			args.kind = VN_IO_FAULT_VOP;
			args.cred = cred;
			args.flags = ioflg;
			args.args.vop_args.vp = vp;
			error = vn_io_fault1(vp, &auio, &args, td);
		} else if (rw == UIO_READ) {
			error = VOP_READ(vp, &auio, ioflg, cred);
		} else /* if (rw == UIO_WRITE) */ {
			error = VOP_WRITE(vp, &auio, ioflg, cred);
		}
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp, 0);
		if (mp != NULL)
			vn_finished_write(mp);
	}
out:
	if (rl_cookie != NULL)
		vn_rangelock_unlock(vp, rl_cookie);
	return (error);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
    off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
    struct ucred *file_cred, size_t *aresid, struct thread *td)
{
	int error = 0;
	ssize_t iaresid;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
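		/*
		 * Worked example (editorial illustration, assuming the
		 * usual MAXBSIZE of 65536): for offset 70000 the first
		 * chunk is 65536 - (70000 % 65536) = 61072 bytes, which
		 * advances the offset to 131072, a multiple of MAXBSIZE;
		 * every later chunk is then a full 65536 bytes except
		 * possibly the last one.
		 */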
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		kern_yield(PRI_USER);
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}

off_t
foffset_lock(struct file *fp, int flags)
{
	struct mtx *mtxp;
	off_t res;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

#if OFF_MAX <= LONG_MAX
	/*
	 * Caller only wants the current f_offset value.  Assume that
	 * the long and shorter integer types reads are atomic.
	 */
	if ((flags & FOF_NOLOCK) != 0)
		return (fp->f_offset);
#endif

	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if ((flags & FOF_NOLOCK) == 0) {
		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
			    "vofflock", 0);
		}
		fp->f_vnread_flags |= FOFFSET_LOCKED;
	}
	res = fp->f_offset;
	mtx_unlock(mtxp);
	return (res);
}

void
foffset_unlock(struct file *fp, off_t val, int flags)
{
	struct mtx *mtxp;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

#if OFF_MAX <= LONG_MAX
	if ((flags & FOF_NOLOCK) != 0) {
		if ((flags & FOF_NOUPDATE) == 0)
			fp->f_offset = val;
		if ((flags & FOF_NEXTOFF) != 0)
			fp->f_nextoff = val;
		return;
	}
#endif

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if ((flags & FOF_NOUPDATE) == 0)
		fp->f_offset = val;
	if ((flags & FOF_NEXTOFF) != 0)
		fp->f_nextoff = val;
	if ((flags & FOF_NOLOCK) == 0) {
		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
		    ("Lost FOFFSET_LOCKED"));
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
	}
	mtx_unlock(mtxp);
}

void
foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
{

	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = foffset_lock(fp, flags);
}

void
foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
{

	if ((flags & FOF_OFFSET) == 0)
		foffset_unlock(fp, uio->uio_offset, flags);
}

static int
get_advice(struct file *fp, struct uio *uio)
{
#ifndef __rtems__
	struct mtx *mtxp;
	int ret;

	ret = POSIX_FADV_NORMAL;
	if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
		return (ret);

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if (fp->f_advice != NULL &&
	    uio->uio_offset >= fp->f_advice->fa_start &&
	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
		ret = fp->f_advice->fa_advice;
	mtx_unlock(mtxp);
	return (ret);
#else /* __rtems__ */
	return (POSIX_FADV_NORMAL);
#endif /* __rtems__ */
}

/*
 * File table vnode read routine.
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
    struct thread *td)
{
	struct vnode *vp;
	off_t orig_offset;
	int error, ioflag;
	int advice;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	advice = get_advice(fp, uio);
	vn_lock(vp, LK_SHARED | LK_RETRY);

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* Disable read-ahead for random I/O. */
		break;
	}
	orig_offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    orig_offset != uio->uio_offset)
		/*
		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
		 * for the backing file after a POSIX_FADV_NOREUSE
		 * read(2).
		 */
		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
		    POSIX_FADV_DONTNEED);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
    struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;
	off_t orig_offset;
	int error, ioflag, lock_flags;
	int advice;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
	vp = fp->f_vnode;
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;

	advice = get_advice(fp, uio);

	if (MNT_SHARED_WRITES(mp) ||
	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
		lock_flags = LK_SHARED;
	} else {
		lock_flags = LK_EXCLUSIVE;
	}

	vn_lock(vp, lock_flags | LK_RETRY);
	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* XXX: Is this correct? */
		break;
	}
	orig_offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    orig_offset != uio->uio_offset)
		/*
		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
		 * for the backing file after a POSIX_FADV_NOREUSE
		 * write(2).
		 */
		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
		    POSIX_FADV_DONTNEED);
unlock:
	return (error);
}

/*
 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
 * prevent the following deadlock:
 *
 * Assume that the thread A reads from the vnode vp1 into userspace
 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 * currently not resident, then system ends up with the call chain
 *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
 *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
 * backed by the pages of vnode vp1, and some page in buf2 is not
 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
 *
 * To prevent the lock order reversal and deadlock, vn_io_fault() does
 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
 * Instead, it first tries to do the whole range i/o with pagefaults
 * disabled.  If all pages in the i/o buffer are resident and mapped,
 * VOP will succeed (ignoring the genuine filesystem errors).
 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
 * i/o in chunks, with all pages in the chunk prefaulted and held
 * using vm_fault_quick_hold_pages().
 *
 * Filesystems using this deadlock avoidance scheme should use the
 * array of the held pages from uio, saved in the curthread->td_ma,
 * instead of doing uiomove().  A helper function
 * vn_io_fault_uiomove() converts uiomove request into
 * uiomove_fromphys() over td_ma array.
 *
 * Since vnode locks do not cover the whole i/o anymore, rangelocks
 * make the current i/o request atomic with respect to other i/os and
 * truncations.
 */
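/*
 * Editorial sketch (not part of the original sources): a filesystem
 * that sets MNTK_NO_IOPF is expected to replace plain uiomove() in its
 * VOP_READ()/VOP_WRITE() copy loop with the helper, roughly
 *
 *	error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
 *	    (int)xfersize, uio);
 *
 * where bp, blkoffset and xfersize stand for that filesystem's own
 * buffer and transfer-size variables.  When vn_io_fault() runs the
 * request with pagefaults disabled, the helper copies through the held
 * pages in curthread->td_ma instead of touching a possibly unmapped
 * user buffer.
 */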

/*
 * Decode vn_io_fault_args and perform the corresponding i/o.
 */
static int
vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
    struct thread *td)
{
	int error, save;

	error = 0;
#ifndef __rtems__
	save = vm_fault_disable_pagefaults();
#endif /* __rtems__ */
	switch (args->kind) {
	case VN_IO_FAULT_FOP:
		error = (args->args.fop_args.doio)(args->args.fop_args.fp,
		    uio, args->cred, args->flags, td);
		break;
	case VN_IO_FAULT_VOP:
		if (uio->uio_rw == UIO_READ) {
			error = VOP_READ(args->args.vop_args.vp, uio,
			    args->flags, args->cred);
		} else if (uio->uio_rw == UIO_WRITE) {
			error = VOP_WRITE(args->args.vop_args.vp, uio,
			    args->flags, args->cred);
		}
		break;
	default:
		panic("vn_io_fault_doio: unknown kind of io %d %d",
		    args->kind, uio->uio_rw);
	}
#ifndef __rtems__
	vm_fault_enable_pagefaults(save);
#endif /* __rtems__ */
	return (error);
}

static int
vn_io_fault_touch(char *base, const struct uio *uio)
{
#ifndef __rtems__
	int r;

	r = fubyte(base);
	if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
		return (EFAULT);
	return (0);
#else /* __rtems__ */
	return (EFAULT);
#endif /* __rtems__ */
}

static int
vn_io_fault_prefault_user(const struct uio *uio)
{
#ifndef __rtems__
	char *base;
	const struct iovec *iov;
	size_t len;
	ssize_t resid;
	int error, i;

	KASSERT(uio->uio_segflg == UIO_USERSPACE,
	    ("vn_io_fault_prefault userspace"));

	error = i = 0;
	iov = uio->uio_iov;
	resid = uio->uio_resid;
	base = iov->iov_base;
	len = iov->iov_len;
	while (resid > 0) {
		error = vn_io_fault_touch(base, uio);
		if (error != 0)
			break;
		if (len < PAGE_SIZE) {
			if (len != 0) {
				error = vn_io_fault_touch(base + len - 1, uio);
				if (error != 0)
					break;
				resid -= len;
			}
			if (++i >= uio->uio_iovcnt)
				break;
			iov = uio->uio_iov + i;
			base = iov->iov_base;
			len = iov->iov_len;
		} else {
			len -= PAGE_SIZE;
			base += PAGE_SIZE;
			resid -= PAGE_SIZE;
		}
	}
	return (error);
#else /* __rtems__ */
	return (EFAULT);
#endif /* __rtems__ */
}

/*
 * Common code for vn_io_fault(), agnostic to the kind of i/o request.
 * Uses vn_io_fault_doio() to make the call to an actual i/o function.
 * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
 * into args and call vn_io_fault1() to handle faults during the user
 * mode buffer accesses.
 */
static int
vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
    struct thread *td)
{
	vm_page_t ma[io_hold_cnt + 2];
	struct uio *uio_clone, short_uio;
	struct iovec short_iovec[1];
	vm_page_t *prev_td_ma;
	vm_prot_t prot;
	vm_offset_t addr, end;
	size_t len, resid;
	ssize_t adv;
	int error, cnt, saveheld, prev_td_ma_cnt;
#ifdef __rtems__
	struct uio uio_clone_;
#endif /* __rtems__ */

	if (vn_io_fault_prefault) {
		error = vn_io_fault_prefault_user(uio);
		if (error != 0)
			return (error); /* Or ignore ? */
	}

	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;

	/*
	 * The UFS follows IO_UNIT directive and replays back both
	 * uio_offset and uio_resid if an error is encountered during the
	 * operation.  But, since the iovec may be already advanced,
	 * uio is still in an inconsistent state.
	 *
	 * Cache a copy of the original uio, which is advanced to the redo
	 * point using UIO_NOCOPY below.
	 */
#ifndef __rtems__
	uio_clone = cloneuio(uio);
#else /* __rtems__ */
	uio_clone_ = *uio;
	uio_clone = &uio_clone_;
#endif /* __rtems__ */
	resid = uio->uio_resid;

	short_uio.uio_segflg = UIO_USERSPACE;
	short_uio.uio_rw = uio->uio_rw;
	short_uio.uio_td = uio->uio_td;

	error = vn_io_fault_doio(args, uio, td);
	if (error != EFAULT)
		goto out;

	atomic_add_long(&vn_io_faults_cnt, 1);
	uio_clone->uio_segflg = UIO_NOCOPY;
	uiomove(NULL, resid - uio->uio_resid, uio_clone);
	uio_clone->uio_segflg = uio->uio_segflg;

#ifndef __rtems__
	saveheld = curthread_pflags_set(TDP_UIOHELD);
	prev_td_ma = td->td_ma;
	prev_td_ma_cnt = td->td_ma_cnt;
#endif /* __rtems__ */

	while (uio_clone->uio_resid != 0) {
		len = uio_clone->uio_iov->iov_len;
		if (len == 0) {
			KASSERT(uio_clone->uio_iovcnt >= 1,
			    ("iovcnt underflow"));
			uio_clone->uio_iov++;
			uio_clone->uio_iovcnt--;
			continue;
		}
		if (len > io_hold_cnt * PAGE_SIZE)
			len = io_hold_cnt * PAGE_SIZE;
		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
		end = round_page(addr + len);
		if (end < addr) {
			error = EFAULT;
			break;
		}
		cnt = atop(end - trunc_page(addr));
#ifndef __rtems__
		/*
		 * A perfectly misaligned address and length could cause
		 * both the start and the end of the chunk to use partial
		 * page.  +2 accounts for such a situation.
		 */
		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
		    addr, len, prot, ma, io_hold_cnt + 2);
		if (cnt == -1) {
			error = EFAULT;
			break;
		}
#endif /* __rtems__ */
		short_uio.uio_iov = &short_iovec[0];
		short_iovec[0].iov_base = (void *)addr;
		short_uio.uio_iovcnt = 1;
		short_uio.uio_resid = short_iovec[0].iov_len = len;
		short_uio.uio_offset = uio_clone->uio_offset;
#ifndef __rtems__
		td->td_ma = ma;
		td->td_ma_cnt = cnt;
#endif /* __rtems__ */

		error = vn_io_fault_doio(args, &short_uio, td);
#ifndef __rtems__
		vm_page_unhold_pages(ma, cnt);
#endif /* __rtems__ */
		adv = len - short_uio.uio_resid;

		uio_clone->uio_iov->iov_base =
		    (char *)uio_clone->uio_iov->iov_base + adv;
		uio_clone->uio_iov->iov_len -= adv;
		uio_clone->uio_resid -= adv;
		uio_clone->uio_offset += adv;

		uio->uio_resid -= adv;
		uio->uio_offset += adv;

		if (error != 0 || adv == 0)
			break;
	}
#ifndef __rtems__
	td->td_ma = prev_td_ma;
	td->td_ma_cnt = prev_td_ma_cnt;
	curthread_pflags_restore(saveheld);
#endif /* __rtems__ */
out:
#ifndef __rtems__
	free(uio_clone, M_IOV);
#endif /* __rtems__ */
	return (error);
}

static int
vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
#ifdef __rtems__
	fo_rdwr_t *doio;
	struct vnode *vp;
	void *rl_cookie;
	struct vn_io_fault_args args;
	int error;

	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
	vp = fp->f_vnode;
	foffset_lock_uio(fp, uio, flags);
	if (do_vn_io_fault(vp, uio)) {
		args.kind = VN_IO_FAULT_FOP;
		args.args.fop_args.fp = fp;
		args.args.fop_args.doio = doio;
		args.cred = active_cred;
		args.flags = flags | FOF_OFFSET;
		if (uio->uio_rw == UIO_READ) {
			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
			    uio->uio_offset + uio->uio_resid);
		} else if ((fp->f_flag & O_APPEND) != 0 ||
		    (flags & FOF_OFFSET) == 0) {
			/* For appenders, punt and lock the whole range. */
			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
		} else {
			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
			    uio->uio_offset + uio->uio_resid);
		}
		error = vn_io_fault1(vp, uio, &args, td);
		vn_rangelock_unlock(vp, rl_cookie);
	} else {
		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
	}
	foffset_unlock_uio(fp, uio, flags);
	return (error);
#else /* __rtems__ */
	return (EFAULT);
#endif /* __rtems__ */
}

/*
 * Helper function to perform the requested uiomove operation using
 * the held pages for io->uio_iov[0].iov_base buffer instead of
 * copyin/copyout.  Access to the pages with uiomove_fromphys()
 * instead of iov_base prevents page faults that could occur due to
 * pmap_collect() invalidating the mapping created by
 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 * object cleanup revoking the write access from page mappings.
 *
 * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 * instead of plain uiomove().
 */
int
vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
{
#ifndef __rtems__
1256 | struct uio transp_uio; |
---|
1257 | struct iovec transp_iov[1]; |
---|
1258 | struct thread *td; |
---|
1259 | size_t adv; |
---|
1260 | int error, pgadv; |
---|
1261 | |
---|
1262 | td = curthread; |
---|
1263 | if ((td->td_pflags & TDP_UIOHELD) == 0 || |
---|
1264 | uio->uio_segflg != UIO_USERSPACE) |
---|
1265 | #endif /* __rtems__ */ |
---|
1266 | return (uiomove(data, xfersize, uio)); |
---|
1267 | |
---|
1268 | #ifndef __rtems__ |
---|
1269 | KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); |
---|
1270 | transp_iov[0].iov_base = data; |
---|
1271 | transp_uio.uio_iov = &transp_iov[0]; |
---|
1272 | transp_uio.uio_iovcnt = 1; |
---|
1273 | if (xfersize > uio->uio_resid) |
---|
1274 | xfersize = uio->uio_resid; |
---|
1275 | transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; |
---|
1276 | transp_uio.uio_offset = 0; |
---|
1277 | transp_uio.uio_segflg = UIO_SYSSPACE; |
---|
1278 | /* |
---|
1279 | * Since transp_iov points to data, and td_ma page array |
---|
1280 | * corresponds to original uio->uio_iov, we need to invert the |
---|
1281 | * direction of the i/o operation as passed to |
---|
1282 | * uiomove_fromphys(). |
---|
1283 | */ |
---|
1284 | switch (uio->uio_rw) { |
---|
1285 | case UIO_WRITE: |
---|
1286 | transp_uio.uio_rw = UIO_READ; |
---|
1287 | break; |
---|
1288 | case UIO_READ: |
---|
1289 | transp_uio.uio_rw = UIO_WRITE; |
---|
1290 | break; |
---|
1291 | } |
---|
1292 | transp_uio.uio_td = uio->uio_td; |
---|
1293 | error = uiomove_fromphys(td->td_ma, |
---|
1294 | ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, |
---|
1295 | xfersize, &transp_uio); |
---|
1296 | adv = xfersize - transp_uio.uio_resid; |
---|
1297 | pgadv = |
---|
1298 | (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - |
---|
1299 | (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); |
---|
1300 | td->td_ma += pgadv; |
---|
1301 | KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, |
---|
1302 | pgadv)); |
---|
1303 | td->td_ma_cnt -= pgadv; |
---|
1304 | uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; |
---|
1305 | uio->uio_iov->iov_len -= adv; |
---|
1306 | uio->uio_resid -= adv; |
---|
1307 | uio->uio_offset += adv; |
---|
1308 | return (error); |
---|
1309 | #endif /* __rtems__ */ |
---|
1310 | } |
---|
1311 | |
---|
1312 | int |
---|
1313 | vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, |
---|
1314 | struct uio *uio) |
---|
1315 | { |
---|
1316 | #ifndef __rtems__ |
---|
1317 | struct thread *td; |
---|
1318 | vm_offset_t iov_base; |
---|
1319 | int cnt, pgadv; |
---|
1320 | |
---|
1321 | td = curthread; |
---|
1322 | if ((td->td_pflags & TDP_UIOHELD) == 0 || |
---|
1323 | uio->uio_segflg != UIO_USERSPACE) |
---|
1324 | return (uiomove_fromphys(ma, offset, xfersize, uio)); |
---|
1325 | |
---|
1326 | KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); |
---|
1327 | cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; |
---|
1328 | iov_base = (vm_offset_t)uio->uio_iov->iov_base; |
---|
1329 | switch (uio->uio_rw) { |
---|
1330 | case UIO_WRITE: |
---|
1331 | pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, |
---|
1332 | offset, cnt); |
---|
1333 | break; |
---|
1334 | case UIO_READ: |
---|
1335 | pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, |
---|
1336 | cnt); |
---|
1337 | break; |
---|
1338 | } |
---|
1339 | pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); |
---|
1340 | td->td_ma += pgadv; |
---|
1341 | KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, |
---|
1342 | pgadv)); |
---|
1343 | td->td_ma_cnt -= pgadv; |
---|
1344 | uio->uio_iov->iov_base = (char *)(iov_base + cnt); |
---|
1345 | uio->uio_iov->iov_len -= cnt; |
---|
1346 | uio->uio_resid -= cnt; |
---|
1347 | uio->uio_offset += cnt; |
---|
1348 | return (0); |
---|
1349 | #else /* __rtems__ */ |
---|
1350 | return (EFAULT); |
---|
1351 | #endif /* __rtems__ */ |
---|
1352 | } |
---|
1353 | |
---|
1354 | |
---|
1355 | /* |
---|
1356 | * File table truncate routine. |
---|
1357 | */ |
---|
1358 | static int |
---|
1359 | vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, |
---|
1360 | struct thread *td) |
---|
1361 | { |
---|
1362 | struct vattr vattr; |
---|
1363 | struct mount *mp; |
---|
1364 | struct vnode *vp; |
---|
1365 | void *rl_cookie; |
---|
1366 | int error; |
---|
1367 | |
---|
1368 | vp = fp->f_vnode; |
---|
1369 | |
---|
1370 | /* |
---|
1371 | * Lock the whole range for truncation. Otherwise split i/o |
---|
1372 | * might happen partly before and partly after the truncation. |
---|
1373 | */ |
---|
1374 | rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); |
---|
1375 | error = vn_start_write(vp, &mp, V_WAIT | PCATCH); |
---|
1376 | if (error) |
---|
1377 | goto out1; |
---|
1378 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
---|
1379 | AUDIT_ARG_VNODE1(vp); |
---|
1380 | if (vp->v_type == VDIR) { |
---|
1381 | error = EISDIR; |
---|
1382 | goto out; |
---|
1383 | } |
---|
1384 | #ifdef MAC |
---|
1385 | error = mac_vnode_check_write(active_cred, fp->f_cred, vp); |
---|
1386 | if (error) |
---|
1387 | goto out; |
---|
1388 | #endif |
---|
1389 | error = VOP_ADD_WRITECOUNT(vp, 1); |
---|
1390 | if (error == 0) { |
---|
1391 | VATTR_NULL(&vattr); |
---|
1392 | vattr.va_size = length; |
---|
1393 | if ((fp->f_flag & O_FSYNC) != 0) |
---|
1394 | vattr.va_vaflags |= VA_SYNC; |
---|
1395 | error = VOP_SETATTR(vp, &vattr, fp->f_cred); |
---|
1396 | VOP_ADD_WRITECOUNT_CHECKED(vp, -1); |
---|
1397 | } |
---|
1398 | out: |
---|
1399 | VOP_UNLOCK(vp, 0); |
---|
1400 | vn_finished_write(mp); |
---|
1401 | out1: |
---|
1402 | vn_rangelock_unlock(vp, rl_cookie); |
---|
1403 | return (error); |
---|
1404 | } |
---|
1405 | |
---|
1406 | /* |
---|
1407 | * File table vnode stat routine. |
---|
1408 | */ |
---|
1409 | static int |
---|
1410 | vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, |
---|
1411 | struct thread *td) |
---|
1412 | { |
---|
1413 | struct vnode *vp = fp->f_vnode; |
---|
1414 | int error; |
---|
1415 | |
---|
1416 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
1417 | error = vn_stat(vp, sb, active_cred, fp->f_cred, td); |
---|
1418 | VOP_UNLOCK(vp, 0); |
---|
1419 | |
---|
1420 | return (error); |
---|
1421 | } |
---|
1422 | |
---|
1423 | /* |
---|
1424 | * Stat a vnode; implementation for the stat syscall |
---|
1425 | */ |
---|
1426 | int |
---|
1427 | vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, |
---|
1428 | struct ucred *file_cred, struct thread *td) |
---|
1429 | { |
---|
1430 | struct vattr vattr; |
---|
1431 | struct vattr *vap; |
---|
1432 | int error; |
---|
1433 | u_short mode; |
---|
1434 | |
---|
1435 | AUDIT_ARG_VNODE1(vp); |
---|
1436 | #ifdef MAC |
---|
1437 | error = mac_vnode_check_stat(active_cred, file_cred, vp); |
---|
1438 | if (error) |
---|
1439 | return (error); |
---|
1440 | #endif |
---|
1441 | |
---|
1442 | vap = &vattr; |
---|
1443 | |
---|
1444 | /* |
---|
1445 | * Initialize defaults for new and unusual fields, so that file |
---|
1446 | * systems which don't support these fields don't need to know |
---|
1447 | * about them. |
---|
1448 | */ |
---|
1449 | vap->va_birthtime.tv_sec = -1; |
---|
1450 | vap->va_birthtime.tv_nsec = 0; |
---|
1451 | vap->va_fsid = VNOVAL; |
---|
1452 | vap->va_rdev = NODEV; |
---|
1453 | |
---|
1454 | error = VOP_GETATTR(vp, vap, active_cred); |
---|
1455 | if (error) |
---|
1456 | return (error); |
---|
1457 | |
---|
1458 | /* |
---|
1459 | * Zero the spare stat fields |
---|
1460 | */ |
---|
1461 | bzero(sb, sizeof *sb); |
---|
1462 | |
---|
1463 | /* |
---|
1464 | * Copy from vattr table |
---|
1465 | */ |
---|
1466 | if (vap->va_fsid != VNOVAL) |
---|
1467 | sb->st_dev = vap->va_fsid; |
---|
1468 | else |
---|
1469 | sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; |
---|
1470 | sb->st_ino = vap->va_fileid; |
---|
1471 | mode = vap->va_mode; |
---|
1472 | switch (vap->va_type) { |
---|
1473 | case VREG: |
---|
1474 | mode |= S_IFREG; |
---|
1475 | break; |
---|
1476 | case VDIR: |
---|
1477 | mode |= S_IFDIR; |
---|
1478 | break; |
---|
1479 | case VBLK: |
---|
1480 | mode |= S_IFBLK; |
---|
1481 | break; |
---|
1482 | case VCHR: |
---|
1483 | mode |= S_IFCHR; |
---|
1484 | break; |
---|
1485 | case VLNK: |
---|
1486 | mode |= S_IFLNK; |
---|
1487 | break; |
---|
1488 | case VSOCK: |
---|
1489 | mode |= S_IFSOCK; |
---|
1490 | break; |
---|
1491 | case VFIFO: |
---|
1492 | mode |= S_IFIFO; |
---|
1493 | break; |
---|
1494 | default: |
---|
1495 | return (EBADF); |
---|
1496 | } |
---|
1497 | sb->st_mode = mode; |
---|
1498 | sb->st_nlink = vap->va_nlink; |
---|
1499 | sb->st_uid = vap->va_uid; |
---|
1500 | sb->st_gid = vap->va_gid; |
---|
1501 | sb->st_rdev = vap->va_rdev; |
---|
1502 | if (vap->va_size > OFF_MAX) |
---|
1503 | return (EOVERFLOW); |
---|
1504 | sb->st_size = vap->va_size; |
---|
1505 | sb->st_atim = vap->va_atime; |
---|
1506 | sb->st_mtim = vap->va_mtime; |
---|
1507 | sb->st_ctim = vap->va_ctime; |
---|
1508 | #ifndef __rtems__ |
---|
1509 | sb->st_birthtim = vap->va_birthtime; |
---|
1510 | #endif /* __rtems__ */ |
---|
1511 | |
---|
1512 | /* |
---|
1513 | * According to www.opengroup.org, the meaning of st_blksize is |
---|
1514 | * "a filesystem-specific preferred I/O block size for this |
---|
1515 | * object. In some filesystem types, this may vary from file |
---|
1516 | * to file" |
---|
1517 | * Use miminum/default of PAGE_SIZE (e.g. for VCHR). |
---|
1518 | */ |
---|
1519 | |
---|
1520 | sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); |
---|
1521 | |
---|
1522 | #ifndef __rtems__ |
---|
1523 | sb->st_flags = vap->va_flags; |
---|
1524 | if (priv_check(td, PRIV_VFS_GENERATION)) |
---|
1525 | sb->st_gen = 0; |
---|
1526 | else |
---|
1527 | sb->st_gen = vap->va_gen; |
---|
1528 | #endif /* __rtems__ */ |
---|
1529 | |
---|
1530 | sb->st_blocks = vap->va_bytes / S_BLKSIZE; |
---|
1531 | return (0); |
---|
1532 | } |
---|
1533 | |
---|
1534 | /* |
---|
1535 | * File table vnode ioctl routine. |
---|
1536 | */ |
---|
1537 | static int |
---|
1538 | vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, |
---|
1539 | struct thread *td) |
---|
1540 | { |
---|
1541 | struct vattr vattr; |
---|
1542 | struct vnode *vp; |
---|
1543 | struct fiobmap2_arg *bmarg; |
---|
1544 | int error; |
---|
1545 | |
---|
1546 | vp = fp->f_vnode; |
---|
1547 | switch (vp->v_type) { |
---|
1548 | case VDIR: |
---|
1549 | case VREG: |
---|
1550 | switch (com) { |
---|
1551 | case FIONREAD: |
---|
1552 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
1553 | error = VOP_GETATTR(vp, &vattr, active_cred); |
---|
1554 | VOP_UNLOCK(vp, 0); |
---|
1555 | if (error == 0) |
---|
1556 | *(int *)data = vattr.va_size - fp->f_offset; |
---|
1557 | return (error); |
---|
1558 | #ifndef __rtems__ |
---|
1559 | case FIOBMAP2: |
---|
1560 | bmarg = (struct fiobmap2_arg *)data; |
---|
1561 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
1562 | #ifdef MAC |
---|
1563 | error = mac_vnode_check_read(active_cred, fp->f_cred, |
---|
1564 | vp); |
---|
1565 | if (error == 0) |
---|
1566 | #endif |
---|
1567 | error = VOP_BMAP(vp, bmarg->bn, NULL, |
---|
1568 | &bmarg->bn, &bmarg->runp, &bmarg->runb); |
---|
1569 | VOP_UNLOCK(vp, 0); |
---|
1570 | return (error); |
---|
1571 | #endif /* __rtems__ */ |
---|
1572 | case FIONBIO: |
---|
1573 | case FIOASYNC: |
---|
1574 | return (0); |
---|
1575 | default: |
---|
1576 | return (VOP_IOCTL(vp, com, data, fp->f_flag, |
---|
1577 | active_cred, td)); |
---|
1578 | } |
---|
1579 | break; |
---|
1580 | case VCHR: |
---|
1581 | return (VOP_IOCTL(vp, com, data, fp->f_flag, |
---|
1582 | active_cred, td)); |
---|
1583 | default: |
---|
1584 | return (ENOTTY); |
---|
1585 | } |
---|
1586 | } |
---|
1587 | |
---|
1588 | /* |
---|
1589 | * File table vnode poll routine. |
---|
1590 | */ |
---|
1591 | static int |
---|
1592 | vn_poll(struct file *fp, int events, struct ucred *active_cred, |
---|
1593 | struct thread *td) |
---|
1594 | { |
---|
1595 | struct vnode *vp; |
---|
1596 | int error; |
---|
1597 | |
---|
1598 | vp = fp->f_vnode; |
---|
1599 | #ifdef MAC |
---|
1600 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
---|
1601 | AUDIT_ARG_VNODE1(vp); |
---|
1602 | error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); |
---|
1603 | VOP_UNLOCK(vp, 0); |
---|
1604 | if (!error) |
---|
1605 | #endif |
---|
1606 | |
---|
1607 | error = VOP_POLL(vp, events, fp->f_cred, td); |
---|
1608 | return (error); |
---|
1609 | } |
---|
1610 | |
---|
1611 | /* |
---|
1612 | * Acquire the requested lock and then check for validity. LK_RETRY |
---|
1613 | * permits vn_lock to return doomed vnodes. |
---|
1614 | */ |
---|
1615 | int |
---|
1616 | _vn_lock(struct vnode *vp, int flags, char *file, int line) |
---|
1617 | { |
---|
1618 | int error; |
---|
1619 | |
---|
1620 | VNASSERT((flags & LK_TYPE_MASK) != 0, vp, |
---|
1621 | ("vn_lock: no locktype")); |
---|
1622 | VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count")); |
---|
1623 | retry: |
---|
1624 | error = VOP_LOCK1(vp, flags, file, line); |
---|
1625 | flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */ |
---|
1626 | KASSERT((flags & LK_RETRY) == 0 || error == 0, |
---|
1627 | ("vn_lock: error %d incompatible with flags %#x", error, flags)); |
---|
1628 | |
---|
1629 | if ((flags & LK_RETRY) == 0) { |
---|
1630 | if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { |
---|
1631 | VOP_UNLOCK(vp, 0); |
---|
1632 | error = ENOENT; |
---|
1633 | } |
---|
1634 | } else if (error != 0) |
---|
1635 | goto retry; |
---|
1636 | return (error); |
---|
1637 | } |
---|
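/*
 * Usage sketch (editorial addition, not part of the original source):
 * without LK_RETRY a caller must be prepared for vn_lock() to fail with
 * ENOENT on a doomed (forcibly unmounted) vnode; with LK_RETRY the lock
 * always succeeds and the caller checks VI_DOOMED itself if it cares:
 *
 *	if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0)
 *		return (error);			// doomed vnode: ENOENT
 *
 * versus
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	// does not fail
 *	if ((vp->v_iflag & VI_DOOMED) != 0) {
 *		VOP_UNLOCK(vp, 0);
 *		return (ENOENT);
 *	}
 */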
1638 | |
---|
1639 | /* |
---|
1640 | * File table vnode close routine. |
---|
1641 | */ |
---|
1642 | static int |
---|
1643 | vn_closefile(struct file *fp, struct thread *td) |
---|
1644 | { |
---|
1645 | struct vnode *vp; |
---|
1646 | struct flock lf; |
---|
1647 | int error; |
---|
1648 | bool ref; |
---|
1649 | |
---|
1650 | vp = fp->f_vnode; |
---|
1651 | fp->f_ops = &badfileops; |
---|
1652 | #ifndef __rtems__ |
---|
1653 | ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; |
---|
1654 | #else /* __rtems__ */ |
---|
1655 | ref = false; |
---|
1656 | #endif /* __rtems__ */ |
---|
1657 | |
---|
1658 | error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); |
---|
1659 | |
---|
1660 | if (__predict_false(ref)) { |
---|
1661 | lf.l_whence = SEEK_SET; |
---|
1662 | lf.l_start = 0; |
---|
1663 | lf.l_len = 0; |
---|
1664 | lf.l_type = F_UNLCK; |
---|
1665 | (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); |
---|
1666 | vrele(vp); |
---|
1667 | } |
---|
1668 | return (error); |
---|
1669 | } |
---|
1670 | |
---|
1671 | static bool |
---|
1672 | vn_suspendable(struct mount *mp) |
---|
1673 | { |
---|
1674 | |
---|
1675 | return (mp->mnt_op->vfs_susp_clean != NULL); |
---|
1676 | } |
---|
1677 | |
---|
1678 | /* |
---|
1679 | * Preparing to start a filesystem write operation. If the operation is |
---|
1680 | * permitted, then we bump the count of operations in progress and |
---|
1681 | * proceed. If a suspend request is in progress, we wait until the |
---|
1682 | * suspension is over, and then proceed. |
---|
1683 | */ |
---|
1684 | static int |
---|
1685 | vn_start_write_locked(struct mount *mp, int flags) |
---|
1686 | { |
---|
1687 | int error, mflags; |
---|
1688 | |
---|
1689 | mtx_assert(MNT_MTX(mp), MA_OWNED); |
---|
1690 | error = 0; |
---|
1691 | |
---|
1692 | /* |
---|
1693 | * Check on status of suspension. |
---|
1694 | */ |
---|
1695 | #ifndef __rtems__ |
---|
1696 | if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || |
---|
1697 | #else /* __rtems__ */ |
---|
1698 | if ( |
---|
1699 | #endif /* __rtems__ */ |
---|
1700 | mp->mnt_susp_owner != curthread) { |
---|
1701 | mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? |
---|
1702 | (flags & PCATCH) : 0) | (PUSER - 1); |
---|
1703 | while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { |
---|
1704 | if (flags & V_NOWAIT) { |
---|
1705 | error = EWOULDBLOCK; |
---|
1706 | goto unlock; |
---|
1707 | } |
---|
1708 | error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, |
---|
1709 | "suspfs", 0); |
---|
1710 | if (error) |
---|
1711 | goto unlock; |
---|
1712 | } |
---|
1713 | } |
---|
1714 | if (flags & V_XSLEEP) |
---|
1715 | goto unlock; |
---|
1716 | mp->mnt_writeopcount++; |
---|
1717 | unlock: |
---|
1718 | if (error != 0 || (flags & V_XSLEEP) != 0) |
---|
1719 | MNT_REL(mp); |
---|
1720 | MNT_IUNLOCK(mp); |
---|
1721 | return (error); |
---|
1722 | } |
---|
1723 | |
---|
1724 | int |
---|
1725 | vn_start_write(struct vnode *vp, struct mount **mpp, int flags) |
---|
1726 | { |
---|
1727 | struct mount *mp; |
---|
1728 | int error; |
---|
1729 | |
---|
1730 | KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), |
---|
1731 | ("V_MNTREF requires mp")); |
---|
1732 | |
---|
1733 | error = 0; |
---|
1734 | /* |
---|
1735 | * If a vnode is provided, get and return the mount point |
---|
1736 | * to which it will write. |
---|
1737 | */ |
---|
1738 | if (vp != NULL) { |
---|
1739 | if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { |
---|
1740 | *mpp = NULL; |
---|
1741 | if (error != EOPNOTSUPP) |
---|
1742 | return (error); |
---|
1743 | return (0); |
---|
1744 | } |
---|
1745 | } |
---|
1746 | if ((mp = *mpp) == NULL) |
---|
1747 | return (0); |
---|
1748 | |
---|
1749 | if (!vn_suspendable(mp)) { |
---|
1750 | if (vp != NULL || (flags & V_MNTREF) != 0) |
---|
1751 | vfs_rel(mp); |
---|
1752 | return (0); |
---|
1753 | } |
---|
1754 | |
---|
1755 | /* |
---|
1756 | * VOP_GETWRITEMOUNT() returns with the mp refcount held through |
---|
1757 | * a vfs_ref(). |
---|
1758 | * As long as a vnode is not provided we need to acquire a |
---|
1759 | * refcount for the provided mountpoint too, in order to |
---|
1760 | * emulate a vfs_ref(). |
---|
1761 | */ |
---|
1762 | MNT_ILOCK(mp); |
---|
1763 | if (vp == NULL && (flags & V_MNTREF) == 0) |
---|
1764 | MNT_REF(mp); |
---|
1765 | |
---|
1766 | return (vn_start_write_locked(mp, flags)); |
---|
1767 | } |
---|
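/*
 * Usage sketch (editorial addition, not part of the original source):
 * the conventional pairing of vn_start_write() with vn_finished_write()
 * around a write-side vnode operation, reduced to the essentials;
 * vn_extattr_set() below follows the same pattern:
 *
 *	struct mount *mp;
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_SETATTR(vp, &vattr, cred);	// any modifying VOP
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */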
1768 | |
---|
1769 | /* |
---|
1770 | * Secondary suspension. Used by operations such as vop_inactive |
---|
1771 | * routines that are needed by the higher level functions. These |
---|
1772 | * are allowed to proceed until all the higher level functions have |
---|
1773 | * completed (indicated by mnt_writeopcount dropping to zero). At that |
---|
1774 | * time, these operations are halted until the suspension is over. |
---|
1775 | */ |
---|
1776 | int |
---|
1777 | vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) |
---|
1778 | { |
---|
1779 | struct mount *mp; |
---|
1780 | int error; |
---|
1781 | |
---|
1782 | KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), |
---|
1783 | ("V_MNTREF requires mp")); |
---|
1784 | |
---|
1785 | retry: |
---|
1786 | if (vp != NULL) { |
---|
1787 | if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { |
---|
1788 | *mpp = NULL; |
---|
1789 | if (error != EOPNOTSUPP) |
---|
1790 | return (error); |
---|
1791 | return (0); |
---|
1792 | } |
---|
1793 | } |
---|
1794 | /* |
---|
1795 | * If we are not suspended or have not yet reached suspended |
---|
1796 | * mode, then let the operation proceed. |
---|
1797 | */ |
---|
1798 | if ((mp = *mpp) == NULL) |
---|
1799 | return (0); |
---|
1800 | |
---|
1801 | if (!vn_suspendable(mp)) { |
---|
1802 | if (vp != NULL || (flags & V_MNTREF) != 0) |
---|
1803 | vfs_rel(mp); |
---|
1804 | return (0); |
---|
1805 | } |
---|
1806 | |
---|
1807 | /* |
---|
1808 | * VOP_GETWRITEMOUNT() returns with the mp refcount held through |
---|
1809 | * a vfs_ref(). |
---|
1810 | * As long as a vnode is not provided we need to acquire a |
---|
1811 | * refcount for the provided mountpoint too, in order to |
---|
1812 | * emulate a vfs_ref(). |
---|
1813 | */ |
---|
1814 | MNT_ILOCK(mp); |
---|
1815 | if (vp == NULL && (flags & V_MNTREF) == 0) |
---|
1816 | MNT_REF(mp); |
---|
1817 | if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { |
---|
1818 | mp->mnt_secondary_writes++; |
---|
1819 | mp->mnt_secondary_accwrites++; |
---|
1820 | MNT_IUNLOCK(mp); |
---|
1821 | return (0); |
---|
1822 | } |
---|
1823 | if (flags & V_NOWAIT) { |
---|
1824 | MNT_REL(mp); |
---|
1825 | MNT_IUNLOCK(mp); |
---|
1826 | return (EWOULDBLOCK); |
---|
1827 | } |
---|
1828 | /* |
---|
1829 | * Wait for the suspension to finish. |
---|
1830 | */ |
---|
1831 | error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | |
---|
1832 | ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), |
---|
1833 | "suspfs", 0); |
---|
1834 | vfs_rel(mp); |
---|
1835 | if (error == 0) |
---|
1836 | goto retry; |
---|
1837 | return (error); |
---|
1838 | } |
---|
1839 | |
---|
1840 | /* |
---|
1841 | * Filesystem write operation has completed. If we are suspending and this |
---|
1842 | * operation is the last one, notify the suspender that the suspension is |
---|
1843 | * now in effect. |
---|
1844 | */ |
---|
1845 | void |
---|
1846 | vn_finished_write(struct mount *mp) |
---|
1847 | { |
---|
1848 | if (mp == NULL || !vn_suspendable(mp)) |
---|
1849 | return; |
---|
1850 | MNT_ILOCK(mp); |
---|
1851 | MNT_REL(mp); |
---|
1852 | mp->mnt_writeopcount--; |
---|
1853 | if (mp->mnt_writeopcount < 0) |
---|
1854 | panic("vn_finished_write: neg cnt"); |
---|
1855 | if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && |
---|
1856 | mp->mnt_writeopcount <= 0) |
---|
1857 | wakeup(&mp->mnt_writeopcount); |
---|
1858 | MNT_IUNLOCK(mp); |
---|
1859 | } |
---|
1860 | |
---|
1862 | /* |
---|
1863 | * Filesystem secondary write operation has completed. If we are |
---|
1864 | * suspending and this operation is the last one, notify the suspender |
---|
1865 | * that the suspension is now in effect. |
---|
1866 | */ |
---|
1867 | void |
---|
1868 | vn_finished_secondary_write(struct mount *mp) |
---|
1869 | { |
---|
1870 | if (mp == NULL || !vn_suspendable(mp)) |
---|
1871 | return; |
---|
1872 | MNT_ILOCK(mp); |
---|
1873 | MNT_REL(mp); |
---|
1874 | mp->mnt_secondary_writes--; |
---|
1875 | if (mp->mnt_secondary_writes < 0) |
---|
1876 | panic("vn_finished_secondary_write: neg cnt"); |
---|
1877 | if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && |
---|
1878 | mp->mnt_secondary_writes <= 0) |
---|
1879 | wakeup(&mp->mnt_secondary_writes); |
---|
1880 | MNT_IUNLOCK(mp); |
---|
1881 | } |
---|
1882 | |
---|
1885 | /* |
---|
1886 | * Request a filesystem to suspend write operations. |
---|
1887 | */ |
---|
1888 | int |
---|
1889 | vfs_write_suspend(struct mount *mp, int flags) |
---|
1890 | { |
---|
1891 | int error; |
---|
1892 | |
---|
1893 | MPASS(vn_suspendable(mp)); |
---|
1894 | |
---|
1895 | MNT_ILOCK(mp); |
---|
1896 | if (mp->mnt_susp_owner == curthread) { |
---|
1897 | MNT_IUNLOCK(mp); |
---|
1898 | return (EALREADY); |
---|
1899 | } |
---|
1900 | while (mp->mnt_kern_flag & MNTK_SUSPEND) |
---|
1901 | msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); |
---|
1902 | |
---|
1903 | /* |
---|
1904 | * Unmount holds a write reference on the mount point. If we |
---|
1905 | * own a busy reference and drain for writers, we deadlock with |
---|
1906 | * the reference draining in the unmount path. Callers of |
---|
1907 | * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if a |
---|
1908 | * vfs_busy() reference is owned and the caller is not in the |
---|
1909 | * unmount context. |
---|
1910 | */ |
---|
1911 | if ((flags & VS_SKIP_UNMOUNT) != 0 && |
---|
1912 | (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { |
---|
1913 | MNT_IUNLOCK(mp); |
---|
1914 | return (EBUSY); |
---|
1915 | } |
---|
1916 | |
---|
1917 | mp->mnt_kern_flag |= MNTK_SUSPEND; |
---|
1918 | mp->mnt_susp_owner = curthread; |
---|
1919 | if (mp->mnt_writeopcount > 0) |
---|
1920 | (void) msleep(&mp->mnt_writeopcount, |
---|
1921 | MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); |
---|
1922 | else |
---|
1923 | MNT_IUNLOCK(mp); |
---|
1924 | if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) |
---|
1925 | vfs_write_resume(mp, 0); |
---|
1926 | return (error); |
---|
1927 | } |
---|
1928 | |
---|
1929 | /* |
---|
1930 | * Request a filesystem to resume write operations. |
---|
1931 | */ |
---|
1932 | void |
---|
1933 | vfs_write_resume(struct mount *mp, int flags) |
---|
1934 | { |
---|
1935 | |
---|
1936 | MPASS(vn_suspendable(mp)); |
---|
1937 | |
---|
1938 | MNT_ILOCK(mp); |
---|
1939 | if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { |
---|
1940 | KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); |
---|
1941 | mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | |
---|
1942 | MNTK_SUSPENDED); |
---|
1943 | mp->mnt_susp_owner = NULL; |
---|
1944 | wakeup(&mp->mnt_writeopcount); |
---|
1945 | wakeup(&mp->mnt_flag); |
---|
1946 | #ifndef __rtems__ |
---|
1947 | curthread->td_pflags &= ~TDP_IGNSUSP; |
---|
1948 | #endif /* __rtems__ */ |
---|
1949 | if ((flags & VR_START_WRITE) != 0) { |
---|
1950 | MNT_REF(mp); |
---|
1951 | mp->mnt_writeopcount++; |
---|
1952 | } |
---|
1953 | MNT_IUNLOCK(mp); |
---|
1954 | if ((flags & VR_NO_SUSPCLR) == 0) |
---|
1955 | VFS_SUSP_CLEAN(mp); |
---|
1956 | } else if ((flags & VR_START_WRITE) != 0) { |
---|
1957 | MNT_REF(mp); |
---|
1958 | vn_start_write_locked(mp, 0); |
---|
1959 | } else { |
---|
1960 | MNT_IUNLOCK(mp); |
---|
1961 | } |
---|
1962 | } |
---|
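/*
 * Usage sketch (editorial addition, not part of the original source):
 * a caller that needs the filesystem quiescent (a snapshot-style
 * operation, for example) brackets its work with the two routines above:
 *
 *	error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
 *	if (error != 0)
 *		return (error);
 *	// all primary writers are drained here
 *	vfs_write_resume(mp, 0);
 *
 * Passing VR_START_WRITE to vfs_write_resume() additionally leaves the
 * caller holding a write reference, as if it had called vn_start_write().
 */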
1963 | |
---|
1964 | /* |
---|
1965 | * Helper loop around vfs_write_suspend() for filesystem unmount VFS |
---|
1966 | * methods. |
---|
1967 | */ |
---|
1968 | int |
---|
1969 | vfs_write_suspend_umnt(struct mount *mp) |
---|
1970 | { |
---|
1971 | int error; |
---|
1972 | |
---|
1973 | MPASS(vn_suspendable(mp)); |
---|
1974 | KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, |
---|
1975 | ("vfs_write_suspend_umnt: recursed")); |
---|
1976 | |
---|
1977 | /* dounmount() already called vn_start_write(). */ |
---|
1978 | for (;;) { |
---|
1979 | vn_finished_write(mp); |
---|
1980 | error = vfs_write_suspend(mp, 0); |
---|
1981 | if (error != 0) { |
---|
1982 | vn_start_write(NULL, &mp, V_WAIT); |
---|
1983 | return (error); |
---|
1984 | } |
---|
1985 | MNT_ILOCK(mp); |
---|
1986 | if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) |
---|
1987 | break; |
---|
1988 | MNT_IUNLOCK(mp); |
---|
1989 | vn_start_write(NULL, &mp, V_WAIT); |
---|
1990 | } |
---|
1991 | mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); |
---|
1992 | wakeup(&mp->mnt_flag); |
---|
1993 | MNT_IUNLOCK(mp); |
---|
1994 | #ifndef __rtems__ |
---|
1995 | curthread->td_pflags |= TDP_IGNSUSP; |
---|
1996 | #endif /* __rtems__ */ |
---|
1997 | return (0); |
---|
1998 | } |
---|
1999 | |
---|
2000 | /* |
---|
2001 | * Implement kqueues for files by translating them to the corresponding vnode operation. |
---|
2002 | */ |
---|
2003 | static int |
---|
2004 | vn_kqfilter(struct file *fp, struct knote *kn) |
---|
2005 | { |
---|
2006 | |
---|
2007 | return (VOP_KQFILTER(fp->f_vnode, kn)); |
---|
2008 | } |
---|
2009 | |
---|
2010 | /* |
---|
2011 | * Simplified in-kernel wrapper calls for extended attribute access. |
---|
2012 | * These calls pass in a NULL credential, authorizing as "kernel" access. |
---|
2013 | * Set IO_NODELOCKED in ioflg if the vnode is already locked. |
---|
2014 | */ |
---|
2015 | int |
---|
2016 | vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, |
---|
2017 | const char *attrname, int *buflen, char *buf, struct thread *td) |
---|
2018 | { |
---|
2019 | struct uio auio; |
---|
2020 | struct iovec iov; |
---|
2021 | int error; |
---|
2022 | |
---|
2023 | iov.iov_len = *buflen; |
---|
2024 | iov.iov_base = buf; |
---|
2025 | |
---|
2026 | auio.uio_iov = &iov; |
---|
2027 | auio.uio_iovcnt = 1; |
---|
2028 | auio.uio_rw = UIO_READ; |
---|
2029 | auio.uio_segflg = UIO_SYSSPACE; |
---|
2030 | auio.uio_td = td; |
---|
2031 | auio.uio_offset = 0; |
---|
2032 | auio.uio_resid = *buflen; |
---|
2033 | |
---|
2034 | if ((ioflg & IO_NODELOCKED) == 0) |
---|
2035 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
2036 | |
---|
2037 | ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); |
---|
2038 | |
---|
2039 | /* authorize attribute retrieval as kernel */ |
---|
2040 | error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, |
---|
2041 | td); |
---|
2042 | |
---|
2043 | if ((ioflg & IO_NODELOCKED) == 0) |
---|
2044 | VOP_UNLOCK(vp, 0); |
---|
2045 | |
---|
2046 | if (error == 0) { |
---|
2047 | *buflen = *buflen - auio.uio_resid; |
---|
2048 | } |
---|
2049 | |
---|
2050 | return (error); |
---|
2051 | } |
---|
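/*
 * Usage sketch (editorial addition, not part of the original source):
 * reading an attribute into a stack buffer with the wrapper above.  The
 * attribute name is only an example; *buflen is the buffer size on input
 * and the number of bytes actually read on output:
 *
 *	char buf[64];
 *	int buflen = sizeof(buf);
 *
 *	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
 *	    "posix1e.acl_access", &buflen, buf, td);
 *	// IO_NODELOCKED: vp must already be locked by the caller
 */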
2052 | |
---|
2053 | /* |
---|
2054 | * XXX failure mode if partially written? |
---|
2055 | */ |
---|
2056 | int |
---|
2057 | vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, |
---|
2058 | const char *attrname, int buflen, char *buf, struct thread *td) |
---|
2059 | { |
---|
2060 | struct uio auio; |
---|
2061 | struct iovec iov; |
---|
2062 | struct mount *mp; |
---|
2063 | int error; |
---|
2064 | |
---|
2065 | iov.iov_len = buflen; |
---|
2066 | iov.iov_base = buf; |
---|
2067 | |
---|
2068 | auio.uio_iov = &iov; |
---|
2069 | auio.uio_iovcnt = 1; |
---|
2070 | auio.uio_rw = UIO_WRITE; |
---|
2071 | auio.uio_segflg = UIO_SYSSPACE; |
---|
2072 | auio.uio_td = td; |
---|
2073 | auio.uio_offset = 0; |
---|
2074 | auio.uio_resid = buflen; |
---|
2075 | |
---|
2076 | if ((ioflg & IO_NODELOCKED) == 0) { |
---|
2077 | if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) |
---|
2078 | return (error); |
---|
2079 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
---|
2080 | } |
---|
2081 | |
---|
2082 | ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); |
---|
2083 | |
---|
2084 | /* authorize attribute setting as kernel */ |
---|
2085 | error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); |
---|
2086 | |
---|
2087 | if ((ioflg & IO_NODELOCKED) == 0) { |
---|
2088 | vn_finished_write(mp); |
---|
2089 | VOP_UNLOCK(vp, 0); |
---|
2090 | } |
---|
2091 | |
---|
2092 | return (error); |
---|
2093 | } |
---|
2094 | |
---|
2095 | int |
---|
2096 | vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, |
---|
2097 | const char *attrname, struct thread *td) |
---|
2098 | { |
---|
2099 | struct mount *mp; |
---|
2100 | int error; |
---|
2101 | |
---|
2102 | if ((ioflg & IO_NODELOCKED) == 0) { |
---|
2103 | if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) |
---|
2104 | return (error); |
---|
2105 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
---|
2106 | } |
---|
2107 | |
---|
2108 | ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); |
---|
2109 | |
---|
2110 | /* authorize attribute removal as kernel */ |
---|
2111 | error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); |
---|
2112 | if (error == EOPNOTSUPP) |
---|
2113 | error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, |
---|
2114 | NULL, td); |
---|
2115 | |
---|
2116 | if ((ioflg & IO_NODELOCKED) == 0) { |
---|
2117 | vn_finished_write(mp); |
---|
2118 | VOP_UNLOCK(vp, 0); |
---|
2119 | } |
---|
2120 | |
---|
2121 | return (error); |
---|
2122 | } |
---|
2123 | |
---|
2124 | static int |
---|
2125 | vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, |
---|
2126 | struct vnode **rvp) |
---|
2127 | { |
---|
2128 | |
---|
2129 | return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); |
---|
2130 | } |
---|
2131 | |
---|
2132 | int |
---|
2133 | vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) |
---|
2134 | { |
---|
2135 | |
---|
2136 | return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, |
---|
2137 | lkflags, rvp)); |
---|
2138 | } |
---|
2139 | |
---|
2140 | int |
---|
2141 | vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, |
---|
2142 | int lkflags, struct vnode **rvp) |
---|
2143 | { |
---|
2144 | struct mount *mp; |
---|
2145 | int ltype, error; |
---|
2146 | |
---|
2147 | ASSERT_VOP_LOCKED(vp, "vn_vget_ino_gen"); |
---|
2148 | mp = vp->v_mount; |
---|
2149 | ltype = VOP_ISLOCKED(vp); |
---|
2150 | KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, |
---|
2151 | ("vn_vget_ino: vp not locked")); |
---|
2152 | error = vfs_busy(mp, MBF_NOWAIT); |
---|
2153 | if (error != 0) { |
---|
2154 | vfs_ref(mp); |
---|
2155 | VOP_UNLOCK(vp, 0); |
---|
2156 | error = vfs_busy(mp, 0); |
---|
2157 | vn_lock(vp, ltype | LK_RETRY); |
---|
2158 | vfs_rel(mp); |
---|
2159 | if (error != 0) |
---|
2160 | return (ENOENT); |
---|
2161 | if (vp->v_iflag & VI_DOOMED) { |
---|
2162 | vfs_unbusy(mp); |
---|
2163 | return (ENOENT); |
---|
2164 | } |
---|
2165 | } |
---|
2166 | VOP_UNLOCK(vp, 0); |
---|
2167 | error = alloc(mp, alloc_arg, lkflags, rvp); |
---|
2168 | vfs_unbusy(mp); |
---|
2169 | if (error != 0 || *rvp != vp) |
---|
2170 | vn_lock(vp, ltype | LK_RETRY); |
---|
2171 | if (vp->v_iflag & VI_DOOMED) { |
---|
2172 | if (error == 0) { |
---|
2173 | if (*rvp == vp) |
---|
2174 | vunref(vp); |
---|
2175 | else |
---|
2176 | vput(*rvp); |
---|
2177 | } |
---|
2178 | error = ENOENT; |
---|
2179 | } |
---|
2180 | return (error); |
---|
2181 | } |
---|
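/*
 * Usage sketch (editorial addition, not part of the original source):
 * a filesystem typically uses vn_vget_ino() when it holds one vnode
 * locked and needs a second one from the same mount, e.g. resolving ".."
 * during lookup, where calling VFS_VGET() directly could violate lock
 * order.  The field holding the parent inode number is hypothetical:
 *
 *	// dvp is locked; dp->i_parent is the parent's inode number
 *	error = vn_vget_ino(dvp, dp->i_parent, LK_EXCLUSIVE, &pvp);
 *
 * On return dvp is locked again, and ENOENT is reported if it was doomed
 * while temporarily unlocked.
 */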
2182 | |
---|
2183 | int |
---|
2184 | vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, |
---|
2185 | struct thread *td) |
---|
2186 | { |
---|
2187 | |
---|
2188 | if (vp->v_type != VREG || td == NULL) |
---|
2189 | return (0); |
---|
2190 | if ((uoff_t)uio->uio_offset + uio->uio_resid > |
---|
2191 | lim_cur(td, RLIMIT_FSIZE)) { |
---|
2192 | #ifndef __rtems__ |
---|
2193 | PROC_LOCK(td->td_proc); |
---|
2194 | kern_psignal(td->td_proc, SIGXFSZ); |
---|
2195 | PROC_UNLOCK(td->td_proc); |
---|
2196 | #endif /* __rtems__ */ |
---|
2197 | return (EFBIG); |
---|
2198 | } |
---|
2199 | return (0); |
---|
2200 | } |
---|
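/*
 * Usage sketch (editorial addition, not part of the original source):
 * write paths call the helper above before issuing the VOP so that an
 * over-limit write is refused up front (with SIGXFSZ delivered on
 * non-RTEMS builds) instead of partially completing:
 *
 *	error = vn_rlimit_fsize(vp, uio, td);
 *	if (error != 0)
 *		return (error);		// EFBIG: would exceed RLIMIT_FSIZE
 *	error = VOP_WRITE(vp, uio, ioflag, cred);
 */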
2201 | |
---|
2202 | int |
---|
2203 | vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, |
---|
2204 | struct thread *td) |
---|
2205 | { |
---|
2206 | struct vnode *vp; |
---|
2207 | |
---|
2208 | vp = fp->f_vnode; |
---|
2209 | #ifdef AUDIT |
---|
2210 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
2211 | AUDIT_ARG_VNODE1(vp); |
---|
2212 | VOP_UNLOCK(vp, 0); |
---|
2213 | #endif |
---|
2214 | return (setfmode(td, active_cred, vp, mode)); |
---|
2215 | } |
---|
2216 | |
---|
2217 | int |
---|
2218 | vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, |
---|
2219 | struct thread *td) |
---|
2220 | { |
---|
2221 | struct vnode *vp; |
---|
2222 | |
---|
2223 | vp = fp->f_vnode; |
---|
2224 | #ifdef AUDIT |
---|
2225 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
2226 | AUDIT_ARG_VNODE1(vp); |
---|
2227 | VOP_UNLOCK(vp, 0); |
---|
2228 | #endif |
---|
2229 | return (setfown(td, active_cred, vp, uid, gid)); |
---|
2230 | } |
---|
2231 | |
---|
2232 | void |
---|
2233 | vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) |
---|
2234 | { |
---|
2235 | #ifndef __rtems__ |
---|
2236 | vm_object_t object; |
---|
2237 | |
---|
2238 | if ((object = vp->v_object) == NULL) |
---|
2239 | return; |
---|
2240 | VM_OBJECT_WLOCK(object); |
---|
2241 | vm_object_page_remove(object, start, end, 0); |
---|
2242 | VM_OBJECT_WUNLOCK(object); |
---|
2243 | #endif /* __rtems__ */ |
---|
2244 | } |
---|
2245 | |
---|
2246 | int |
---|
2247 | vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) |
---|
2248 | { |
---|
2249 | struct vattr va; |
---|
2250 | daddr_t bn, bnp; |
---|
2251 | uint64_t bsize; |
---|
2252 | off_t noff; |
---|
2253 | int error; |
---|
2254 | |
---|
2255 | KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, |
---|
2256 | ("Wrong command %lu", cmd)); |
---|
2257 | |
---|
2258 | if (vn_lock(vp, LK_SHARED) != 0) |
---|
2259 | return (EBADF); |
---|
2260 | if (vp->v_type != VREG) { |
---|
2261 | error = ENOTTY; |
---|
2262 | goto unlock; |
---|
2263 | } |
---|
2264 | error = VOP_GETATTR(vp, &va, cred); |
---|
2265 | if (error != 0) |
---|
2266 | goto unlock; |
---|
2267 | noff = *off; |
---|
2268 | if (noff >= va.va_size) { |
---|
2269 | error = ENXIO; |
---|
2270 | goto unlock; |
---|
2271 | } |
---|
2272 | bsize = vp->v_mount->mnt_stat.f_iosize; |
---|
2273 | for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize - |
---|
2274 | noff % bsize) { |
---|
2275 | error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); |
---|
2276 | if (error == EOPNOTSUPP) { |
---|
2277 | error = ENOTTY; |
---|
2278 | goto unlock; |
---|
2279 | } |
---|
2280 | if ((bnp == -1 && cmd == FIOSEEKHOLE) || |
---|
2281 | (bnp != -1 && cmd == FIOSEEKDATA)) { |
---|
2282 | noff = bn * bsize; |
---|
2283 | if (noff < *off) |
---|
2284 | noff = *off; |
---|
2285 | goto unlock; |
---|
2286 | } |
---|
2287 | } |
---|
2288 | if (noff > va.va_size) |
---|
2289 | noff = va.va_size; |
---|
2290 | /* noff == va.va_size. There is an implicit hole at the end of file. */ |
---|
2291 | if (cmd == FIOSEEKDATA) |
---|
2292 | error = ENXIO; |
---|
2293 | unlock: |
---|
2294 | VOP_UNLOCK(vp, 0); |
---|
2295 | if (error == 0) |
---|
2296 | *off = noff; |
---|
2297 | return (error); |
---|
2298 | } |
---|
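/*
 * Usage sketch (editorial addition, not part of the original source):
 * a filesystem whose only notion of a hole is an unallocated block can
 * service the FIOSEEKHOLE/FIOSEEKDATA ioctls by delegating to the helper
 * above from its VOP_IOCTL method, roughly:
 *
 *	case FIOSEEKDATA:
 *	case FIOSEEKHOLE:
 *		error = vn_bmap_seekhole(vp, ap->a_command,
 *		    (off_t *)ap->a_data, ap->a_cred);
 *		break;
 *
 * vn_seek() below reaches this path for SEEK_HOLE/SEEK_DATA via fo_ioctl().
 */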
2299 | |
---|
2300 | int |
---|
2301 | vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) |
---|
2302 | { |
---|
2303 | struct ucred *cred; |
---|
2304 | struct vnode *vp; |
---|
2305 | struct vattr vattr; |
---|
2306 | off_t foffset, size; |
---|
2307 | int error, noneg; |
---|
2308 | |
---|
2309 | cred = td->td_ucred; |
---|
2310 | vp = fp->f_vnode; |
---|
2311 | foffset = foffset_lock(fp, 0); |
---|
2312 | noneg = (vp->v_type != VCHR); |
---|
2313 | error = 0; |
---|
2314 | switch (whence) { |
---|
2315 | case L_INCR: |
---|
2316 | if (noneg && |
---|
2317 | (foffset < 0 || |
---|
2318 | (offset > 0 && foffset > OFF_MAX - offset))) { |
---|
2319 | error = EOVERFLOW; |
---|
2320 | break; |
---|
2321 | } |
---|
2322 | offset += foffset; |
---|
2323 | break; |
---|
2324 | case L_XTND: |
---|
2325 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
2326 | error = VOP_GETATTR(vp, &vattr, cred); |
---|
2327 | VOP_UNLOCK(vp, 0); |
---|
2328 | if (error) |
---|
2329 | break; |
---|
2330 | |
---|
2331 | /* |
---|
2332 | * If the file references a disk device, then fetch |
---|
2333 | * the media size and use that to determine the ending |
---|
2334 | * offset. |
---|
2335 | */ |
---|
2336 | if (vattr.va_size == 0 && vp->v_type == VCHR && |
---|
2337 | fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) |
---|
2338 | vattr.va_size = size; |
---|
2339 | if (noneg && |
---|
2340 | (vattr.va_size > OFF_MAX || |
---|
2341 | (offset > 0 && vattr.va_size > OFF_MAX - offset))) { |
---|
2342 | error = EOVERFLOW; |
---|
2343 | break; |
---|
2344 | } |
---|
2345 | offset += vattr.va_size; |
---|
2346 | break; |
---|
2347 | case L_SET: |
---|
2348 | break; |
---|
2349 | case SEEK_DATA: |
---|
2350 | error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); |
---|
2351 | break; |
---|
2352 | case SEEK_HOLE: |
---|
2353 | error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); |
---|
2354 | break; |
---|
2355 | default: |
---|
2356 | error = EINVAL; |
---|
2357 | } |
---|
2358 | if (error == 0 && noneg && offset < 0) |
---|
2359 | error = EINVAL; |
---|
2360 | if (error != 0) |
---|
2361 | goto drop; |
---|
2362 | VFS_KNOTE_UNLOCKED(vp, 0); |
---|
2363 | td->td_uretoff.tdu_off = offset; |
---|
2364 | drop: |
---|
2365 | foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); |
---|
2366 | return (error); |
---|
2367 | } |
---|
2368 | |
---|
2369 | int |
---|
2370 | vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, |
---|
2371 | struct thread *td) |
---|
2372 | { |
---|
2373 | int error; |
---|
2374 | |
---|
2375 | /* |
---|
2376 | * Grant permission if the caller is the owner of the file, or |
---|
2377 | * the super-user, or has ACL_WRITE_ATTRIBUTES permission on |
---|
2378 | * the file. If the time pointer is null, then write |
---|
2379 | * permission on the file is also sufficient. |
---|
2380 | * |
---|
2381 | * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: |
---|
2382 | * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES |
---|
2383 | * will be allowed to set the times [..] to the current |
---|
2384 | * server time. |
---|
2385 | */ |
---|
2386 | error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); |
---|
2387 | if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) |
---|
2388 | error = VOP_ACCESS(vp, VWRITE, cred, td); |
---|
2389 | return (error); |
---|
2390 | } |
---|
2391 | |
---|
2392 | int |
---|
2393 | vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) |
---|
2394 | { |
---|
2395 | struct vnode *vp; |
---|
2396 | int error; |
---|
2397 | |
---|
2398 | if (fp->f_type == DTYPE_FIFO) |
---|
2399 | kif->kf_type = KF_TYPE_FIFO; |
---|
2400 | else |
---|
2401 | kif->kf_type = KF_TYPE_VNODE; |
---|
2402 | vp = fp->f_vnode; |
---|
2403 | vref(vp); |
---|
2404 | FILEDESC_SUNLOCK(fdp); |
---|
2405 | error = vn_fill_kinfo_vnode(vp, kif); |
---|
2406 | vrele(vp); |
---|
2407 | FILEDESC_SLOCK(fdp); |
---|
2408 | return (error); |
---|
2409 | } |
---|
2410 | |
---|
2411 | static inline void |
---|
2412 | vn_fill_junk(struct kinfo_file *kif) |
---|
2413 | { |
---|
2414 | size_t len, olen; |
---|
2415 | |
---|
2416 | /* |
---|
2417 | * Simulate vn_fullpath returning changing values for a given |
---|
2418 | * vp during e.g. coredump. |
---|
2419 | */ |
---|
2420 | len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; |
---|
2421 | olen = strlen(kif->kf_path); |
---|
2422 | if (len < olen) |
---|
2423 | strcpy(&kif->kf_path[len - 1], "$"); |
---|
2424 | else |
---|
2425 | for (; olen < len; olen++) |
---|
2426 | strcpy(&kif->kf_path[olen], "A"); |
---|
2427 | } |
---|
2428 | |
---|
2429 | int |
---|
2430 | vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) |
---|
2431 | { |
---|
2432 | struct vattr va; |
---|
2433 | char *fullpath, *freepath; |
---|
2434 | int error; |
---|
2435 | |
---|
2436 | kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); |
---|
2437 | freepath = NULL; |
---|
2438 | fullpath = "-"; |
---|
2439 | error = vn_fullpath(curthread, vp, &fullpath, &freepath); |
---|
2440 | if (error == 0) { |
---|
2441 | strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); |
---|
2442 | } |
---|
2443 | if (freepath != NULL) |
---|
2444 | free(freepath, M_TEMP); |
---|
2445 | |
---|
2446 | KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, |
---|
2447 | vn_fill_junk(kif); |
---|
2448 | ); |
---|
2449 | |
---|
2450 | /* |
---|
2451 | * Retrieve vnode attributes. |
---|
2452 | */ |
---|
2453 | va.va_fsid = VNOVAL; |
---|
2454 | va.va_rdev = NODEV; |
---|
2455 | vn_lock(vp, LK_SHARED | LK_RETRY); |
---|
2456 | error = VOP_GETATTR(vp, &va, curthread->td_ucred); |
---|
2457 | VOP_UNLOCK(vp, 0); |
---|
2458 | if (error != 0) |
---|
2459 | return (error); |
---|
2460 | if (va.va_fsid != VNOVAL) |
---|
2461 | kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; |
---|
2462 | else |
---|
2463 | kif->kf_un.kf_file.kf_file_fsid = |
---|
2464 | vp->v_mount->mnt_stat.f_fsid.val[0]; |
---|
2465 | kif->kf_un.kf_file.kf_file_fsid_freebsd11 = |
---|
2466 | kif->kf_un.kf_file.kf_file_fsid; /* truncate */ |
---|
2467 | kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; |
---|
2468 | kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); |
---|
2469 | kif->kf_un.kf_file.kf_file_size = va.va_size; |
---|
2470 | kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; |
---|
2471 | kif->kf_un.kf_file.kf_file_rdev_freebsd11 = |
---|
2472 | kif->kf_un.kf_file.kf_file_rdev; /* truncate */ |
---|
2473 | return (0); |
---|
2474 | } |
---|
2475 | |
---|
2476 | #ifndef __rtems__ |
---|
2477 | int |
---|
2478 | vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, |
---|
2479 | vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, |
---|
2480 | struct thread *td) |
---|
2481 | { |
---|
2482 | #ifdef HWPMC_HOOKS |
---|
2483 | struct pmckern_map_in pkm; |
---|
2484 | #endif |
---|
2485 | struct mount *mp; |
---|
2486 | struct vnode *vp; |
---|
2487 | vm_object_t object; |
---|
2488 | vm_prot_t maxprot; |
---|
2489 | boolean_t writecounted; |
---|
2490 | int error; |
---|
2491 | |
---|
2492 | #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ |
---|
2493 | defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) |
---|
2494 | /* |
---|
2495 | * POSIX shared-memory objects are defined to have |
---|
2496 | * kernel persistence, and are not defined to support |
---|
2497 | * read(2)/write(2) -- or even open(2). Thus, we can |
---|
2498 | * use MAP_ASYNC to trade on-disk coherence for speed. |
---|
2499 | * The shm_open(3) library routine turns on the FPOSIXSHM |
---|
2500 | * flag to request this behavior. |
---|
2501 | */ |
---|
2502 | if ((fp->f_flag & FPOSIXSHM) != 0) |
---|
2503 | flags |= MAP_NOSYNC; |
---|
2504 | #endif |
---|
2505 | vp = fp->f_vnode; |
---|
2506 | |
---|
2507 | /* |
---|
2508 | * Ensure that file and memory protections are |
---|
2509 | * compatible. Note that we only worry about |
---|
2510 | * writability if mapping is shared; in this case, |
---|
2511 | * current and max prot are dictated by the open file. |
---|
2512 | * XXX use the vnode instead? Problem is: what |
---|
2513 | * credentials do we use for determination? What if |
---|
2514 | * proc does a setuid? |
---|
2515 | */ |
---|
2516 | mp = vp->v_mount; |
---|
2517 | if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { |
---|
2518 | maxprot = VM_PROT_NONE; |
---|
2519 | if ((prot & VM_PROT_EXECUTE) != 0) |
---|
2520 | return (EACCES); |
---|
2521 | } else |
---|
2522 | maxprot = VM_PROT_EXECUTE; |
---|
2523 | if ((fp->f_flag & FREAD) != 0) |
---|
2524 | maxprot |= VM_PROT_READ; |
---|
2525 | else if ((prot & VM_PROT_READ) != 0) |
---|
2526 | return (EACCES); |
---|
2527 | |
---|
2528 | /* |
---|
2529 | * If we are sharing potential changes via MAP_SHARED and we |
---|
2530 | * are trying to get write permission although we opened it |
---|
2531 | * without asking for it, bail out. |
---|
2532 | */ |
---|
2533 | if ((flags & MAP_SHARED) != 0) { |
---|
2534 | if ((fp->f_flag & FWRITE) != 0) |
---|
2535 | maxprot |= VM_PROT_WRITE; |
---|
2536 | else if ((prot & VM_PROT_WRITE) != 0) |
---|
2537 | return (EACCES); |
---|
2538 | } else { |
---|
2539 | maxprot |= VM_PROT_WRITE; |
---|
2540 | cap_maxprot |= VM_PROT_WRITE; |
---|
2541 | } |
---|
2542 | maxprot &= cap_maxprot; |
---|
2543 | |
---|
2544 | /* |
---|
2545 | * For regular files and shared memory, POSIX requires that |
---|
2546 | * the value of foff be a legitimate offset within the data |
---|
2547 | * object. In particular, negative offsets are invalid. |
---|
2548 | * Blocking negative offsets and overflows here avoids |
---|
2549 | * possible wraparound or user-level access into reserved |
---|
2550 | * ranges of the data object later. In contrast, POSIX does |
---|
2551 | * not dictate how offsets are used by device drivers, so in |
---|
2552 | * the case of a device mapping a negative offset is passed |
---|
2553 | * on. |
---|
2554 | */ |
---|
2555 | if ( |
---|
2556 | #ifdef _LP64 |
---|
2557 | size > OFF_MAX || |
---|
2558 | #endif |
---|
2559 | foff < 0 || foff > OFF_MAX - size) |
---|
2560 | return (EINVAL); |
---|
2561 | |
---|
2562 | writecounted = FALSE; |
---|
2563 | error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, |
---|
2564 | &foff, &object, &writecounted); |
---|
2565 | if (error != 0) |
---|
2566 | return (error); |
---|
2567 | error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, |
---|
2568 | foff, writecounted, td); |
---|
2569 | if (error != 0) { |
---|
2570 | /* |
---|
2571 | * If this mapping was accounted for in the vnode's |
---|
2572 | * writecount, then undo that now. |
---|
2573 | */ |
---|
2574 | if (writecounted) |
---|
2575 | vm_pager_release_writecount(object, 0, size); |
---|
2576 | vm_object_deallocate(object); |
---|
2577 | } |
---|
2578 | #ifdef HWPMC_HOOKS |
---|
2579 | /* Inform hwpmc(4) if an executable is being mapped. */ |
---|
2580 | if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { |
---|
2581 | if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { |
---|
2582 | pkm.pm_file = vp; |
---|
2583 | pkm.pm_address = (uintptr_t) *addr; |
---|
2584 | PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); |
---|
2585 | } |
---|
2586 | } |
---|
2587 | #endif |
---|
2588 | return (error); |
---|
2589 | } |
---|
2590 | #endif /* __rtems__ */ |
---|
2591 | |
---|
2592 | void |
---|
2593 | vn_fsid(struct vnode *vp, struct vattr *va) |
---|
2594 | { |
---|
2595 | fsid_t *f; |
---|
2596 | |
---|
2597 | f = &vp->v_mount->mnt_stat.f_fsid; |
---|
2598 | va->va_fsid = (uint32_t)f->val[1]; |
---|
2599 | va->va_fsid <<= sizeof(f->val[1]) * NBBY; |
---|
2600 | va->va_fsid += (uint32_t)f->val[0]; |
---|
2601 | } |
---|
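/*
 * Editorial note (not part of the original source): the shifting above
 * just packs the mount's two fsid words into the wider va_fsid, i.e.
 * conceptually (assuming the usual 32-bit fsid_t members):
 *
 *	va->va_fsid = ((uint64_t)f->val[1] << 32) | (uint32_t)f->val[0];
 */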
2602 | |
---|
2603 | int |
---|
2604 | vn_fsync_buf(struct vnode *vp, int waitfor) |
---|
2605 | { |
---|
2606 | struct buf *bp, *nbp; |
---|
2607 | struct bufobj *bo; |
---|
2608 | struct mount *mp; |
---|
2609 | int error, maxretry; |
---|
2610 | |
---|
2611 | error = 0; |
---|
2612 | maxretry = 10000; /* large, arbitrarily chosen */ |
---|
2613 | mp = NULL; |
---|
2614 | if (vp->v_type == VCHR) { |
---|
2615 | VI_LOCK(vp); |
---|
2616 | mp = vp->v_rdev->si_mountpt; |
---|
2617 | VI_UNLOCK(vp); |
---|
2618 | } |
---|
2619 | bo = &vp->v_bufobj; |
---|
2620 | BO_LOCK(bo); |
---|
2621 | loop1: |
---|
2622 | /* |
---|
2623 | * MARK/SCAN initialization to avoid infinite loops. |
---|
2624 | */ |
---|
2625 | TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { |
---|
2626 | bp->b_vflags &= ~BV_SCANNED; |
---|
2627 | bp->b_error = 0; |
---|
2628 | } |
---|
2629 | |
---|
2630 | /* |
---|
2631 | * Flush all dirty buffers associated with a vnode. |
---|
2632 | */ |
---|
2633 | loop2: |
---|
2634 | TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { |
---|
2635 | if ((bp->b_vflags & BV_SCANNED) != 0) |
---|
2636 | continue; |
---|
2637 | bp->b_vflags |= BV_SCANNED; |
---|
2638 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { |
---|
2639 | if (waitfor != MNT_WAIT) |
---|
2640 | continue; |
---|
2641 | if (BUF_LOCK(bp, |
---|
2642 | LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, |
---|
2643 | BO_LOCKPTR(bo)) != 0) { |
---|
2644 | BO_LOCK(bo); |
---|
2645 | goto loop1; |
---|
2646 | } |
---|
2647 | BO_LOCK(bo); |
---|
2648 | } |
---|
2649 | BO_UNLOCK(bo); |
---|
2650 | KASSERT(bp->b_bufobj == bo, |
---|
2651 | ("bp %p wrong b_bufobj %p should be %p", |
---|
2652 | bp, bp->b_bufobj, bo)); |
---|
2653 | if ((bp->b_flags & B_DELWRI) == 0) |
---|
2654 | panic("fsync: not dirty"); |
---|
2655 | if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { |
---|
2656 | vfs_bio_awrite(bp); |
---|
2657 | } else { |
---|
2658 | bremfree(bp); |
---|
2659 | bawrite(bp); |
---|
2660 | } |
---|
2661 | if (maxretry < 1000) |
---|
2662 | pause("dirty", hz < 1000 ? 1 : hz / 1000); |
---|
2663 | BO_LOCK(bo); |
---|
2664 | goto loop2; |
---|
2665 | } |
---|
2666 | |
---|
2667 | /* |
---|
2668 | * If synchronous the caller expects us to completely resolve all |
---|
2669 | * dirty buffers in the system. Wait for in-progress I/O to |
---|
2670 | * complete (which could include background bitmap writes), then |
---|
2671 | * retry if dirty blocks still exist. |
---|
2672 | */ |
---|
2673 | if (waitfor == MNT_WAIT) { |
---|
2674 | bufobj_wwait(bo, 0, 0); |
---|
2675 | if (bo->bo_dirty.bv_cnt > 0) { |
---|
2676 | /* |
---|
2677 | * If we are unable to write any of these buffers |
---|
2678 | * then we fail now rather than trying endlessly |
---|
2679 | * to write them out. |
---|
2680 | */ |
---|
2681 | TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) |
---|
2682 | if ((error = bp->b_error) != 0) |
---|
2683 | break; |
---|
2684 | if ((mp != NULL && mp->mnt_secondary_writes > 0) || |
---|
2685 | (error == 0 && --maxretry >= 0)) |
---|
2686 | goto loop1; |
---|
2687 | if (error == 0) |
---|
2688 | error = EAGAIN; |
---|
2689 | } |
---|
2690 | } |
---|
2691 | BO_UNLOCK(bo); |
---|
2692 | if (error != 0) |
---|
2693 | vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); |
---|
2694 | |
---|
2695 | return (error); |
---|
2696 | } |
---|