Context Navigation

vfs_vnops.c @ 6514d56

6-freebsd-12

Last change on this file since 6514d56 was 6514d56, checked in by Chris Johns <chrisj@…>, on 08/02/21 at 05:09:41

sys/kern: Add VFS support

Refactor the libio interface

Move syscalls into an rtemsbsd location

Provide a root directory mount point

Update #4475

Property mode set to 100644

File size: 66.0 KB

Line
1	#include <machine/rtems-bsd-kernel-space.h>
2
3	/*-
4	* SPDX-License-Identifier: BSD-3-Clause
5	*
6	* Copyright (c) 1982, 1986, 1989, 1993
7	* The Regents of the University of California. All rights reserved.
8	* (c) UNIX System Laboratories, Inc.
9	* All or some portions of this file are derived from material licensed
10	* to the University of California by American Telephone and Telegraph
11	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
12	* the permission of UNIX System Laboratories, Inc.
13	*
14	* Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
15	* Copyright (c) 2013, 2014 The FreeBSD Foundation
16	*
17	* Portions of this software were developed by Konstantin Belousov
18	* under sponsorship from the FreeBSD Foundation.
19	*
20	* Redistribution and use in source and binary forms, with or without
21	* modification, are permitted provided that the following conditions
22	* are met:
23	* 1. Redistributions of source code must retain the above copyright
24	* notice, this list of conditions and the following disclaimer.
25	* 2. Redistributions in binary form must reproduce the above copyright
26	* notice, this list of conditions and the following disclaimer in the
27	* documentation and/or other materials provided with the distribution.
28	* 3. Neither the name of the University nor the names of its contributors
29	* may be used to endorse or promote products derived from this software
30	* without specific prior written permission.
31	*
32	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
33	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
36	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42	* SUCH DAMAGE.
43	*
44	* @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
45	*/
46
47	#include <sys/cdefs.h>
48	__FBSDID("$FreeBSD$");
49
50	#include <rtems/bsd/local/opt_hwpmc_hooks.h>
51
52	#include <sys/param.h>
53	#include <sys/systm.h>
54	#include <sys/disk.h>
55	#include <sys/fail.h>
56	#include <sys/fcntl.h>
57	#include <sys/file.h>
58	#include <sys/kdb.h>
59	#include <sys/stat.h>
60	#include <sys/priv.h>
61	#include <sys/proc.h>
62	#include <sys/limits.h>
63	#include <sys/lock.h>
64	#include <sys/mman.h>
65	#include <sys/mount.h>
66	#include <sys/mutex.h>
67	#include <sys/namei.h>
68	#include <sys/vnode.h>
69	#include <sys/bio.h>
70	#include <sys/buf.h>
71	#include <sys/filio.h>
72	#include <sys/resourcevar.h>
73	#include <sys/rwlock.h>
74	#include <sys/sx.h>
75	#include <sys/sysctl.h>
76	#include <sys/ttycom.h>
77	#include <sys/conf.h>
78	#include <sys/syslog.h>
79	#include <rtems/bsd/sys/unistd.h>
80	#include <sys/user.h>
81
82	#include <security/audit/audit.h>
83	#include <security/mac/mac_framework.h>
84
85	#include <vm/vm.h>
86	#include <vm/vm_extern.h>
87	#include <vm/pmap.h>
88	#include <vm/vm_map.h>
89	#include <vm/vm_object.h>
90	#include <vm/vm_page.h>
91	#include <vm/vm_pager.h>
92
93	#ifdef HWPMC_HOOKS
94	#include <sys/pmckern.h>
95	#endif
96
97	static fo_rdwr_t vn_read;
98	static fo_rdwr_t vn_write;
99	static fo_rdwr_t vn_io_fault;
100	static fo_truncate_t vn_truncate;
101	static fo_ioctl_t vn_ioctl;
102	static fo_poll_t vn_poll;
103	static fo_kqfilter_t vn_kqfilter;
104	static fo_stat_t vn_statfile;
105	static fo_close_t vn_closefile;
106	static fo_mmap_t vn_mmap;
107
108	struct fileops vnops = {
109	.fo_read = vn_io_fault,
110	.fo_write = vn_io_fault,
111	.fo_truncate = vn_truncate,
112	.fo_ioctl = vn_ioctl,
113	.fo_poll = vn_poll,
114	.fo_kqfilter = vn_kqfilter,
115	.fo_stat = vn_statfile,
116	.fo_close = vn_closefile,
117	.fo_chmod = vn_chmod,
118	.fo_chown = vn_chown,
119	#ifndef __rtems__
120	.fo_sendfile = vn_sendfile,
121	#endif /* __rtems__ */
122	.fo_seek = vn_seek,
123	#ifndef __rtems__
124	.fo_fill_kinfo = vn_fill_kinfo,
125	.fo_mmap = vn_mmap,
126	#endif /* __rtems__ */
127	.fo_flags = DFLAG_PASSABLE \| DFLAG_SEEKABLE
128	};
129
130	static const int io_hold_cnt = 16;
131	static int vn_io_fault_enable = 1;
132	SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
133	&vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
134	static int vn_io_fault_prefault = 0;
135	SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
136	&vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
137	static u_long vn_io_faults_cnt;
138	SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
139	&vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
140
141	/*
142	* Returns true if vn_io_fault mode of handling the i/o request should
143	* be used.
144	*/
145	static bool
146	do_vn_io_fault(struct vnode vp, struct uio uio)
147	{
148	struct mount *mp;
149
150	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
151	(mp = vp->v_mount) != NULL &&
152	(mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
153	}
154
155	/*
156	* Structure used to pass arguments to vn_io_fault1(), to do either
157	* file- or vnode-based I/O calls.
158	*/
159	struct vn_io_fault_args {
160	enum {
161	VN_IO_FAULT_FOP,
162	VN_IO_FAULT_VOP
163	} kind;
164	struct ucred *cred;
165	int flags;
166	union {
167	struct fop_args_tag {
168	struct file *fp;
169	fo_rdwr_t *doio;
170	} fop_args;
171	struct vop_args_tag {
172	struct vnode *vp;
173	} vop_args;
174	} args;
175	};
176
177	static int vn_io_fault1(struct vnode vp, struct uio uio,
178	struct vn_io_fault_args args, struct thread td);
179
180	int
181	vn_open(struct nameidata ndp, int flagp, int cmode, struct file *fp)
182	{
183	struct thread *td = ndp->ni_cnd.cn_thread;
184
185	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
186	}
187
188	/*
189	* Common code for vnode open operations via a name lookup.
190	* Lookup the vnode and invoke VOP_CREATE if needed.
191	* Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
192	*
193	* Note that this does NOT free nameidata for the successful case,
194	* due to the NDINIT being done elsewhere.
195	*/
196	int
197	vn_open_cred(struct nameidata ndp, int flagp, int cmode, u_int vn_open_flags,
198	struct ucred cred, struct file fp)
199	{
200	struct vnode *vp;
201	struct mount *mp;
202	struct thread *td = ndp->ni_cnd.cn_thread;
203	struct vattr vat;
204	struct vattr *vap = &vat;
205	int fmode, error;
206
207	restart:
208	fmode = *flagp;
209	if ((fmode & (O_CREAT \| O_EXCL \| O_DIRECTORY)) == (O_CREAT \|
210	O_EXCL \| O_DIRECTORY))
211	return (EINVAL);
212	else if ((fmode & (O_CREAT \| O_DIRECTORY)) == O_CREAT) {
213	ndp->ni_cnd.cn_nameiop = CREATE;
214	/*
215	* Set NOCACHE to avoid flushing the cache when
216	* rolling in many files at once.
217	*/
218	ndp->ni_cnd.cn_flags = ISOPEN \| LOCKPARENT \| LOCKLEAF \| NOCACHE;
219	if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
220	ndp->ni_cnd.cn_flags \|= FOLLOW;
221	if (!(vn_open_flags & VN_OPEN_NOAUDIT))
222	ndp->ni_cnd.cn_flags \|= AUDITVNODE1;
223	if (vn_open_flags & VN_OPEN_NOCAPCHECK)
224	ndp->ni_cnd.cn_flags \|= NOCAPCHECK;
225	if ((vn_open_flags & VN_OPEN_INVFS) == 0)
226	bwillwrite();
227	if ((error = namei(ndp)) != 0)
228	return (error);
229	if (ndp->ni_vp == NULL) {
230	VATTR_NULL(vap);
231	vap->va_type = VREG;
232	vap->va_mode = cmode;
233	if (fmode & O_EXCL)
234	vap->va_vaflags \|= VA_EXCLUSIVE;
235	if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
236	NDFREE(ndp, NDF_ONLY_PNBUF);
237	vput(ndp->ni_dvp);
238	if ((error = vn_start_write(NULL, &mp,
239	V_XSLEEP \| PCATCH)) != 0)
240	return (error);
241	goto restart;
242	}
243	if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
244	ndp->ni_cnd.cn_flags \|= MAKEENTRY;
245	#ifdef MAC
246	error = mac_vnode_check_create(cred, ndp->ni_dvp,
247	&ndp->ni_cnd, vap);
248	if (error == 0)
249	#endif
250	error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
251	&ndp->ni_cnd, vap);
252	vput(ndp->ni_dvp);
253	vn_finished_write(mp);
254	if (error) {
255	NDFREE(ndp, NDF_ONLY_PNBUF);
256	return (error);
257	}
258	fmode &= ~O_TRUNC;
259	vp = ndp->ni_vp;
260	} else {
261	if (ndp->ni_dvp == ndp->ni_vp)
262	vrele(ndp->ni_dvp);
263	else
264	vput(ndp->ni_dvp);
265	ndp->ni_dvp = NULL;
266	vp = ndp->ni_vp;
267	if (fmode & O_EXCL) {
268	error = EEXIST;
269	goto bad;
270	}
271	if (vp->v_type == VDIR) {
272	error = EISDIR;
273	goto bad;
274	}
275	fmode &= ~O_CREAT;
276	}
277	} else {
278	ndp->ni_cnd.cn_nameiop = LOOKUP;
279	ndp->ni_cnd.cn_flags = ISOPEN \|
280	((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) \| LOCKLEAF;
281	if (!(fmode & FWRITE))
282	ndp->ni_cnd.cn_flags \|= LOCKSHARED;
283	if (!(vn_open_flags & VN_OPEN_NOAUDIT))
284	ndp->ni_cnd.cn_flags \|= AUDITVNODE1;
285	if (vn_open_flags & VN_OPEN_NOCAPCHECK)
286	ndp->ni_cnd.cn_flags \|= NOCAPCHECK;
287	if ((error = namei(ndp)) != 0)
288	return (error);
289	vp = ndp->ni_vp;
290	}
291	error = vn_open_vnode(vp, fmode, cred, td, fp);
292	if (error)
293	goto bad;
294	*flagp = fmode;
295	return (0);
296	bad:
297	NDFREE(ndp, NDF_ONLY_PNBUF);
298	vput(vp);
299	*flagp = fmode;
300	ndp->ni_vp = NULL;
301	return (error);
302	}
303
304	static int
305	vn_open_vnode_advlock(struct vnode vp, int fmode, struct file fp)
306	{
307	struct flock lf;
308	int error, lock_flags, type;
309
310	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
311	if ((fmode & (O_EXLOCK \| O_SHLOCK)) == 0)
312	return (0);
313	KASSERT(fp != NULL, ("open with flock requires fp"));
314	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
315	return (EOPNOTSUPP);
316
317	lock_flags = VOP_ISLOCKED(vp);
318	VOP_UNLOCK(vp, 0);
319
320	lf.l_whence = SEEK_SET;
321	lf.l_start = 0;
322	lf.l_len = 0;
323	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
324	type = F_FLOCK;
325	if ((fmode & FNONBLOCK) == 0)
326	type \|= F_WAIT;
327	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
328	if (error == 0)
329	fp->f_flag \|= FHASLOCK;
330
331	vn_lock(vp, lock_flags \| LK_RETRY);
332	if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
333	error = ENOENT;
334	return (error);
335	}
336
337	/*
338	* Common code for vnode open operations once a vnode is located.
339	* Check permissions, and call the VOP_OPEN routine.
340	*/
341	int
342	vn_open_vnode(struct vnode vp, int fmode, struct ucred cred,
343	struct thread td, struct file fp)
344	{
345	accmode_t accmode;
346	int error;
347
348	if (vp->v_type == VLNK)
349	return (EMLINK);
350	if (vp->v_type == VSOCK)
351	return (EOPNOTSUPP);
352	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
353	return (ENOTDIR);
354	accmode = 0;
355	if (fmode & (FWRITE \| O_TRUNC)) {
356	if (vp->v_type == VDIR)
357	return (EISDIR);
358	accmode \|= VWRITE;
359	}
360	if (fmode & FREAD)
361	accmode \|= VREAD;
362	#ifndef __rtems__
363	if (fmode & FEXEC)
364	accmode \|= VEXEC;
365	#endif /* __rtems__ */
366	if ((fmode & O_APPEND) && (fmode & FWRITE))
367	accmode \|= VAPPEND;
368	#ifdef MAC
369	if (fmode & O_CREAT)
370	accmode \|= VCREAT;
371	if (fmode & O_VERIFY)
372	accmode \|= VVERIFY;
373	error = mac_vnode_check_open(cred, vp, accmode);
374	if (error)
375	return (error);
376
377	accmode &= ~(VCREAT \| VVERIFY);
378	#endif
379	if ((fmode & O_CREAT) == 0 && accmode != 0) {
380	error = VOP_ACCESS(vp, accmode, cred, td);
381	if (error != 0)
382	return (error);
383	}
384	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
385	vn_lock(vp, LK_UPGRADE \| LK_RETRY);
386	error = VOP_OPEN(vp, fmode, cred, td, fp);
387	if (error != 0)
388	return (error);
389
390	error = vn_open_vnode_advlock(vp, fmode, fp);
391	if (error == 0 && (fmode & FWRITE) != 0) {
392	error = VOP_ADD_WRITECOUNT(vp, 1);
393	if (error == 0) {
394	CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
395	__func__, vp, vp->v_writecount);
396	}
397	}
398
399	/*
400	* Error from advlock or VOP_ADD_WRITECOUNT() still requires
401	* calling VOP_CLOSE() to pair with earlier VOP_OPEN().
402	* Arrange for that by having fdrop() to use vn_closefile().
403	*/
404	if (error != 0) {
405	#ifndef __rtems__
406	fp->f_flag \|= FOPENFAILED;
407	#endif /* __rtems__ */
408	fp->f_vnode = vp;
409	if (fp->f_ops == &badfileops) {
410	fp->f_type = DTYPE_VNODE;
411	fp->f_ops = &vnops;
412	}
413	vref(vp);
414	}
415
416	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
417	return (error);
418
419	}
420
421	/*
422	* Check for write permissions on the specified vnode.
423	* Prototype text segments cannot be written.
424	* It is racy.
425	*/
426	int
427	vn_writechk(struct vnode *vp)
428	{
429
430	ASSERT_VOP_LOCKED(vp, "vn_writechk");
431	/*
432	* If there's shared text associated with
433	* the vnode, try to free it up once. If
434	* we fail, we can't allow writing.
435	*/
436	if (VOP_IS_TEXT(vp))
437	return (ETXTBSY);
438
439	return (0);
440	}
441
442	/*
443	* Vnode close call
444	*/
445	static int
446	vn_close1(struct vnode vp, int flags, struct ucred file_cred,
447	struct thread *td, bool keep_ref)
448	{
449	struct mount *mp;
450	int error, lock_flags;
451
452	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
453	MNT_EXTENDED_SHARED(vp->v_mount))
454	lock_flags = LK_SHARED;
455	else
456	lock_flags = LK_EXCLUSIVE;
457
458	vn_start_write(vp, &mp, V_WAIT);
459	vn_lock(vp, lock_flags \| LK_RETRY);
460	AUDIT_ARG_VNODE1(vp);
461	#ifndef __rtems__
462	if ((flags & (FWRITE \| FOPENFAILED)) == FWRITE) {
463	#else /* __rtems__ */
464	if ((flags & FWRITE) == FWRITE) {
465	#endif /* __rtems__ */
466	VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
467	CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
468	__func__, vp, vp->v_writecount);
469	}
470	error = VOP_CLOSE(vp, flags, file_cred, td);
471	if (keep_ref)
472	VOP_UNLOCK(vp, 0);
473	else
474	vput(vp);
475	vn_finished_write(mp);
476	return (error);
477	}
478
479	int
480	vn_close(struct vnode vp, int flags, struct ucred file_cred,
481	struct thread *td)
482	{
483
484	return (vn_close1(vp, flags, file_cred, td, false));
485	}
486
487	/*
488	* Heuristic to detect sequential operation.
489	*/
490	static int
491	sequential_heuristic(struct uio uio, struct file fp)
492	{
493
494	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
495	#ifndef __rtems__
496	if (fp->f_flag & FRDAHEAD)
497	return (fp->f_seqcount << IO_SEQSHIFT);
498	#endif /* __rtems__ */
499
500	/*
501	* Offset 0 is handled specially. open() sets f_seqcount to 1 so
502	* that the first I/O is normally considered to be slightly
503	* sequential. Seeking to offset 0 doesn't change sequentiality
504	* unless previous seeks have reduced f_seqcount to 0, in which
505	* case offset 0 is not special.
506	*/
507	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) \|\|
508	uio->uio_offset == fp->f_nextoff) {
509	/*
510	* f_seqcount is in units of fixed-size blocks so that it
511	* depends mainly on the amount of sequential I/O and not
512	* much on the number of sequential I/O's. The fixed size
513	* of 16384 is hard-coded here since it is (not quite) just
514	* a magic size that works well here. This size is more
515	* closely related to the best I/O size for real disks than
516	* to any block size used by software.
517	*/
518	if (uio->uio_resid >= IO_SEQMAX * 16384)
519	fp->f_seqcount = IO_SEQMAX;
520	else {
521	fp->f_seqcount += howmany(uio->uio_resid, 16384);
522	if (fp->f_seqcount > IO_SEQMAX)
523	fp->f_seqcount = IO_SEQMAX;
524	}
525	return (fp->f_seqcount << IO_SEQSHIFT);
526	}
527
528	/* Not sequential. Quickly draw-down sequentiality. */
529	if (fp->f_seqcount > 1)
530	fp->f_seqcount = 1;
531	else
532	fp->f_seqcount = 0;
533	return (0);
534	}
535
536	/*
537	* Package up an I/O request on a vnode into a uio and do it.
538	*/
539	int
540	vn_rdwr(enum uio_rw rw, struct vnode vp, void base, int len, off_t offset,
541	enum uio_seg segflg, int ioflg, struct ucred *active_cred,
542	struct ucred file_cred, ssize_t aresid, struct thread *td)
543	{
544	struct uio auio;
545	struct iovec aiov;
546	struct mount *mp;
547	struct ucred *cred;
548	void *rl_cookie;
549	struct vn_io_fault_args args;
550	int error, lock_flags;
551
552	if (offset < 0 && vp->v_type != VCHR)
553	return (EINVAL);
554	auio.uio_iov = &aiov;
555	auio.uio_iovcnt = 1;
556	aiov.iov_base = base;
557	aiov.iov_len = len;
558	auio.uio_resid = len;
559	auio.uio_offset = offset;
560	auio.uio_segflg = segflg;
561	auio.uio_rw = rw;
562	auio.uio_td = td;
563	error = 0;
564
565	if ((ioflg & IO_NODELOCKED) == 0) {
566	if ((ioflg & IO_RANGELOCKED) == 0) {
567	if (rw == UIO_READ) {
568	rl_cookie = vn_rangelock_rlock(vp, offset,
569	offset + len);
570	} else {
571	rl_cookie = vn_rangelock_wlock(vp, offset,
572	offset + len);
573	}
574	} else
575	rl_cookie = NULL;
576	mp = NULL;
577	if (rw == UIO_WRITE) {
578	if (vp->v_type != VCHR &&
579	(error = vn_start_write(vp, &mp, V_WAIT \| PCATCH))
580	!= 0)
581	goto out;
582	if (MNT_SHARED_WRITES(mp) \|\|
583	((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
584	lock_flags = LK_SHARED;
585	else
586	lock_flags = LK_EXCLUSIVE;
587	} else
588	lock_flags = LK_SHARED;
589	vn_lock(vp, lock_flags \| LK_RETRY);
590	} else
591	rl_cookie = NULL;
592
593	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
594	#ifdef MAC
595	if ((ioflg & IO_NOMACCHECK) == 0) {
596	if (rw == UIO_READ)
597	error = mac_vnode_check_read(active_cred, file_cred,
598	vp);
599	else
600	error = mac_vnode_check_write(active_cred, file_cred,
601	vp);
602	}
603	#endif
604	if (error == 0) {
605	if (file_cred != NULL)
606	cred = file_cred;
607	else
608	cred = active_cred;
609	if (do_vn_io_fault(vp, &auio)) {
610	args.kind = VN_IO_FAULT_VOP;
611	args.cred = cred;
612	args.flags = ioflg;
613	args.args.vop_args.vp = vp;
614	error = vn_io_fault1(vp, &auio, &args, td);
615	} else if (rw == UIO_READ) {
616	error = VOP_READ(vp, &auio, ioflg, cred);
617	} else /* if (rw == UIO_WRITE) */ {
618	error = VOP_WRITE(vp, &auio, ioflg, cred);
619	}
620	}
621	if (aresid)
622	*aresid = auio.uio_resid;
623	else
624	if (auio.uio_resid && error == 0)
625	error = EIO;
626	if ((ioflg & IO_NODELOCKED) == 0) {
627	VOP_UNLOCK(vp, 0);
628	if (mp != NULL)
629	vn_finished_write(mp);
630	}
631	out:
632	if (rl_cookie != NULL)
633	vn_rangelock_unlock(vp, rl_cookie);
634	return (error);
635	}
636
637	/*
638	* Package up an I/O request on a vnode into a uio and do it. The I/O
639	* request is split up into smaller chunks and we try to avoid saturating
640	* the buffer cache while potentially holding a vnode locked, so we
641	* check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
642	* to give other processes a chance to lock the vnode (either other processes
643	* core'ing the same binary, or unrelated processes scanning the directory).
644	*/
645	int
646	vn_rdwr_inchunks(enum uio_rw rw, struct vnode vp, void base, size_t len,
647	off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
648	struct ucred file_cred, size_t aresid, struct thread *td)
649	{
650	int error = 0;
651	ssize_t iaresid;
652
653	do {
654	int chunk;
655
656	/*
657	* Force `offset' to a multiple of MAXBSIZE except possibly
658	* for the first chunk, so that filesystems only need to
659	* write full blocks except possibly for the first and last
660	* chunks.
661	*/
662	chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
663
664	if (chunk > len)
665	chunk = len;
666	if (rw != UIO_READ && vp->v_type == VREG)
667	bwillwrite();
668	iaresid = 0;
669	error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
670	ioflg, active_cred, file_cred, &iaresid, td);
671	len -= chunk; /* aresid calc already includes length */
672	if (error)
673	break;
674	offset += chunk;
675	base = (char *)base + chunk;
676	kern_yield(PRI_USER);
677	} while (len);
678	if (aresid)
679	*aresid = len + iaresid;
680	return (error);
681	}
682
683	off_t
684	foffset_lock(struct file *fp, int flags)
685	{
686	struct mtx *mtxp;
687	off_t res;
688
689	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
690
691	#if OFF_MAX <= LONG_MAX
692	/*
693	* Caller only wants the current f_offset value. Assume that
694	* the long and shorter integer types reads are atomic.
695	*/
696	if ((flags & FOF_NOLOCK) != 0)
697	return (fp->f_offset);
698	#endif
699
700	/*
701	* According to McKusick the vn lock was protecting f_offset here.
702	* It is now protected by the FOFFSET_LOCKED flag.
703	*/
704	mtxp = mtx_pool_find(mtxpool_sleep, fp);
705	mtx_lock(mtxp);
706	if ((flags & FOF_NOLOCK) == 0) {
707	while (fp->f_vnread_flags & FOFFSET_LOCKED) {
708	fp->f_vnread_flags \|= FOFFSET_LOCK_WAITING;
709	msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
710	"vofflock", 0);
711	}
712	fp->f_vnread_flags \|= FOFFSET_LOCKED;
713	}
714	res = fp->f_offset;
715	mtx_unlock(mtxp);
716	return (res);
717	}
718
719	void
720	foffset_unlock(struct file *fp, off_t val, int flags)
721	{
722	struct mtx *mtxp;
723
724	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
725
726	#if OFF_MAX <= LONG_MAX
727	if ((flags & FOF_NOLOCK) != 0) {
728	if ((flags & FOF_NOUPDATE) == 0)
729	fp->f_offset = val;
730	if ((flags & FOF_NEXTOFF) != 0)
731	fp->f_nextoff = val;
732	return;
733	}
734	#endif
735
736	mtxp = mtx_pool_find(mtxpool_sleep, fp);
737	mtx_lock(mtxp);
738	if ((flags & FOF_NOUPDATE) == 0)
739	fp->f_offset = val;
740	if ((flags & FOF_NEXTOFF) != 0)
741	fp->f_nextoff = val;
742	if ((flags & FOF_NOLOCK) == 0) {
743	KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
744	("Lost FOFFSET_LOCKED"));
745	if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
746	wakeup(&fp->f_vnread_flags);
747	fp->f_vnread_flags = 0;
748	}
749	mtx_unlock(mtxp);
750	}
751
752	void
753	foffset_lock_uio(struct file fp, struct uio uio, int flags)
754	{
755
756	if ((flags & FOF_OFFSET) == 0)
757	uio->uio_offset = foffset_lock(fp, flags);
758	}
759
760	void
761	foffset_unlock_uio(struct file fp, struct uio uio, int flags)
762	{
763
764	if ((flags & FOF_OFFSET) == 0)
765	foffset_unlock(fp, uio->uio_offset, flags);
766	}
767
768	static int
769	get_advice(struct file fp, struct uio uio)
770	{
771	#ifndef __rtems__
772	struct mtx *mtxp;
773	int ret;
774
775	ret = POSIX_FADV_NORMAL;
776	if (fp->f_advice == NULL \|\| fp->f_vnode->v_type != VREG)
777	return (ret);
778
779	mtxp = mtx_pool_find(mtxpool_sleep, fp);
780	mtx_lock(mtxp);
781	if (fp->f_advice != NULL &&
782	uio->uio_offset >= fp->f_advice->fa_start &&
783	uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
784	ret = fp->f_advice->fa_advice;
785	mtx_unlock(mtxp);
786	return (ret);
787	#else /* __rtems__ */
788	return (POSIX_FADV_NORMAL);
789	#endif /* __rtems__ */
790	}
791
792	/*
793	* File table vnode read routine.
794	*/
795	static int
796	vn_read(struct file fp, struct uio uio, struct ucred *active_cred, int flags,
797	struct thread *td)
798	{
799	struct vnode *vp;
800	off_t orig_offset;
801	int error, ioflag;
802	int advice;
803
804	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
805	uio->uio_td, td));
806	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
807	vp = fp->f_vnode;
808	ioflag = 0;
809	if (fp->f_flag & FNONBLOCK)
810	ioflag \|= IO_NDELAY;
811	if (fp->f_flag & O_DIRECT)
812	ioflag \|= IO_DIRECT;
813	advice = get_advice(fp, uio);
814	vn_lock(vp, LK_SHARED \| LK_RETRY);
815
816	switch (advice) {
817	case POSIX_FADV_NORMAL:
818	case POSIX_FADV_SEQUENTIAL:
819	case POSIX_FADV_NOREUSE:
820	ioflag \|= sequential_heuristic(uio, fp);
821	break;
822	case POSIX_FADV_RANDOM:
823	/* Disable read-ahead for random I/O. */
824	break;
825	}
826	orig_offset = uio->uio_offset;
827
828	#ifdef MAC
829	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
830	if (error == 0)
831	#endif
832	error = VOP_READ(vp, uio, ioflag, fp->f_cred);
833	fp->f_nextoff = uio->uio_offset;
834	VOP_UNLOCK(vp, 0);
835	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
836	orig_offset != uio->uio_offset)
837	/*
838	* Use POSIX_FADV_DONTNEED to flush pages and buffers
839	* for the backing file after a POSIX_FADV_NOREUSE
840	* read(2).
841	*/
842	error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
843	POSIX_FADV_DONTNEED);
844	return (error);
845	}
846
847	/*
848	* File table vnode write routine.
849	*/
850	static int
851	vn_write(struct file fp, struct uio uio, struct ucred *active_cred, int flags,
852	struct thread *td)
853	{
854	struct vnode *vp;
855	struct mount *mp;
856	off_t orig_offset;
857	int error, ioflag, lock_flags;
858	int advice;
859
860	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
861	uio->uio_td, td));
862	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
863	vp = fp->f_vnode;
864	if (vp->v_type == VREG)
865	bwillwrite();
866	ioflag = IO_UNIT;
867	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
868	ioflag \|= IO_APPEND;
869	if (fp->f_flag & FNONBLOCK)
870	ioflag \|= IO_NDELAY;
871	if (fp->f_flag & O_DIRECT)
872	ioflag \|= IO_DIRECT;
873	if ((fp->f_flag & O_FSYNC) \|\|
874	(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
875	ioflag \|= IO_SYNC;
876	mp = NULL;
877	if (vp->v_type != VCHR &&
878	(error = vn_start_write(vp, &mp, V_WAIT \| PCATCH)) != 0)
879	goto unlock;
880
881	advice = get_advice(fp, uio);
882
883	if (MNT_SHARED_WRITES(mp) \|\|
884	(mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
885	lock_flags = LK_SHARED;
886	} else {
887	lock_flags = LK_EXCLUSIVE;
888	}
889
890	vn_lock(vp, lock_flags \| LK_RETRY);
891	switch (advice) {
892	case POSIX_FADV_NORMAL:
893	case POSIX_FADV_SEQUENTIAL:
894	case POSIX_FADV_NOREUSE:
895	ioflag \|= sequential_heuristic(uio, fp);
896	break;
897	case POSIX_FADV_RANDOM:
898	/* XXX: Is this correct? */
899	break;
900	}
901	orig_offset = uio->uio_offset;
902
903	#ifdef MAC
904	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
905	if (error == 0)
906	#endif
907	error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
908	fp->f_nextoff = uio->uio_offset;
909	VOP_UNLOCK(vp, 0);
910	if (vp->v_type != VCHR)
911	vn_finished_write(mp);
912	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
913	orig_offset != uio->uio_offset)
914	/*
915	* Use POSIX_FADV_DONTNEED to flush pages and buffers
916	* for the backing file after a POSIX_FADV_NOREUSE
917	* write(2).
918	*/
919	error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
920	POSIX_FADV_DONTNEED);
921	unlock:
922	return (error);
923	}
924
925	/*
926	* The vn_io_fault() is a wrapper around vn_read() and vn_write() to
927	* prevent the following deadlock:
928	*
929	* Assume that the thread A reads from the vnode vp1 into userspace
930	* buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is
931	* currently not resident, then system ends up with the call chain
932	* vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
933	* vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
934	* which establishes lock order vp1->vn_lock, then vp2->vn_lock.
935	* If, at the same time, thread B reads from vnode vp2 into buffer buf2
936	* backed by the pages of vnode vp1, and some page in buf2 is not
937	* resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
938	*
939	* To prevent the lock order reversal and deadlock, vn_io_fault() does
940	* not allow page faults to happen during VOP_READ() or VOP_WRITE().
941	* Instead, it first tries to do the whole range i/o with pagefaults
942	* disabled. If all pages in the i/o buffer are resident and mapped,
943	* VOP will succeed (ignoring the genuine filesystem errors).
944	* Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
945	* i/o in chunks, with all pages in the chunk prefaulted and held
946	* using vm_fault_quick_hold_pages().
947	*
948	* Filesystems using this deadlock avoidance scheme should use the
949	* array of the held pages from uio, saved in the curthread->td_ma,
950	* instead of doing uiomove(). A helper function
951	* vn_io_fault_uiomove() converts uiomove request into
952	* uiomove_fromphys() over td_ma array.
953	*
954	* Since vnode locks do not cover the whole i/o anymore, rangelocks
955	* make the current i/o request atomic with respect to other i/os and
956	* truncations.
957	*/
958
959	/*
960	* Decode vn_io_fault_args and perform the corresponding i/o.
961	*/
962	static int
963	vn_io_fault_doio(struct vn_io_fault_args args, struct uio uio,
964	struct thread *td)
965	{
966	int error, save;
967
968	error = 0;
969	#ifndef __rtems__
970	save = vm_fault_disable_pagefaults();
971	#endif /* __rtems__ */
972	switch (args->kind) {
973	case VN_IO_FAULT_FOP:
974	error = (args->args.fop_args.doio)(args->args.fop_args.fp,
975	uio, args->cred, args->flags, td);
976	break;
977	case VN_IO_FAULT_VOP:
978	if (uio->uio_rw == UIO_READ) {
979	error = VOP_READ(args->args.vop_args.vp, uio,
980	args->flags, args->cred);
981	} else if (uio->uio_rw == UIO_WRITE) {
982	error = VOP_WRITE(args->args.vop_args.vp, uio,
983	args->flags, args->cred);
984	}
985	break;
986	default:
987	panic("vn_io_fault_doio: unknown kind of io %d %d",
988	args->kind, uio->uio_rw);
989	}
990	#ifndef __rtems__
991	vm_fault_enable_pagefaults(save);
992	#endif /* __rtems__ */
993	return (error);
994	}
995
996	static int
997	vn_io_fault_touch(char base, const struct uio uio)
998	{
999	#ifndef __rtems__
1000	int r;
1001
1002	r = fubyte(base);
1003	if (r == -1 \|\| (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
1004	return (EFAULT);
1005	return (0);
1006	#else /* __rtems__ */
1007	return (EFAULT);
1008	#endif /* __rtems__ */
1009	}
1010
1011	static int
1012	vn_io_fault_prefault_user(const struct uio *uio)
1013	{
1014	#ifndef __rtems__
1015	char *base;
1016	const struct iovec *iov;
1017	size_t len;
1018	ssize_t resid;
1019	int error, i;
1020
1021	KASSERT(uio->uio_segflg == UIO_USERSPACE,
1022	("vn_io_fault_prefault userspace"));
1023
1024	error = i = 0;
1025	iov = uio->uio_iov;
1026	resid = uio->uio_resid;
1027	base = iov->iov_base;
1028	len = iov->iov_len;
1029	while (resid > 0) {
1030	error = vn_io_fault_touch(base, uio);
1031	if (error != 0)
1032	break;
1033	if (len < PAGE_SIZE) {
1034	if (len != 0) {
1035	error = vn_io_fault_touch(base + len - 1, uio);
1036	if (error != 0)
1037	break;
1038	resid -= len;
1039	}
1040	if (++i >= uio->uio_iovcnt)
1041	break;
1042	iov = uio->uio_iov + i;
1043	base = iov->iov_base;
1044	len = iov->iov_len;
1045	} else {
1046	len -= PAGE_SIZE;
1047	base += PAGE_SIZE;
1048	resid -= PAGE_SIZE;
1049	}
1050	}
1051	return (error);
1052	#else /* __rtems__ */
1053	return (EFAULT);
1054	#endif /* __rtems__ */
1055	}
1056
1057	/*
1058	* Common code for vn_io_fault(), agnostic to the kind of i/o request.
1059	* Uses vn_io_fault_doio() to make the call to an actual i/o function.
1060	* Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1061	* into args and call vn_io_fault1() to handle faults during the user
1062	* mode buffer accesses.
1063	*/
1064	static int
1065	vn_io_fault1(struct vnode vp, struct uio uio, struct vn_io_fault_args *args,
1066	struct thread *td)
1067	{
1068	vm_page_t ma[io_hold_cnt + 2];
1069	struct uio *uio_clone, short_uio;
1070	struct iovec short_iovec[1];
1071	vm_page_t *prev_td_ma;
1072	vm_prot_t prot;
1073	vm_offset_t addr, end;
1074	size_t len, resid;
1075	ssize_t adv;
1076	int error, cnt, saveheld, prev_td_ma_cnt;
1077	#ifdef __rtems__
1078	struct uio uio_clone_;
1079	#endif /* __rtems__ */
1080
1081	if (vn_io_fault_prefault) {
1082	error = vn_io_fault_prefault_user(uio);
1083	if (error != 0)
1084	return (error); /* Or ignore ? */
1085	}
1086
1087	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1088
1089	/*
1090	* The UFS follows IO_UNIT directive and replays back both
1091	* uio_offset and uio_resid if an error is encountered during the
1092	* operation. But, since the iovec may be already advanced,
1093	* uio is still in an inconsistent state.
1094	*
1095	* Cache a copy of the original uio, which is advanced to the redo
1096	* point using UIO_NOCOPY below.
1097	*/
1098	#ifndef __rtems__
1099	uio_clone = cloneuio(uio);
1100	#else /* __rtems__ */
1101	uio_clone_ = *uio;
1102	uio_clone = &uio_clone_;
1103	#endif /* __rtems__ */
1104	resid = uio->uio_resid;
1105
1106	short_uio.uio_segflg = UIO_USERSPACE;
1107	short_uio.uio_rw = uio->uio_rw;
1108	short_uio.uio_td = uio->uio_td;
1109
1110	error = vn_io_fault_doio(args, uio, td);
1111	if (error != EFAULT)
1112	goto out;
1113
1114	atomic_add_long(&vn_io_faults_cnt, 1);
1115	uio_clone->uio_segflg = UIO_NOCOPY;
1116	uiomove(NULL, resid - uio->uio_resid, uio_clone);
1117	uio_clone->uio_segflg = uio->uio_segflg;
1118
1119	#ifndef __rtems__
1120	saveheld = curthread_pflags_set(TDP_UIOHELD);
1121	prev_td_ma = td->td_ma;
1122	prev_td_ma_cnt = td->td_ma_cnt;
1123	#endif /* __rtems__ */
1124
1125	while (uio_clone->uio_resid != 0) {
1126	len = uio_clone->uio_iov->iov_len;
1127	if (len == 0) {
1128	KASSERT(uio_clone->uio_iovcnt >= 1,
1129	("iovcnt underflow"));
1130	uio_clone->uio_iov++;
1131	uio_clone->uio_iovcnt--;
1132	continue;
1133	}
1134	if (len > io_hold_cnt * PAGE_SIZE)
1135	len = io_hold_cnt * PAGE_SIZE;
1136	addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1137	end = round_page(addr + len);
1138	if (end < addr) {
1139	error = EFAULT;
1140	break;
1141	}
1142	cnt = atop(end - trunc_page(addr));
1143	#ifndef __rtems__
1144	/*
1145	* A perfectly misaligned address and length could cause
1146	* both the start and the end of the chunk to use partial
1147	* page. +2 accounts for such a situation.
1148	*/
1149	cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1150	addr, len, prot, ma, io_hold_cnt + 2);
1151	if (cnt == -1) {
1152	error = EFAULT;
1153	break;
1154	}
1155	#endif /* __rtems__ */
1156	short_uio.uio_iov = &short_iovec[0];
1157	short_iovec[0].iov_base = (void *)addr;
1158	short_uio.uio_iovcnt = 1;
1159	short_uio.uio_resid = short_iovec[0].iov_len = len;
1160	short_uio.uio_offset = uio_clone->uio_offset;
1161	#ifndef __rtems__
1162	td->td_ma = ma;
1163	td->td_ma_cnt = cnt;
1164	#endif /* __rtems__ */
1165
1166	error = vn_io_fault_doio(args, &short_uio, td);
1167	#ifndef __rtems__
1168	vm_page_unhold_pages(ma, cnt);
1169	#endif /* __rtems__ */
1170	adv = len - short_uio.uio_resid;
1171
1172	uio_clone->uio_iov->iov_base =
1173	(char *)uio_clone->uio_iov->iov_base + adv;
1174	uio_clone->uio_iov->iov_len -= adv;
1175	uio_clone->uio_resid -= adv;
1176	uio_clone->uio_offset += adv;
1177
1178	uio->uio_resid -= adv;
1179	uio->uio_offset += adv;
1180
1181	if (error != 0 \|\| adv == 0)
1182	break;
1183	}
1184	#ifndef __rtems__
1185	td->td_ma = prev_td_ma;
1186	td->td_ma_cnt = prev_td_ma_cnt;
1187	curthread_pflags_restore(saveheld);
1188	#endif /* __rtems__ */
1189	out:
1190	#ifndef __rtems__
1191	free(uio_clone, M_IOV);
1192	#endif /* __rtems__ */
1193	return (error);
1194	}
1195
1196	static int
1197	vn_io_fault(struct file fp, struct uio uio, struct ucred *active_cred,
1198	int flags, struct thread *td)
1199	{
1200	#ifdef __rtems__
1201	fo_rdwr_t *doio;
1202	struct vnode *vp;
1203	void *rl_cookie;
1204	struct vn_io_fault_args args;
1205	int error;
1206
1207	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1208	vp = fp->f_vnode;
1209	foffset_lock_uio(fp, uio, flags);
1210	if (do_vn_io_fault(vp, uio)) {
1211	args.kind = VN_IO_FAULT_FOP;
1212	args.args.fop_args.fp = fp;
1213	args.args.fop_args.doio = doio;
1214	args.cred = active_cred;
1215	args.flags = flags \| FOF_OFFSET;
1216	if (uio->uio_rw == UIO_READ) {
1217	rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1218	uio->uio_offset + uio->uio_resid);
1219	} else if ((fp->f_flag & O_APPEND) != 0 \|\|
1220	(flags & FOF_OFFSET) == 0) {
1221	/* For appenders, punt and lock the whole range. */
1222	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1223	} else {
1224	rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1225	uio->uio_offset + uio->uio_resid);
1226	}
1227	error = vn_io_fault1(vp, uio, &args, td);
1228	vn_rangelock_unlock(vp, rl_cookie);
1229	} else {
1230	error = doio(fp, uio, active_cred, flags \| FOF_OFFSET, td);
1231	}
1232	foffset_unlock_uio(fp, uio, flags);
1233	return (error);
1234	#else /* __rtems__ */
1235	return (EFAULT);
1236	#endif /* __rtems__ */
1237	}
1238
1239	/*
1240	* Helper function to perform the requested uiomove operation using
1241	* the held pages for io->uio_iov[0].iov_base buffer instead of
1242	* copyin/copyout. Access to the pages with uiomove_fromphys()
1243	* instead of iov_base prevents page faults that could occur due to
1244	* pmap_collect() invalidating the mapping created by
1245	* vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1246	* object cleanup revoking the write access from page mappings.
1247	*
1248	* Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1249	* instead of plain uiomove().
1250	*/
1251	int
1252	vn_io_fault_uiomove(char data, int xfersize, struct uio uio)
1253	{
1254	#ifndef __rtems__
1255	return (EFAULT);
1256	struct uio transp_uio;
1257	struct iovec transp_iov[1];
1258	struct thread *td;
1259	size_t adv;
1260	int error, pgadv;
1261
1262	td = curthread;
1263	if ((td->td_pflags & TDP_UIOHELD) == 0 \|\|
1264	uio->uio_segflg != UIO_USERSPACE)
1265	#endif /* __rtems__ */
1266	return (uiomove(data, xfersize, uio));
1267
1268	#ifndef __rtems__
1269	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1270	transp_iov[0].iov_base = data;
1271	transp_uio.uio_iov = &transp_iov[0];
1272	transp_uio.uio_iovcnt = 1;
1273	if (xfersize > uio->uio_resid)
1274	xfersize = uio->uio_resid;
1275	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1276	transp_uio.uio_offset = 0;
1277	transp_uio.uio_segflg = UIO_SYSSPACE;
1278	/*
1279	* Since transp_iov points to data, and td_ma page array
1280	* corresponds to original uio->uio_iov, we need to invert the
1281	* direction of the i/o operation as passed to
1282	* uiomove_fromphys().
1283	*/
1284	switch (uio->uio_rw) {
1285	case UIO_WRITE:
1286	transp_uio.uio_rw = UIO_READ;
1287	break;
1288	case UIO_READ:
1289	transp_uio.uio_rw = UIO_WRITE;
1290	break;
1291	}
1292	transp_uio.uio_td = uio->uio_td;
1293	error = uiomove_fromphys(td->td_ma,
1294	((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1295	xfersize, &transp_uio);
1296	adv = xfersize - transp_uio.uio_resid;
1297	pgadv =
1298	(((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1299	(((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1300	td->td_ma += pgadv;
1301	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1302	pgadv));
1303	td->td_ma_cnt -= pgadv;
1304	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1305	uio->uio_iov->iov_len -= adv;
1306	uio->uio_resid -= adv;
1307	uio->uio_offset += adv;
1308	return (error);
1309	#endif /* __rtems__ */
1310	}
1311
1312	int
1313	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1314	struct uio *uio)
1315	{
1316	#ifndef __rtems__
1317	struct thread *td;
1318	vm_offset_t iov_base;
1319	int cnt, pgadv;
1320
1321	td = curthread;
1322	if ((td->td_pflags & TDP_UIOHELD) == 0 \|\|
1323	uio->uio_segflg != UIO_USERSPACE)
1324	return (uiomove_fromphys(ma, offset, xfersize, uio));
1325
1326	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1327	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1328	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1329	switch (uio->uio_rw) {
1330	case UIO_WRITE:
1331	pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1332	offset, cnt);
1333	break;
1334	case UIO_READ:
1335	pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1336	cnt);
1337	break;
1338	}
1339	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1340	td->td_ma += pgadv;
1341	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1342	pgadv));
1343	td->td_ma_cnt -= pgadv;
1344	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1345	uio->uio_iov->iov_len -= cnt;
1346	uio->uio_resid -= cnt;
1347	uio->uio_offset += cnt;
1348	return (0);
1349	#else /* __rtems__ */
1350	return (EFAULT);
1351	#endif /* __rtems__ */
1352	}
1353
1354
1355	/*
1356	* File table truncate routine.
1357	*/
1358	static int
1359	vn_truncate(struct file fp, off_t length, struct ucred active_cred,
1360	struct thread *td)
1361	{
1362	struct vattr vattr;
1363	struct mount *mp;
1364	struct vnode *vp;
1365	void *rl_cookie;
1366	int error;
1367
1368	vp = fp->f_vnode;
1369
1370	/*
1371	* Lock the whole range for truncation. Otherwise split i/o
1372	* might happen partly before and partly after the truncation.
1373	*/
1374	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1375	error = vn_start_write(vp, &mp, V_WAIT \| PCATCH);
1376	if (error)
1377	goto out1;
1378	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
1379	AUDIT_ARG_VNODE1(vp);
1380	if (vp->v_type == VDIR) {
1381	error = EISDIR;
1382	goto out;
1383	}
1384	#ifdef MAC
1385	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1386	if (error)
1387	goto out;
1388	#endif
1389	error = VOP_ADD_WRITECOUNT(vp, 1);
1390	if (error == 0) {
1391	VATTR_NULL(&vattr);
1392	vattr.va_size = length;
1393	if ((fp->f_flag & O_FSYNC) != 0)
1394	vattr.va_vaflags \|= VA_SYNC;
1395	error = VOP_SETATTR(vp, &vattr, fp->f_cred);
1396	VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
1397	}
1398	out:
1399	VOP_UNLOCK(vp, 0);
1400	vn_finished_write(mp);
1401	out1:
1402	vn_rangelock_unlock(vp, rl_cookie);
1403	return (error);
1404	}
1405
1406	/*
1407	* File table vnode stat routine.
1408	*/
1409	static int
1410	vn_statfile(struct file fp, struct stat sb, struct ucred *active_cred,
1411	struct thread *td)
1412	{
1413	struct vnode *vp = fp->f_vnode;
1414	int error;
1415
1416	vn_lock(vp, LK_SHARED \| LK_RETRY);
1417	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
1418	VOP_UNLOCK(vp, 0);
1419
1420	return (error);
1421	}
1422
1423	/*
1424	* Stat a vnode; implementation for the stat syscall
1425	*/
1426	int
1427	vn_stat(struct vnode vp, struct stat sb, struct ucred *active_cred,
1428	struct ucred file_cred, struct thread td)
1429	{
1430	struct vattr vattr;
1431	struct vattr *vap;
1432	int error;
1433	u_short mode;
1434
1435	AUDIT_ARG_VNODE1(vp);
1436	#ifdef MAC
1437	error = mac_vnode_check_stat(active_cred, file_cred, vp);
1438	if (error)
1439	return (error);
1440	#endif
1441
1442	vap = &vattr;
1443
1444	/*
1445	* Initialize defaults for new and unusual fields, so that file
1446	* systems which don't support these fields don't need to know
1447	* about them.
1448	*/
1449	vap->va_birthtime.tv_sec = -1;
1450	vap->va_birthtime.tv_nsec = 0;
1451	vap->va_fsid = VNOVAL;
1452	vap->va_rdev = NODEV;
1453
1454	error = VOP_GETATTR(vp, vap, active_cred);
1455	if (error)
1456	return (error);
1457
1458	/*
1459	* Zero the spare stat fields
1460	*/
1461	bzero(sb, sizeof *sb);
1462
1463	/*
1464	* Copy from vattr table
1465	*/
1466	if (vap->va_fsid != VNOVAL)
1467	sb->st_dev = vap->va_fsid;
1468	else
1469	sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1470	sb->st_ino = vap->va_fileid;
1471	mode = vap->va_mode;
1472	switch (vap->va_type) {
1473	case VREG:
1474	mode \|= S_IFREG;
1475	break;
1476	case VDIR:
1477	mode \|= S_IFDIR;
1478	break;
1479	case VBLK:
1480	mode \|= S_IFBLK;
1481	break;
1482	case VCHR:
1483	mode \|= S_IFCHR;
1484	break;
1485	case VLNK:
1486	mode \|= S_IFLNK;
1487	break;
1488	case VSOCK:
1489	mode \|= S_IFSOCK;
1490	break;
1491	case VFIFO:
1492	mode \|= S_IFIFO;
1493	break;
1494	default:
1495	return (EBADF);
1496	}
1497	sb->st_mode = mode;
1498	sb->st_nlink = vap->va_nlink;
1499	sb->st_uid = vap->va_uid;
1500	sb->st_gid = vap->va_gid;
1501	sb->st_rdev = vap->va_rdev;
1502	if (vap->va_size > OFF_MAX)
1503	return (EOVERFLOW);
1504	sb->st_size = vap->va_size;
1505	sb->st_atim = vap->va_atime;
1506	sb->st_mtim = vap->va_mtime;
1507	sb->st_ctim = vap->va_ctime;
1508	#ifndef __rtems__
1509	sb->st_birthtim = vap->va_birthtime;
1510	#endif /* __rtems__ */
1511
1512	/*
1513	* According to www.opengroup.org, the meaning of st_blksize is
1514	* "a filesystem-specific preferred I/O block size for this
1515	* object. In some filesystem types, this may vary from file
1516	* to file"
1517	* Use miminum/default of PAGE_SIZE (e.g. for VCHR).
1518	*/
1519
1520	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
1521
1522	#ifndef __rtems__
1523	sb->st_flags = vap->va_flags;
1524	if (priv_check(td, PRIV_VFS_GENERATION))
1525	sb->st_gen = 0;
1526	else
1527	sb->st_gen = vap->va_gen;
1528	#endif /* __rtems__ */
1529
1530	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1531	return (0);
1532	}
1533
1534	/*
1535	* File table vnode ioctl routine.
1536	*/
1537	static int
1538	vn_ioctl(struct file fp, u_long com, void data, struct ucred *active_cred,
1539	struct thread *td)
1540	{
1541	struct vattr vattr;
1542	struct vnode *vp;
1543	struct fiobmap2_arg *bmarg;
1544	int error;
1545
1546	vp = fp->f_vnode;
1547	switch (vp->v_type) {
1548	case VDIR:
1549	case VREG:
1550	switch (com) {
1551	case FIONREAD:
1552	vn_lock(vp, LK_SHARED \| LK_RETRY);
1553	error = VOP_GETATTR(vp, &vattr, active_cred);
1554	VOP_UNLOCK(vp, 0);
1555	if (error == 0)
1556	(int )data = vattr.va_size - fp->f_offset;
1557	return (error);
1558	#ifndef __rtems__
1559	case FIOBMAP2:
1560	bmarg = (struct fiobmap2_arg *)data;
1561	vn_lock(vp, LK_SHARED \| LK_RETRY);
1562	#ifdef MAC
1563	error = mac_vnode_check_read(active_cred, fp->f_cred,
1564	vp);
1565	if (error == 0)
1566	#endif
1567	error = VOP_BMAP(vp, bmarg->bn, NULL,
1568	&bmarg->bn, &bmarg->runp, &bmarg->runb);
1569	VOP_UNLOCK(vp, 0);
1570	return (error);
1571	#endif /* __rtems__ */
1572	case FIONBIO:
1573	case FIOASYNC:
1574	return (0);
1575	default:
1576	return (VOP_IOCTL(vp, com, data, fp->f_flag,
1577	active_cred, td));
1578	}
1579	break;
1580	case VCHR:
1581	return (VOP_IOCTL(vp, com, data, fp->f_flag,
1582	active_cred, td));
1583	default:
1584	return (ENOTTY);
1585	}
1586	}
1587
1588	/*
1589	* File table vnode poll routine.
1590	*/
1591	static int
1592	vn_poll(struct file fp, int events, struct ucred active_cred,
1593	struct thread *td)
1594	{
1595	struct vnode *vp;
1596	int error;
1597
1598	vp = fp->f_vnode;
1599	#ifdef MAC
1600	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
1601	AUDIT_ARG_VNODE1(vp);
1602	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1603	VOP_UNLOCK(vp, 0);
1604	if (!error)
1605	#endif
1606
1607	error = VOP_POLL(vp, events, fp->f_cred, td);
1608	return (error);
1609	}
1610
1611	/*
1612	* Acquire the requested lock and then check for validity. LK_RETRY
1613	* permits vn_lock to return doomed vnodes.
1614	*/
1615	int
1616	_vn_lock(struct vnode vp, int flags, char file, int line)
1617	{
1618	int error;
1619
1620	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1621	("vn_lock: no locktype"));
1622	VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count"));
1623	retry:
1624	error = VOP_LOCK1(vp, flags, file, line);
1625	flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
1626	KASSERT((flags & LK_RETRY) == 0 \|\| error == 0,
1627	("vn_lock: error %d incompatible with flags %#x", error, flags));
1628
1629	if ((flags & LK_RETRY) == 0) {
1630	if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
1631	VOP_UNLOCK(vp, 0);
1632	error = ENOENT;
1633	}
1634	} else if (error != 0)
1635	goto retry;
1636	return (error);
1637	}
1638
1639	/*
1640	* File table vnode close routine.
1641	*/
1642	static int
1643	vn_closefile(struct file fp, struct thread td)
1644	{
1645	struct vnode *vp;
1646	struct flock lf;
1647	int error;
1648	bool ref;
1649
1650	vp = fp->f_vnode;
1651	fp->f_ops = &badfileops;
1652	#ifndef __rtems__
1653	ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1654	#else /* __rtems__ */
1655	ref = false;
1656	#endif /* __rtems__ */
1657
1658	error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1659
1660	if (__predict_false(ref)) {
1661	lf.l_whence = SEEK_SET;
1662	lf.l_start = 0;
1663	lf.l_len = 0;
1664	lf.l_type = F_UNLCK;
1665	(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1666	vrele(vp);
1667	}
1668	return (error);
1669	}
1670
1671	static bool
1672	vn_suspendable(struct mount *mp)
1673	{
1674
1675	return (mp->mnt_op->vfs_susp_clean != NULL);
1676	}
1677
1678	/*
1679	* Preparing to start a filesystem write operation. If the operation is
1680	* permitted, then we bump the count of operations in progress and
1681	* proceed. If a suspend request is in progress, we wait until the
1682	* suspension is over, and then proceed.
1683	*/
1684	static int
1685	vn_start_write_locked(struct mount *mp, int flags)
1686	{
1687	int error, mflags;
1688
1689	mtx_assert(MNT_MTX(mp), MA_OWNED);
1690	error = 0;
1691
1692	/*
1693	* Check on status of suspension.
1694	*/
1695	#ifndef __rtems__
1696	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 \|\|
1697	#else /* __rtems__ */
1698	if (
1699	#endif /* __rtems__ */
1700	mp->mnt_susp_owner != curthread) {
1701	mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1702	(flags & PCATCH) : 0) \| (PUSER - 1);
1703	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1704	if (flags & V_NOWAIT) {
1705	error = EWOULDBLOCK;
1706	goto unlock;
1707	}
1708	error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1709	"suspfs", 0);
1710	if (error)
1711	goto unlock;
1712	}
1713	}
1714	if (flags & V_XSLEEP)
1715	goto unlock;
1716	mp->mnt_writeopcount++;
1717	unlock:
1718	if (error != 0 \|\| (flags & V_XSLEEP) != 0)
1719	MNT_REL(mp);
1720	MNT_IUNLOCK(mp);
1721	return (error);
1722	}
1723
1724	int
1725	vn_start_write(struct vnode vp, struct mount *mpp, int flags)
1726	{
1727	struct mount *mp;
1728	int error;
1729
1730	KASSERT((flags & V_MNTREF) == 0 \|\| (*mpp != NULL && vp == NULL),
1731	("V_MNTREF requires mp"));
1732
1733	error = 0;
1734	/*
1735	* If a vnode is provided, get and return the mount point that
1736	* to which it will write.
1737	*/
1738	if (vp != NULL) {
1739	if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1740	*mpp = NULL;
1741	if (error != EOPNOTSUPP)
1742	return (error);
1743	return (0);
1744	}
1745	}
1746	if ((mp = *mpp) == NULL)
1747	return (0);
1748
1749	if (!vn_suspendable(mp)) {
1750	if (vp != NULL \|\| (flags & V_MNTREF) != 0)
1751	vfs_rel(mp);
1752	return (0);
1753	}
1754
1755	/*
1756	* VOP_GETWRITEMOUNT() returns with the mp refcount held through
1757	* a vfs_ref().
1758	* As long as a vnode is not provided we need to acquire a
1759	* refcount for the provided mountpoint too, in order to
1760	* emulate a vfs_ref().
1761	*/
1762	MNT_ILOCK(mp);
1763	if (vp == NULL && (flags & V_MNTREF) == 0)
1764	MNT_REF(mp);
1765
1766	return (vn_start_write_locked(mp, flags));
1767	}
1768
1769	/*
1770	* Secondary suspension. Used by operations such as vop_inactive
1771	* routines that are needed by the higher level functions. These
1772	* are allowed to proceed until all the higher level functions have
1773	* completed (indicated by mnt_writeopcount dropping to zero). At that
1774	* time, these operations are halted until the suspension is over.
1775	*/
1776	int
1777	vn_start_secondary_write(struct vnode vp, struct mount *mpp, int flags)
1778	{
1779	struct mount *mp;
1780	int error;
1781
1782	KASSERT((flags & V_MNTREF) == 0 \|\| (*mpp != NULL && vp == NULL),
1783	("V_MNTREF requires mp"));
1784
1785	retry:
1786	if (vp != NULL) {
1787	if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1788	*mpp = NULL;
1789	if (error != EOPNOTSUPP)
1790	return (error);
1791	return (0);
1792	}
1793	}
1794	/*
1795	* If we are not suspended or have not yet reached suspended
1796	* mode, then let the operation proceed.
1797	*/
1798	if ((mp = *mpp) == NULL)
1799	return (0);
1800
1801	if (!vn_suspendable(mp)) {
1802	if (vp != NULL \|\| (flags & V_MNTREF) != 0)
1803	vfs_rel(mp);
1804	return (0);
1805	}
1806
1807	/*
1808	* VOP_GETWRITEMOUNT() returns with the mp refcount held through
1809	* a vfs_ref().
1810	* As long as a vnode is not provided we need to acquire a
1811	* refcount for the provided mountpoint too, in order to
1812	* emulate a vfs_ref().
1813	*/
1814	MNT_ILOCK(mp);
1815	if (vp == NULL && (flags & V_MNTREF) == 0)
1816	MNT_REF(mp);
1817	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED \| MNTK_SUSPEND2)) == 0) {
1818	mp->mnt_secondary_writes++;
1819	mp->mnt_secondary_accwrites++;
1820	MNT_IUNLOCK(mp);
1821	return (0);
1822	}
1823	if (flags & V_NOWAIT) {
1824	MNT_REL(mp);
1825	MNT_IUNLOCK(mp);
1826	return (EWOULDBLOCK);
1827	}
1828	/*
1829	* Wait for the suspension to finish.
1830	*/
1831	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) \| PDROP \|
1832	((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
1833	"suspfs", 0);
1834	vfs_rel(mp);
1835	if (error == 0)
1836	goto retry;
1837	return (error);
1838	}
1839
1840	/*
1841	* Filesystem write operation has completed. If we are suspending and this
1842	* operation is the last one, notify the suspender that the suspension is
1843	* now in effect.
1844	*/
1845	void
1846	vn_finished_write(struct mount *mp)
1847	{
1848	if (mp == NULL \|\| !vn_suspendable(mp))
1849	return;
1850	MNT_ILOCK(mp);
1851	MNT_REL(mp);
1852	mp->mnt_writeopcount--;
1853	if (mp->mnt_writeopcount < 0)
1854	panic("vn_finished_write: neg cnt");
1855	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1856	mp->mnt_writeopcount <= 0)
1857	wakeup(&mp->mnt_writeopcount);
1858	MNT_IUNLOCK(mp);
1859	}
1860
1861
1862	/*
1863	* Filesystem secondary write operation has completed. If we are
1864	* suspending and this operation is the last one, notify the suspender
1865	* that the suspension is now in effect.
1866	*/
1867	void
1868	vn_finished_secondary_write(struct mount *mp)
1869	{
1870	if (mp == NULL \|\| !vn_suspendable(mp))
1871	return;
1872	MNT_ILOCK(mp);
1873	MNT_REL(mp);
1874	mp->mnt_secondary_writes--;
1875	if (mp->mnt_secondary_writes < 0)
1876	panic("vn_finished_secondary_write: neg cnt");
1877	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1878	mp->mnt_secondary_writes <= 0)
1879	wakeup(&mp->mnt_secondary_writes);
1880	MNT_IUNLOCK(mp);
1881	}
1882
1883
1884
1885	/*
1886	* Request a filesystem to suspend write operations.
1887	*/
1888	int
1889	vfs_write_suspend(struct mount *mp, int flags)
1890	{
1891	int error;
1892
1893	MPASS(vn_suspendable(mp));
1894
1895	MNT_ILOCK(mp);
1896	if (mp->mnt_susp_owner == curthread) {
1897	MNT_IUNLOCK(mp);
1898	return (EALREADY);
1899	}
1900	while (mp->mnt_kern_flag & MNTK_SUSPEND)
1901	msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1902
1903	/*
1904	* Unmount holds a write reference on the mount point. If we
1905	* own busy reference and drain for writers, we deadlock with
1906	* the reference draining in the unmount path. Callers of
1907	* vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
1908	* vfs_busy() reference is owned and caller is not in the
1909	* unmount context.
1910	*/
1911	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
1912	(mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
1913	MNT_IUNLOCK(mp);
1914	return (EBUSY);
1915	}
1916
1917	mp->mnt_kern_flag \|= MNTK_SUSPEND;
1918	mp->mnt_susp_owner = curthread;
1919	if (mp->mnt_writeopcount > 0)
1920	(void) msleep(&mp->mnt_writeopcount,
1921	MNT_MTX(mp), (PUSER - 1)\|PDROP, "suspwt", 0);
1922	else
1923	MNT_IUNLOCK(mp);
1924	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1925	vfs_write_resume(mp, 0);
1926	return (error);
1927	}
1928
1929	/*
1930	* Request a filesystem to resume write operations.
1931	*/
1932	void
1933	vfs_write_resume(struct mount *mp, int flags)
1934	{
1935
1936	MPASS(vn_suspendable(mp));
1937
1938	MNT_ILOCK(mp);
1939	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1940	KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1941	mp->mnt_kern_flag &= ~(MNTK_SUSPEND \| MNTK_SUSPEND2 \|
1942	MNTK_SUSPENDED);
1943	mp->mnt_susp_owner = NULL;
1944	wakeup(&mp->mnt_writeopcount);
1945	wakeup(&mp->mnt_flag);
1946	#ifndef __rtems__
1947	curthread->td_pflags &= ~TDP_IGNSUSP;
1948	#endif /* __rtems__ */
1949	if ((flags & VR_START_WRITE) != 0) {
1950	MNT_REF(mp);
1951	mp->mnt_writeopcount++;
1952	}
1953	MNT_IUNLOCK(mp);
1954	if ((flags & VR_NO_SUSPCLR) == 0)
1955	VFS_SUSP_CLEAN(mp);
1956	} else if ((flags & VR_START_WRITE) != 0) {
1957	MNT_REF(mp);
1958	vn_start_write_locked(mp, 0);
1959	} else {
1960	MNT_IUNLOCK(mp);
1961	}
1962	}
1963
1964	/*
1965	* Helper loop around vfs_write_suspend() for filesystem unmount VFS
1966	* methods.
1967	*/
1968	int
1969	vfs_write_suspend_umnt(struct mount *mp)
1970	{
1971	int error;
1972
1973	MPASS(vn_suspendable(mp));
1974	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
1975	("vfs_write_suspend_umnt: recursed"));
1976
1977	/* dounmount() already called vn_start_write(). */
1978	for (;;) {
1979	vn_finished_write(mp);
1980	error = vfs_write_suspend(mp, 0);
1981	if (error != 0) {
1982	vn_start_write(NULL, &mp, V_WAIT);
1983	return (error);
1984	}
1985	MNT_ILOCK(mp);
1986	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
1987	break;
1988	MNT_IUNLOCK(mp);
1989	vn_start_write(NULL, &mp, V_WAIT);
1990	}
1991	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED \| MNTK_SUSPEND2);
1992	wakeup(&mp->mnt_flag);
1993	MNT_IUNLOCK(mp);
1994	#ifndef __rtems__
1995	curthread->td_pflags \|= TDP_IGNSUSP;
1996	#endif /* __rtems__ */
1997	return (0);
1998	}
1999
2000	/*
2001	* Implement kqueues for files by translating it to vnode operation.
2002	*/
2003	static int
2004	vn_kqfilter(struct file fp, struct knote kn)
2005	{
2006
2007	return (VOP_KQFILTER(fp->f_vnode, kn));
2008	}
2009
2010	/*
2011	* Simplified in-kernel wrapper calls for extended attribute access.
2012	* Both calls pass in a NULL credential, authorizing as "kernel" access.
2013	* Set IO_NODELOCKED in ioflg if the vnode is already locked.
2014	*/
2015	int
2016	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
2017	const char attrname, int buflen, char buf, struct thread td)
2018	{
2019	struct uio auio;
2020	struct iovec iov;
2021	int error;
2022
2023	iov.iov_len = *buflen;
2024	iov.iov_base = buf;
2025
2026	auio.uio_iov = &iov;
2027	auio.uio_iovcnt = 1;
2028	auio.uio_rw = UIO_READ;
2029	auio.uio_segflg = UIO_SYSSPACE;
2030	auio.uio_td = td;
2031	auio.uio_offset = 0;
2032	auio.uio_resid = *buflen;
2033
2034	if ((ioflg & IO_NODELOCKED) == 0)
2035	vn_lock(vp, LK_SHARED \| LK_RETRY);
2036
2037	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2038
2039	/* authorize attribute retrieval as kernel */
2040	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
2041	td);
2042
2043	if ((ioflg & IO_NODELOCKED) == 0)
2044	VOP_UNLOCK(vp, 0);
2045
2046	if (error == 0) {
2047	buflen = buflen - auio.uio_resid;
2048	}
2049
2050	return (error);
2051	}
2052
2053	/*
2054	* XXX failure mode if partially written?
2055	*/
2056	int
2057	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
2058	const char attrname, int buflen, char buf, struct thread *td)
2059	{
2060	struct uio auio;
2061	struct iovec iov;
2062	struct mount *mp;
2063	int error;
2064
2065	iov.iov_len = buflen;
2066	iov.iov_base = buf;
2067
2068	auio.uio_iov = &iov;
2069	auio.uio_iovcnt = 1;
2070	auio.uio_rw = UIO_WRITE;
2071	auio.uio_segflg = UIO_SYSSPACE;
2072	auio.uio_td = td;
2073	auio.uio_offset = 0;
2074	auio.uio_resid = buflen;
2075
2076	if ((ioflg & IO_NODELOCKED) == 0) {
2077	if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2078	return (error);
2079	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
2080	}
2081
2082	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2083
2084	/* authorize attribute setting as kernel */
2085	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2086
2087	if ((ioflg & IO_NODELOCKED) == 0) {
2088	vn_finished_write(mp);
2089	VOP_UNLOCK(vp, 0);
2090	}
2091
2092	return (error);
2093	}
2094
2095	int
2096	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2097	const char attrname, struct thread td)
2098	{
2099	struct mount *mp;
2100	int error;
2101
2102	if ((ioflg & IO_NODELOCKED) == 0) {
2103	if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2104	return (error);
2105	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
2106	}
2107
2108	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2109
2110	/* authorize attribute removal as kernel */
2111	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2112	if (error == EOPNOTSUPP)
2113	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2114	NULL, td);
2115
2116	if ((ioflg & IO_NODELOCKED) == 0) {
2117	vn_finished_write(mp);
2118	VOP_UNLOCK(vp, 0);
2119	}
2120
2121	return (error);
2122	}
2123
2124	static int
2125	vn_get_ino_alloc_vget(struct mount mp, void arg, int lkflags,
2126	struct vnode **rvp)
2127	{
2128
2129	return (VFS_VGET(mp, (ino_t )arg, lkflags, rvp));
2130	}
2131
2132	int
2133	vn_vget_ino(struct vnode vp, ino_t ino, int lkflags, struct vnode *rvp)
2134	{
2135
2136	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2137	lkflags, rvp));
2138	}
2139
2140	int
2141	vn_vget_ino_gen(struct vnode vp, vn_get_ino_t alloc, void alloc_arg,
2142	int lkflags, struct vnode **rvp)
2143	{
2144	struct mount *mp;
2145	int ltype, error;
2146
2147	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
2148	mp = vp->v_mount;
2149	ltype = VOP_ISLOCKED(vp);
2150	KASSERT(ltype == LK_EXCLUSIVE \|\| ltype == LK_SHARED,
2151	("vn_vget_ino: vp not locked"));
2152	error = vfs_busy(mp, MBF_NOWAIT);
2153	if (error != 0) {
2154	vfs_ref(mp);
2155	VOP_UNLOCK(vp, 0);
2156	error = vfs_busy(mp, 0);
2157	vn_lock(vp, ltype \| LK_RETRY);
2158	vfs_rel(mp);
2159	if (error != 0)
2160	return (ENOENT);
2161	if (vp->v_iflag & VI_DOOMED) {
2162	vfs_unbusy(mp);
2163	return (ENOENT);
2164	}
2165	}
2166	VOP_UNLOCK(vp, 0);
2167	error = alloc(mp, alloc_arg, lkflags, rvp);
2168	vfs_unbusy(mp);
2169	if (error != 0 \|\| *rvp != vp)
2170	vn_lock(vp, ltype \| LK_RETRY);
2171	if (vp->v_iflag & VI_DOOMED) {
2172	if (error == 0) {
2173	if (*rvp == vp)
2174	vunref(vp);
2175	else
2176	vput(*rvp);
2177	}
2178	error = ENOENT;
2179	}
2180	return (error);
2181	}
2182
2183	int
2184	vn_rlimit_fsize(const struct vnode vp, const struct uio uio,
2185	struct thread *td)
2186	{
2187
2188	if (vp->v_type != VREG \|\| td == NULL)
2189	return (0);
2190	if ((uoff_t)uio->uio_offset + uio->uio_resid >
2191	lim_cur(td, RLIMIT_FSIZE)) {
2192	#ifndef __rtems__
2193	PROC_LOCK(td->td_proc);
2194	kern_psignal(td->td_proc, SIGXFSZ);
2195	PROC_UNLOCK(td->td_proc);
2196	#endif /* __rtems__ */
2197	return (EFBIG);
2198	}
2199	return (0);
2200	}
2201
2202	int
2203	vn_chmod(struct file fp, mode_t mode, struct ucred active_cred,
2204	struct thread *td)
2205	{
2206	struct vnode *vp;
2207
2208	vp = fp->f_vnode;
2209	#ifdef AUDIT
2210	vn_lock(vp, LK_SHARED \| LK_RETRY);
2211	AUDIT_ARG_VNODE1(vp);
2212	VOP_UNLOCK(vp, 0);
2213	#endif
2214	return (setfmode(td, active_cred, vp, mode));
2215	}
2216
2217	int
2218	vn_chown(struct file fp, uid_t uid, gid_t gid, struct ucred active_cred,
2219	struct thread *td)
2220	{
2221	struct vnode *vp;
2222
2223	vp = fp->f_vnode;
2224	#ifdef AUDIT
2225	vn_lock(vp, LK_SHARED \| LK_RETRY);
2226	AUDIT_ARG_VNODE1(vp);
2227	VOP_UNLOCK(vp, 0);
2228	#endif
2229	return (setfown(td, active_cred, vp, uid, gid));
2230	}
2231
2232	void
2233	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2234	{
2235	#ifndef __rtems__
2236	vm_object_t object;
2237
2238	if ((object = vp->v_object) == NULL)
2239	return;
2240	VM_OBJECT_WLOCK(object);
2241	vm_object_page_remove(object, start, end, 0);
2242	VM_OBJECT_WUNLOCK(object);
2243	#endif /* __rtems__ */
2244	}
2245
2246	int
2247	vn_bmap_seekhole(struct vnode vp, u_long cmd, off_t off, struct ucred *cred)
2248	{
2249	struct vattr va;
2250	daddr_t bn, bnp;
2251	uint64_t bsize;
2252	off_t noff;
2253	int error;
2254
2255	KASSERT(cmd == FIOSEEKHOLE \|\| cmd == FIOSEEKDATA,
2256	("Wrong command %lu", cmd));
2257
2258	if (vn_lock(vp, LK_SHARED) != 0)
2259	return (EBADF);
2260	if (vp->v_type != VREG) {
2261	error = ENOTTY;
2262	goto unlock;
2263	}
2264	error = VOP_GETATTR(vp, &va, cred);
2265	if (error != 0)
2266	goto unlock;
2267	noff = *off;
2268	if (noff >= va.va_size) {
2269	error = ENXIO;
2270	goto unlock;
2271	}
2272	bsize = vp->v_mount->mnt_stat.f_iosize;
2273	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
2274	noff % bsize) {
2275	error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2276	if (error == EOPNOTSUPP) {
2277	error = ENOTTY;
2278	goto unlock;
2279	}
2280	if ((bnp == -1 && cmd == FIOSEEKHOLE) \|\|
2281	(bnp != -1 && cmd == FIOSEEKDATA)) {
2282	noff = bn * bsize;
2283	if (noff < *off)
2284	noff = *off;
2285	goto unlock;
2286	}
2287	}
2288	if (noff > va.va_size)
2289	noff = va.va_size;
2290	/* noff == va.va_size. There is an implicit hole at the end of file. */
2291	if (cmd == FIOSEEKDATA)
2292	error = ENXIO;
2293	unlock:
2294	VOP_UNLOCK(vp, 0);
2295	if (error == 0)
2296	*off = noff;
2297	return (error);
2298	}
2299
2300	int
2301	vn_seek(struct file fp, off_t offset, int whence, struct thread td)
2302	{
2303	struct ucred *cred;
2304	struct vnode *vp;
2305	struct vattr vattr;
2306	off_t foffset, size;
2307	int error, noneg;
2308
2309	cred = td->td_ucred;
2310	vp = fp->f_vnode;
2311	foffset = foffset_lock(fp, 0);
2312	noneg = (vp->v_type != VCHR);
2313	error = 0;
2314	switch (whence) {
2315	case L_INCR:
2316	if (noneg &&
2317	(foffset < 0 \|\|
2318	(offset > 0 && foffset > OFF_MAX - offset))) {
2319	error = EOVERFLOW;
2320	break;
2321	}
2322	offset += foffset;
2323	break;
2324	case L_XTND:
2325	vn_lock(vp, LK_SHARED \| LK_RETRY);
2326	error = VOP_GETATTR(vp, &vattr, cred);
2327	VOP_UNLOCK(vp, 0);
2328	if (error)
2329	break;
2330
2331	/*
2332	* If the file references a disk device, then fetch
2333	* the media size and use that to determine the ending
2334	* offset.
2335	*/
2336	if (vattr.va_size == 0 && vp->v_type == VCHR &&
2337	fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2338	vattr.va_size = size;
2339	if (noneg &&
2340	(vattr.va_size > OFF_MAX \|\|
2341	(offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2342	error = EOVERFLOW;
2343	break;
2344	}
2345	offset += vattr.va_size;
2346	break;
2347	case L_SET:
2348	break;
2349	case SEEK_DATA:
2350	error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2351	break;
2352	case SEEK_HOLE:
2353	error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2354	break;
2355	default:
2356	error = EINVAL;
2357	}
2358	if (error == 0 && noneg && offset < 0)
2359	error = EINVAL;
2360	if (error != 0)
2361	goto drop;
2362	VFS_KNOTE_UNLOCKED(vp, 0);
2363	td->td_uretoff.tdu_off = offset;
2364	drop:
2365	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2366	return (error);
2367	}
2368
2369	int
2370	vn_utimes_perm(struct vnode vp, struct vattr vap, struct ucred *cred,
2371	struct thread *td)
2372	{
2373	int error;
2374
2375	/*
2376	* Grant permission if the caller is the owner of the file, or
2377	* the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2378	* on the file. If the time pointer is null, then write
2379	* permission on the file is also sufficient.
2380	*
2381	* From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2382	* A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2383	* will be allowed to set the times [..] to the current
2384	* server time.
2385	*/
2386	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2387	if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2388	error = VOP_ACCESS(vp, VWRITE, cred, td);
2389	return (error);
2390	}
2391
2392	int
2393	vn_fill_kinfo(struct file fp, struct kinfo_file kif, struct filedesc *fdp)
2394	{
2395	struct vnode *vp;
2396	int error;
2397
2398	if (fp->f_type == DTYPE_FIFO)
2399	kif->kf_type = KF_TYPE_FIFO;
2400	else
2401	kif->kf_type = KF_TYPE_VNODE;
2402	vp = fp->f_vnode;
2403	vref(vp);
2404	FILEDESC_SUNLOCK(fdp);
2405	error = vn_fill_kinfo_vnode(vp, kif);
2406	vrele(vp);
2407	FILEDESC_SLOCK(fdp);
2408	return (error);
2409	}
2410
2411	static inline void
2412	vn_fill_junk(struct kinfo_file *kif)
2413	{
2414	size_t len, olen;
2415
2416	/*
2417	* Simulate vn_fullpath returning changing values for a given
2418	* vp during e.g. coredump.
2419	*/
2420	len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2421	olen = strlen(kif->kf_path);
2422	if (len < olen)
2423	strcpy(&kif->kf_path[len - 1], "$");
2424	else
2425	for (; olen < len; olen++)
2426	strcpy(&kif->kf_path[olen], "A");
2427	}
2428
2429	int
2430	vn_fill_kinfo_vnode(struct vnode vp, struct kinfo_file kif)
2431	{
2432	struct vattr va;
2433	char fullpath, freepath;
2434	int error;
2435
2436	kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
2437	freepath = NULL;
2438	fullpath = "-";
2439	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
2440	if (error == 0) {
2441	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2442	}
2443	if (freepath != NULL)
2444	free(freepath, M_TEMP);
2445
2446	KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2447	vn_fill_junk(kif);
2448	);
2449
2450	/*
2451	* Retrieve vnode attributes.
2452	*/
2453	va.va_fsid = VNOVAL;
2454	va.va_rdev = NODEV;
2455	vn_lock(vp, LK_SHARED \| LK_RETRY);
2456	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2457	VOP_UNLOCK(vp, 0);
2458	if (error != 0)
2459	return (error);
2460	if (va.va_fsid != VNOVAL)
2461	kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2462	else
2463	kif->kf_un.kf_file.kf_file_fsid =
2464	vp->v_mount->mnt_stat.f_fsid.val[0];
2465	kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
2466	kif->kf_un.kf_file.kf_file_fsid; /* truncate */
2467	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2468	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2469	kif->kf_un.kf_file.kf_file_size = va.va_size;
2470	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2471	kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
2472	kif->kf_un.kf_file.kf_file_rdev; /* truncate */
2473	return (0);
2474	}
2475
2476	#ifndef __rtems__
2477	int
2478	vn_mmap(struct file fp, vm_map_t map, vm_offset_t addr, vm_size_t size,
2479	vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2480	struct thread *td)
2481	{
2482	#ifdef HWPMC_HOOKS
2483	struct pmckern_map_in pkm;
2484	#endif
2485	struct mount *mp;
2486	struct vnode *vp;
2487	vm_object_t object;
2488	vm_prot_t maxprot;
2489	boolean_t writecounted;
2490	int error;
2491
2492	#if defined(COMPAT_FREEBSD7) \|\| defined(COMPAT_FREEBSD6) \|\| \
2493	defined(COMPAT_FREEBSD5) \|\| defined(COMPAT_FREEBSD4)
2494	/*
2495	* POSIX shared-memory objects are defined to have
2496	* kernel persistence, and are not defined to support
2497	* read(2)/write(2) -- or even open(2). Thus, we can
2498	* use MAP_ASYNC to trade on-disk coherence for speed.
2499	* The shm_open(3) library routine turns on the FPOSIXSHM
2500	* flag to request this behavior.
2501	*/
2502	if ((fp->f_flag & FPOSIXSHM) != 0)
2503	flags \|= MAP_NOSYNC;
2504	#endif
2505	vp = fp->f_vnode;
2506
2507	/*
2508	* Ensure that file and memory protections are
2509	* compatible. Note that we only worry about
2510	* writability if mapping is shared; in this case,
2511	* current and max prot are dictated by the open file.
2512	* XXX use the vnode instead? Problem is: what
2513	* credentials do we use for determination? What if
2514	* proc does a setuid?
2515	*/
2516	mp = vp->v_mount;
2517	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
2518	maxprot = VM_PROT_NONE;
2519	if ((prot & VM_PROT_EXECUTE) != 0)
2520	return (EACCES);
2521	} else
2522	maxprot = VM_PROT_EXECUTE;
2523	if ((fp->f_flag & FREAD) != 0)
2524	maxprot \|= VM_PROT_READ;
2525	else if ((prot & VM_PROT_READ) != 0)
2526	return (EACCES);
2527
2528	/*
2529	* If we are sharing potential changes via MAP_SHARED and we
2530	* are trying to get write permission although we opened it
2531	* without asking for it, bail out.
2532	*/
2533	if ((flags & MAP_SHARED) != 0) {
2534	if ((fp->f_flag & FWRITE) != 0)
2535	maxprot \|= VM_PROT_WRITE;
2536	else if ((prot & VM_PROT_WRITE) != 0)
2537	return (EACCES);
2538	} else {
2539	maxprot \|= VM_PROT_WRITE;
2540	cap_maxprot \|= VM_PROT_WRITE;
2541	}
2542	maxprot &= cap_maxprot;
2543
2544	/*
2545	* For regular files and shared memory, POSIX requires that
2546	* the value of foff be a legitimate offset within the data
2547	* object. In particular, negative offsets are invalid.
2548	* Blocking negative offsets and overflows here avoids
2549	* possible wraparound or user-level access into reserved
2550	* ranges of the data object later. In contrast, POSIX does
2551	* not dictate how offsets are used by device drivers, so in
2552	* the case of a device mapping a negative offset is passed
2553	* on.
2554	*/
2555	if (
2556	#ifdef _LP64
2557	size > OFF_MAX \|\|
2558	#endif
2559	foff < 0 \|\| foff > OFF_MAX - size)
2560	return (EINVAL);
2561
2562	writecounted = FALSE;
2563	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
2564	&foff, &object, &writecounted);
2565	if (error != 0)
2566	return (error);
2567	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
2568	foff, writecounted, td);
2569	if (error != 0) {
2570	/*
2571	* If this mapping was accounted for in the vnode's
2572	* writecount, then undo that now.
2573	*/
2574	if (writecounted)
2575	vm_pager_release_writecount(object, 0, size);
2576	vm_object_deallocate(object);
2577	}
2578	#ifdef HWPMC_HOOKS
2579	/* Inform hwpmc(4) if an executable is being mapped. */
2580	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
2581	if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
2582	pkm.pm_file = vp;
2583	pkm.pm_address = (uintptr_t) *addr;
2584	PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
2585	}
2586	}
2587	#endif
2588	return (error);
2589	}
2590	#endif /* __rtems__ */
2591
2592	void
2593	vn_fsid(struct vnode vp, struct vattr va)
2594	{
2595	fsid_t *f;
2596
2597	f = &vp->v_mount->mnt_stat.f_fsid;
2598	va->va_fsid = (uint32_t)f->val[1];
2599	va->va_fsid <<= sizeof(f->val[1]) * NBBY;
2600	va->va_fsid += (uint32_t)f->val[0];
2601	}
2602
2603	int
2604	vn_fsync_buf(struct vnode *vp, int waitfor)
2605	{
2606	struct buf bp, nbp;
2607	struct bufobj *bo;
2608	struct mount *mp;
2609	int error, maxretry;
2610
2611	error = 0;
2612	maxretry = 10000; /* large, arbitrarily chosen */
2613	mp = NULL;
2614	if (vp->v_type == VCHR) {
2615	VI_LOCK(vp);
2616	mp = vp->v_rdev->si_mountpt;
2617	VI_UNLOCK(vp);
2618	}
2619	bo = &vp->v_bufobj;
2620	BO_LOCK(bo);
2621	loop1:
2622	/*
2623	* MARK/SCAN initialization to avoid infinite loops.
2624	*/
2625	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
2626	bp->b_vflags &= ~BV_SCANNED;
2627	bp->b_error = 0;
2628	}
2629
2630	/*
2631	* Flush all dirty buffers associated with a vnode.
2632	*/
2633	loop2:
2634	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2635	if ((bp->b_vflags & BV_SCANNED) != 0)
2636	continue;
2637	bp->b_vflags \|= BV_SCANNED;
2638	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT, NULL)) {
2639	if (waitfor != MNT_WAIT)
2640	continue;
2641	if (BUF_LOCK(bp,
2642	LK_EXCLUSIVE \| LK_INTERLOCK \| LK_SLEEPFAIL,
2643	BO_LOCKPTR(bo)) != 0) {
2644	BO_LOCK(bo);
2645	goto loop1;
2646	}
2647	BO_LOCK(bo);
2648	}
2649	BO_UNLOCK(bo);
2650	KASSERT(bp->b_bufobj == bo,
2651	("bp %p wrong b_bufobj %p should be %p",
2652	bp, bp->b_bufobj, bo));
2653	if ((bp->b_flags & B_DELWRI) == 0)
2654	panic("fsync: not dirty");
2655	if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
2656	vfs_bio_awrite(bp);
2657	} else {
2658	bremfree(bp);
2659	bawrite(bp);
2660	}
2661	if (maxretry < 1000)
2662	pause("dirty", hz < 1000 ? 1 : hz / 1000);
2663	BO_LOCK(bo);
2664	goto loop2;
2665	}
2666
2667	/*
2668	* If synchronous the caller expects us to completely resolve all
2669	* dirty buffers in the system. Wait for in-progress I/O to
2670	* complete (which could include background bitmap writes), then
2671	* retry if dirty blocks still exist.
2672	*/
2673	if (waitfor == MNT_WAIT) {
2674	bufobj_wwait(bo, 0, 0);
2675	if (bo->bo_dirty.bv_cnt > 0) {
2676	/*
2677	* If we are unable to write any of these buffers
2678	* then we fail now rather than trying endlessly
2679	* to write them out.
2680	*/
2681	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
2682	if ((error = bp->b_error) != 0)
2683	break;
2684	if ((mp != NULL && mp->mnt_secondary_writes > 0) \|\|
2685	(error == 0 && --maxretry >= 0))
2686	goto loop1;
2687	if (error == 0)
2688	error = EAGAIN;
2689	}
2690	}
2691	BO_UNLOCK(bo);
2692	if (error != 0)
2693	vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
2694
2695	return (error);
2696	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: rtems-libbsd/freebsd/sys/kern/vfs_vnops.c @ 6514d56

Download in other formats: