source: rtems-libbsd/freebsd/sys/kern/sys_pipe.c

Branch: 6-freebsd-12
Last change on this file was 6514d56, checked in by Chris Johns <chrisj@…>, on 08/02/21 at 05:09:41

sys/kern: Add VFS support

  • Refactor the libio interface
  • Move syscalls into an rtemsbsd location
  • Provide a root directory mount point

Update #4475

#include <machine/rtems-bsd-kernel-space.h>

/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1996 John S. Dyson
 * Copyright (c) 2012 Giovanni Trematerra
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the sending process pins the underlying pages in
 * memory, and the receiving process copies directly from these pinned pages
 * in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map. This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes can not persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */
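/*
 * Editor's illustrative sketch, not part of the original file: the two
 * sysctls described above can be inspected from FreeBSD userland with
 * sysctlbyname(3).  Guarded out so it never builds into the kernel.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        long kva, maxkva;
        size_t len = sizeof(long);

        /* Current pipe KVA usage versus the (normally autotuned) limit. */
        if (sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) == 0 &&
            sysctlbyname("kern.ipc.maxpipekva", &maxkva, &len, NULL, 0) == 0)
                printf("pipe KVA: %ld of %ld bytes\n", kva, maxkva);
        return (0);
}
#endif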

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/event.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

#define PIPE_PEER(pipe) \
        (((pipe)->pipe_state & PIPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))

/*
 * interfaces to the outside world
 */
static fo_rdwr_t        pipe_read;
static fo_rdwr_t        pipe_write;
static fo_truncate_t    pipe_truncate;
static fo_ioctl_t       pipe_ioctl;
static fo_poll_t        pipe_poll;
static fo_kqfilter_t    pipe_kqfilter;
static fo_stat_t        pipe_stat;
static fo_close_t       pipe_close;
static fo_chmod_t       pipe_chmod;
static fo_chown_t       pipe_chown;
static fo_fill_kinfo_t  pipe_fill_kinfo;

struct fileops pipeops = {
        .fo_read = pipe_read,
        .fo_write = pipe_write,
        .fo_truncate = pipe_truncate,
        .fo_ioctl = pipe_ioctl,
        .fo_poll = pipe_poll,
        .fo_kqfilter = pipe_kqfilter,
        .fo_stat = pipe_stat,
        .fo_close = pipe_close,
        .fo_chmod = pipe_chmod,
        .fo_chown = pipe_chown,
        .fo_sendfile = invfo_sendfile,
        .fo_fill_kinfo = pipe_fill_kinfo,
        .fo_flags = DFLAG_PASSABLE
};

#ifdef __rtems__
long    maxpipekva;                     /* Limit on pipe KVA */
#endif /* __rtems__ */

static void     filt_pipedetach(struct knote *kn);
static void     filt_pipedetach_notsup(struct knote *kn);
static int      filt_pipenotsup(struct knote *kn, long hint);
static int      filt_piperead(struct knote *kn, long hint);
static int      filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_nfiltops = {
        .f_isfd = 1,
        .f_detach = filt_pipedetach_notsup,
        .f_event = filt_pipenotsup
};
static struct filterops pipe_rfiltops = {
        .f_isfd = 1,
        .f_detach = filt_pipedetach,
        .f_event = filt_piperead
};
static struct filterops pipe_wfiltops = {
        .f_isfd = 1,
        .f_detach = filt_pipedetach,
        .f_event = filt_pipewrite
};

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static long amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;

SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
           &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
           &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
          &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
          &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
          &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
          &piperesizeallowed, 0, "Pipe resizing allowed");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static void pipe_create(struct pipe *pipe, int backing);
static void pipe_paircreate(struct thread *td, struct pipepair **p_pp);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int      pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int      pipe_zone_init(void *mem, int size, int flags);
static void     pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;
static struct unrhdr64 pipeino_unr;
static dev_t pipedev_ino;

#ifndef __rtems__
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
#else /* __rtems__ */
SYSINIT(vfspip, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
#endif /* __rtems__ */

static void
pipeinit(void *dummy __unused)
{

        pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
            pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
            UMA_ALIGN_PTR, 0);
        KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
        new_unrhdr64(&pipeino_unr, 1);
        pipedev_ino = devfs_alloc_cdp_inode();
        KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
}

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
        struct pipepair *pp;
        struct pipe *rpipe, *wpipe;

        KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

        pp = (struct pipepair *)mem;

        /*
         * We zero both pipe endpoints to make sure all the kmem pointers
         * are NULL, flag fields are zero'd, etc.  We timestamp both
         * endpoints with the same time.
         */
        rpipe = &pp->pp_rpipe;
        bzero(rpipe, sizeof(*rpipe));
        vfs_timestamp(&rpipe->pipe_ctime);
        rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

        wpipe = &pp->pp_wpipe;
        bzero(wpipe, sizeof(*wpipe));
        wpipe->pipe_ctime = rpipe->pipe_ctime;
        wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

        rpipe->pipe_peer = wpipe;
        rpipe->pipe_pair = pp;
        wpipe->pipe_peer = rpipe;
        wpipe->pipe_pair = pp;

        /*
         * Mark both endpoints as present; they will later get free'd
         * one at a time.  When both are free'd, then the whole pair
         * is released.
         */
        rpipe->pipe_present = PIPE_ACTIVE;
        wpipe->pipe_present = PIPE_ACTIVE;

        /*
         * Eventually, the MAC Framework may initialize the label
         * in ctor or init, but for now we do it elsewhere to avoid
         * blocking in ctor or init.
         */
        pp->pp_label = NULL;

        return (0);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
        struct pipepair *pp;

        KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

        pp = (struct pipepair *)mem;

        mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW);
        return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
        struct pipepair *pp;

        KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

        pp = (struct pipepair *)mem;

        mtx_destroy(&pp->pp_mtx);
}

static void
pipe_paircreate(struct thread *td, struct pipepair **p_pp)
{
        struct pipepair *pp;
        struct pipe *rpipe, *wpipe;

        *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
        /*
         * The MAC label is shared between the connected endpoints.  As a
         * result mac_pipe_init() and mac_pipe_create() are called once
         * for the pair, and not on the endpoints.
         */
        mac_pipe_init(pp);
        mac_pipe_create(td->td_ucred, pp);
#endif
        rpipe = &pp->pp_rpipe;
        wpipe = &pp->pp_wpipe;

        knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
        knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));

        /* Only the forward direction pipe is backed by default */
        pipe_create(rpipe, 1);
        pipe_create(wpipe, 0);

        rpipe->pipe_state |= PIPE_DIRECTOK;
        wpipe->pipe_state |= PIPE_DIRECTOK;
}

void
pipe_named_ctor(struct pipe **ppipe, struct thread *td)
{
        struct pipepair *pp;

        pipe_paircreate(td, &pp);
        pp->pp_rpipe.pipe_state |= PIPE_NAMED;
        *ppipe = &pp->pp_rpipe;
}

void
pipe_dtor(struct pipe *dpipe)
{
        struct pipe *peer;
        ino_t ino;

        ino = dpipe->pipe_ino;
        peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL;
        funsetown(&dpipe->pipe_sigio);
        pipeclose(dpipe);
        if (peer != NULL) {
                funsetown(&peer->pipe_sigio);
                pipeclose(peer);
        }
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
 * the zone pick up the pieces via pipeclose().
 */
int
kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1,
    struct filecaps *fcaps2)
{
        struct file *rf, *wf;
        struct pipe *rpipe, *wpipe;
        struct pipepair *pp;
        int fd, fflags, error;

        pipe_paircreate(td, &pp);
        rpipe = &pp->pp_rpipe;
        wpipe = &pp->pp_wpipe;
        error = falloc_caps(td, &rf, &fd, flags, fcaps1);
        if (error) {
                pipeclose(rpipe);
                pipeclose(wpipe);
                return (error);
        }
        /* An extra reference on `rf' has been held for us by falloc_caps(). */
        fildes[0] = fd;

        fflags = FREAD | FWRITE;
        if ((flags & O_NONBLOCK) != 0)
                fflags |= FNONBLOCK;

        /*
         * Warning: once we've gotten past allocation of the fd for the
         * read-side, we can only drop the read side via fdrop() in order
         * to avoid races against processes which manage to dup() the read
         * side while we are blocked trying to allocate the write side.
         */
        finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
        error = falloc_caps(td, &wf, &fd, flags, fcaps2);
        if (error) {
                fdclose(td, rf, fildes[0]);
                fdrop(rf, td);
                /* rpipe has been closed by fdrop(). */
                pipeclose(wpipe);
                return (error);
        }
        /* An extra reference on `wf' has been held for us by falloc_caps(). */
        finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
        fdrop(wf, td);
        fildes[1] = fd;
        fdrop(rf, td);

        return (0);
}

#ifdef COMPAT_FREEBSD10
/* ARGSUSED */
int
freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused)
{
        int error;
        int fildes[2];

        error = kern_pipe(td, fildes, 0, NULL, NULL);
        if (error)
                return (error);

        td->td_retval[0] = fildes[0];
        td->td_retval[1] = fildes[1];

        return (0);
}
#endif

int
sys_pipe2(struct thread *td, struct pipe2_args *uap)
{
        int error, fildes[2];

        if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
                return (EINVAL);
        error = kern_pipe(td, fildes, uap->flags, NULL, NULL);
        if (error)
                return (error);
        error = copyout(fildes, uap->fildes, 2 * sizeof(int));
        if (error) {
                (void)kern_close(td, fildes[0]);
                (void)kern_close(td, fildes[1]);
        }
        return (error);
}
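/*
 * Editor's illustrative sketch, not part of the original file: the
 * userland view of the sys_pipe2() entry point above.  Only O_CLOEXEC
 * and O_NONBLOCK are accepted; any other flag fails with EINVAL.
 * Guarded out so it never builds into the kernel.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
        int fds[2];

        /* Both ends close-on-exec and non-blocking from the start. */
        if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
                err(1, "pipe2");
        (void)write(fds[1], "hi", 2);
        (void)close(fds[0]);
        (void)close(fds[1]);
        return (0);
}
#endif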

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it retains the old buffer and returns ENOMEM.
 */
static int
pipespace_new(struct pipe *cpipe, int size)
{
        caddr_t buffer;
        int error, cnt, firstseg;
        static int curfail = 0;
        static struct timeval lastfail;

        KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
        KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
                ("pipespace: resize of direct writes not allowed"));
retry:
        cnt = cpipe->pipe_buffer.cnt;
        if (cnt > size)
                size = cnt;

        size = round_page(size);
#ifndef __rtems__
        buffer = (caddr_t) vm_map_min(pipe_map);

        error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *)&buffer, size, 0,
            VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
        if (error != KERN_SUCCESS) {
#else /* __rtems__ */
        (void)error;
        buffer = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
        if (buffer == NULL) {
#endif /* __rtems__ */
                if ((cpipe->pipe_buffer.buffer == NULL) &&
                        (size > SMALL_PIPE_SIZE)) {
                        size = SMALL_PIPE_SIZE;
                        pipefragretry++;
                        goto retry;
                }
                if (cpipe->pipe_buffer.buffer == NULL) {
                        pipeallocfail++;
                        if (ppsratecheck(&lastfail, &curfail, 1))
                                printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
                } else {
                        piperesizefail++;
                }
                return (ENOMEM);
        }

        /* copy data, then free old resources if we're resizing */
        if (cnt > 0) {
                if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
                        firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
                        bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
                                buffer, firstseg);
                        if ((cnt - firstseg) > 0)
                                bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
                                        cpipe->pipe_buffer.in);
                } else {
                        bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
                                buffer, cnt);
                }
        }
        pipe_free_kmem(cpipe);
        cpipe->pipe_buffer.buffer = buffer;
        cpipe->pipe_buffer.size = size;
        cpipe->pipe_buffer.in = cnt;
        cpipe->pipe_buffer.out = 0;
        cpipe->pipe_buffer.cnt = cnt;
        atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
        return (0);
}
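/*
 * Editor's note, a worked example of the wraparound copy above: with
 * size = 16, out = 12, in = 4 and cnt = 8, the live data occupies slots
 * 12..15 followed by slots 0..3.  Since in <= out, firstseg = 16 - 12 = 4,
 * so the first bcopy() moves slots 12..15 to the front of the new buffer
 * and the second moves slots 0..3 right behind them, after which the
 * buffer is linear again: in = cnt = 8 and out = 0.
 */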

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(struct pipe *cpipe, int size)
{

        KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
                ("Unlocked pipe passed to pipespace"));
        return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(struct pipe *cpipe, int catch)
{
        int error;

        PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
        while (cpipe->pipe_state & PIPE_LOCKFL) {
                cpipe->pipe_state |= PIPE_LWANT;
                error = msleep(cpipe, PIPE_MTX(cpipe),
                    catch ? (PRIBIO | PCATCH) : PRIBIO,
                    "pipelk", 0);
                if (error != 0)
                        return (error);
        }
        cpipe->pipe_state |= PIPE_LOCKFL;
        return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(struct pipe *cpipe)
{

        PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
        KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
                ("Unlocked pipe passed to pipeunlock"));
        cpipe->pipe_state &= ~PIPE_LOCKFL;
        if (cpipe->pipe_state & PIPE_LWANT) {
                cpipe->pipe_state &= ~PIPE_LWANT;
                wakeup(cpipe);
        }
}

void
pipeselwakeup(struct pipe *cpipe)
{

        PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
        if (cpipe->pipe_state & PIPE_SEL) {
                selwakeuppri(&cpipe->pipe_sel, PSOCK);
                if (!SEL_WAITING(&cpipe->pipe_sel))
                        cpipe->pipe_state &= ~PIPE_SEL;
        }
#ifndef __rtems__
        if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
                pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
#endif /* __rtems__ */
        KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static void
pipe_create(struct pipe *pipe, int backing)
{

        if (backing) {
                /*
                 * Note that these functions can fail if pipe map is exhausted
                 * (as a result of too many pipes created), but we ignore the
                 * error as it is not fatal and could be provoked by
                 * unprivileged users. The only consequence is worse performance
                 * with given pipe.
                 */
                if (amountpipekva > maxpipekva / 2)
                        (void)pipespace_new(pipe, SMALL_PIPE_SIZE);
                else
                        (void)pipespace_new(pipe, PIPE_SIZE);
        }

        pipe->pipe_ino = alloc_unr64(&pipeino_unr);
}

/* ARGSUSED */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
        struct pipe *rpipe;
        int error;
        int nread = 0;
        int size;

        rpipe = fp->f_data;
        PIPE_LOCK(rpipe);
        ++rpipe->pipe_busy;
        error = pipelock(rpipe, 1);
        if (error)
                goto unlocked_error;

#ifdef MAC
        error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
        if (error)
                goto locked_error;
#endif
        if (amountpipekva > (3 * maxpipekva) / 4) {
                if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
                        (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
                        (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
                        (piperesizeallowed == 1)) {
                        PIPE_UNLOCK(rpipe);
                        pipespace(rpipe, SMALL_PIPE_SIZE);
                        PIPE_LOCK(rpipe);
                }
        }

        while (uio->uio_resid) {
                /*
                 * normal pipe buffer receive
                 */
                if (rpipe->pipe_buffer.cnt > 0) {
                        size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
                        if (size > rpipe->pipe_buffer.cnt)
                                size = rpipe->pipe_buffer.cnt;
                        if (size > uio->uio_resid)
                                size = uio->uio_resid;

                        PIPE_UNLOCK(rpipe);
                        error = uiomove(
                            &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
                            size, uio);
                        PIPE_LOCK(rpipe);
                        if (error)
                                break;

                        rpipe->pipe_buffer.out += size;
                        if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
                                rpipe->pipe_buffer.out = 0;

                        rpipe->pipe_buffer.cnt -= size;

                        /*
                         * If there is no more to read in the pipe, reset
                         * its pointers to the beginning.  This improves
                         * cache hit stats.
                         */
                        if (rpipe->pipe_buffer.cnt == 0) {
                                rpipe->pipe_buffer.in = 0;
                                rpipe->pipe_buffer.out = 0;
                        }
                        nread += size;
#ifndef PIPE_NODIRECT
                /*
                 * Direct copy, bypassing a kernel buffer.
                 */
                } else if ((size = rpipe->pipe_map.cnt) != 0) {
                        if (size > uio->uio_resid)
                                size = (u_int) uio->uio_resid;
                        PIPE_UNLOCK(rpipe);
                        error = uiomove_fromphys(rpipe->pipe_map.ms,
                            rpipe->pipe_map.pos, size, uio);
                        PIPE_LOCK(rpipe);
                        if (error)
                                break;
                        nread += size;
                        rpipe->pipe_map.pos += size;
                        rpipe->pipe_map.cnt -= size;
                        if (rpipe->pipe_map.cnt == 0) {
                                rpipe->pipe_state &= ~PIPE_WANTW;
                                wakeup(rpipe);
                        }
#endif
                } else {
                        /*
                         * detect EOF condition
                         * read returns 0 on EOF, no need to set error
                         */
                        if (rpipe->pipe_state & PIPE_EOF)
                                break;

                        /*
                         * If the "write-side" has been blocked, wake it up now.
                         */
                        if (rpipe->pipe_state & PIPE_WANTW) {
                                rpipe->pipe_state &= ~PIPE_WANTW;
                                wakeup(rpipe);
                        }

                        /*
                         * Break if some data was read.
                         */
                        if (nread > 0)
                                break;

                        /*
                         * Unlock the pipe buffer for our remaining processing.
                         * We will either break out with an error or we will
                         * sleep and relock to loop.
                         */
                        pipeunlock(rpipe);

                        /*
                         * Handle non-blocking mode operation or
                         * wait for more data.
                         */
                        if (fp->f_flag & FNONBLOCK) {
                                error = EAGAIN;
                        } else {
                                rpipe->pipe_state |= PIPE_WANTR;
                                if ((error = msleep(rpipe, PIPE_MTX(rpipe),
                                    PRIBIO | PCATCH,
                                    "piperd", 0)) == 0)
                                        error = pipelock(rpipe, 1);
                        }
                        if (error)
                                goto unlocked_error;
                }
        }
#ifdef MAC
locked_error:
#endif
        pipeunlock(rpipe);

        /* XXX: should probably do this before getting any locks. */
        if (error == 0)
                vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
        --rpipe->pipe_busy;

        /*
         * PIPE_WANT processing only makes sense if pipe_busy is 0.
         */
        if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
                rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
                wakeup(rpipe);
        } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
                /*
                 * Handle write blocking hysteresis.
                 */
                if (rpipe->pipe_state & PIPE_WANTW) {
                        rpipe->pipe_state &= ~PIPE_WANTW;
                        wakeup(rpipe);
                }
        }

        if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
                pipeselwakeup(rpipe);

        PIPE_UNLOCK(rpipe);
        return (error);
}
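/*
 * Editor's illustrative sketch, not part of the original file: the
 * FNONBLOCK path above is what a userland non-blocking reader observes
 * as EAGAIN.  Guarded out so it never builds into the kernel.
 */
#if 0
#include <errno.h>
#include <unistd.h>

static ssize_t
read_some(int fd, void *buf, size_t len)
{
        ssize_t n;

        do
                n = read(fd, buf, len);
        while (n == -1 && errno == EINTR);      /* interrupted, retry */
        /* n == -1 && errno == EAGAIN: pipe empty, writer still open. */
        /* n == 0: EOF, the write side has been closed. */
        return (n);
}
#endif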

#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
{
        u_int size;
        int i;

        PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
        KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
            ("%s: PIPE_DIRECTW set on %p", __func__, wpipe));
        KASSERT(wpipe->pipe_map.cnt == 0,
            ("%s: pipe map for %p contains residual data", __func__, wpipe));

        if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
                size = wpipe->pipe_buffer.size;
        else
                size = uio->uio_iov->iov_len;

        wpipe->pipe_state |= PIPE_DIRECTW;
        PIPE_UNLOCK(wpipe);
        i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
            (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
            wpipe->pipe_map.ms, PIPENPAGES);
        PIPE_LOCK(wpipe);
        if (i < 0) {
                wpipe->pipe_state &= ~PIPE_DIRECTW;
                return (EFAULT);
        }

        wpipe->pipe_map.npages = i;
        wpipe->pipe_map.pos =
            ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
        wpipe->pipe_map.cnt = size;

        uio->uio_iov->iov_len -= size;
        uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
        if (uio->uio_iov->iov_len == 0)
                uio->uio_iov++;
        uio->uio_resid -= size;
        uio->uio_offset += size;
        return (0);
}

/*
 * Unwire the process buffer.
 */
static void
pipe_destroy_write_buffer(struct pipe *wpipe)
{

        PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
        KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
            ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));
        KASSERT(wpipe->pipe_map.cnt == 0,
            ("%s: pipe map for %p contains residual data", __func__, wpipe));

        wpipe->pipe_state &= ~PIPE_DIRECTW;
        vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
        wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(struct pipe *wpipe)
{
        struct uio uio;
        struct iovec iov;
        int size;
        int pos;

        PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
        KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
            ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));

        size = wpipe->pipe_map.cnt;
        pos = wpipe->pipe_map.pos;
        wpipe->pipe_map.cnt = 0;

        wpipe->pipe_buffer.in = size;
        wpipe->pipe_buffer.out = 0;
        wpipe->pipe_buffer.cnt = size;

        PIPE_UNLOCK(wpipe);
        iov.iov_base = wpipe->pipe_buffer.buffer;
        iov.iov_len = size;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_offset = 0;
        uio.uio_resid = size;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_rw = UIO_READ;
        uio.uio_td = curthread;
        uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
        PIPE_LOCK(wpipe);
        pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(struct pipe *wpipe, struct uio *uio)
{
        int error;

retry:
        PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
        error = pipelock(wpipe, 1);
        if (error != 0)
                goto error1;
        if ((wpipe->pipe_state & PIPE_EOF) != 0) {
                error = EPIPE;
                pipeunlock(wpipe);
                goto error1;
        }
        if (wpipe->pipe_state & PIPE_DIRECTW) {
                if (wpipe->pipe_state & PIPE_WANTR) {
                        wpipe->pipe_state &= ~PIPE_WANTR;
                        wakeup(wpipe);
                }
                pipeselwakeup(wpipe);
                wpipe->pipe_state |= PIPE_WANTW;
                pipeunlock(wpipe);
                error = msleep(wpipe, PIPE_MTX(wpipe),
                    PRIBIO | PCATCH, "pipdww", 0);
                if (error)
                        goto error1;
                else
                        goto retry;
        }
        if (wpipe->pipe_buffer.cnt > 0) {
                if (wpipe->pipe_state & PIPE_WANTR) {
                        wpipe->pipe_state &= ~PIPE_WANTR;
                        wakeup(wpipe);
                }
                pipeselwakeup(wpipe);
                wpipe->pipe_state |= PIPE_WANTW;
                pipeunlock(wpipe);
                error = msleep(wpipe, PIPE_MTX(wpipe),
                    PRIBIO | PCATCH, "pipdwc", 0);
                if (error)
                        goto error1;
                else
                        goto retry;
        }

        error = pipe_build_write_buffer(wpipe, uio);
        if (error) {
                pipeunlock(wpipe);
                goto error1;
        }

        while (wpipe->pipe_map.cnt != 0 &&
            (wpipe->pipe_state & PIPE_EOF) == 0) {
                if (wpipe->pipe_state & PIPE_WANTR) {
                        wpipe->pipe_state &= ~PIPE_WANTR;
                        wakeup(wpipe);
                }
                pipeselwakeup(wpipe);
                wpipe->pipe_state |= PIPE_WANTW;
                pipeunlock(wpipe);
                error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
                    "pipdwt", 0);
                pipelock(wpipe, 0);
                if (error != 0)
                        break;
        }

        if ((wpipe->pipe_state & PIPE_EOF) != 0) {
                wpipe->pipe_map.cnt = 0;
                pipe_destroy_write_buffer(wpipe);
                pipeselwakeup(wpipe);
                error = EPIPE;
        } else if (error == EINTR || error == ERESTART) {
                pipe_clone_write_buffer(wpipe);
        } else {
                pipe_destroy_write_buffer(wpipe);
        }
        pipeunlock(wpipe);
        KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
            ("pipe %p leaked PIPE_DIRECTW", wpipe));
        return (error);

error1:
        wakeup(wpipe);
        return (error);
}
#endif

static int
pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
        int error = 0;
        int desiredsize;
        ssize_t orig_resid;
        struct pipe *wpipe, *rpipe;

        rpipe = fp->f_data;
        wpipe = PIPE_PEER(rpipe);
        PIPE_LOCK(rpipe);
        error = pipelock(wpipe, 1);
        if (error) {
                PIPE_UNLOCK(rpipe);
                return (error);
        }
        /*
         * detect loss of pipe read side, issue SIGPIPE if lost.
         */
        if (wpipe->pipe_present != PIPE_ACTIVE ||
            (wpipe->pipe_state & PIPE_EOF)) {
                pipeunlock(wpipe);
                PIPE_UNLOCK(rpipe);
                return (EPIPE);
        }
#ifdef MAC
        error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
        if (error) {
                pipeunlock(wpipe);
                PIPE_UNLOCK(rpipe);
                return (error);
        }
#endif
        ++wpipe->pipe_busy;

        /* Choose a larger size if it's advantageous */
        desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
        while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
                if (piperesizeallowed != 1)
                        break;
                if (amountpipekva > maxpipekva / 2)
                        break;
                if (desiredsize == BIG_PIPE_SIZE)
                        break;
                desiredsize = desiredsize * 2;
        }

        /* Choose a smaller size if we're in an OOM situation */
        if ((amountpipekva > (3 * maxpipekva) / 4) &&
                (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
                (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
                (piperesizeallowed == 1))
                desiredsize = SMALL_PIPE_SIZE;

        /* Resize if the above determined that a new size was necessary */
        if ((desiredsize != wpipe->pipe_buffer.size) &&
                ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
                PIPE_UNLOCK(wpipe);
                pipespace(wpipe, desiredsize);
                PIPE_LOCK(wpipe);
        }
        if (wpipe->pipe_buffer.size == 0) {
                /*
                 * This can only happen for reverse direction use of pipes
                 * in a complete OOM situation.
                 */
                error = ENOMEM;
                --wpipe->pipe_busy;
                pipeunlock(wpipe);
                PIPE_UNLOCK(wpipe);
                return (error);
        }

        pipeunlock(wpipe);

        orig_resid = uio->uio_resid;

        while (uio->uio_resid) {
                int space;

                pipelock(wpipe, 0);
                if (wpipe->pipe_state & PIPE_EOF) {
                        pipeunlock(wpipe);
                        error = EPIPE;
                        break;
                }
#ifndef PIPE_NODIRECT
                /*
                 * If the transfer is large, we can gain performance if
                 * we do process-to-process copies directly.
                 * If the write is non-blocking, we don't use the
                 * direct write mechanism.
                 *
                 * The direct write mechanism will detect the reader going
                 * away on us.
                 */
                if (uio->uio_segflg == UIO_USERSPACE &&
                    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
                    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
                    (fp->f_flag & FNONBLOCK) == 0) {
                        pipeunlock(wpipe);
                        error = pipe_direct_write(wpipe, uio);
                        if (error)
                                break;
                        continue;
                }
#endif

                /*
                 * Pipe buffered writes cannot be coincidental with
                 * direct writes.  We wait until the currently executing
                 * direct write is completed before we start filling the
                 * pipe buffer.  We break out if a signal occurs or the
                 * reader goes away.
                 */
                if (wpipe->pipe_map.cnt != 0) {
                        if (wpipe->pipe_state & PIPE_WANTR) {
                                wpipe->pipe_state &= ~PIPE_WANTR;
                                wakeup(wpipe);
                        }
                        pipeselwakeup(wpipe);
                        wpipe->pipe_state |= PIPE_WANTW;
                        pipeunlock(wpipe);
                        error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
                            "pipbww", 0);
                        if (error)
                                break;
                        else
                                continue;
                }

                space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

                /* Writes of size <= PIPE_BUF must be atomic. */
                if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
                        space = 0;

                if (space > 0) {
                        int size;       /* Transfer size */
                        int segsize;    /* first segment to transfer */

                        /*
                         * Transfer size is minimum of uio transfer
                         * and free space in pipe buffer.
                         */
                        if (space > uio->uio_resid)
                                size = uio->uio_resid;
                        else
                                size = space;
                        /*
                         * First segment to transfer is minimum of
                         * transfer size and contiguous space in
                         * pipe buffer.  If first segment to transfer
                         * is less than the transfer size, we've got
                         * a wraparound in the buffer.
                         */
                        segsize = wpipe->pipe_buffer.size -
                                wpipe->pipe_buffer.in;
                        if (segsize > size)
                                segsize = size;

                        /* Transfer first segment */

                        PIPE_UNLOCK(rpipe);
                        error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
                                        segsize, uio);
                        PIPE_LOCK(rpipe);

                        if (error == 0 && segsize < size) {
                                KASSERT(wpipe->pipe_buffer.in + segsize ==
                                        wpipe->pipe_buffer.size,
                                        ("Pipe buffer wraparound disappeared"));
                                /*
                                 * Transfer remaining part now, to
                                 * support atomic writes.  Wraparound
                                 * happened.
                                 */

                                PIPE_UNLOCK(rpipe);
                                error = uiomove(
                                    &wpipe->pipe_buffer.buffer[0],
                                    size - segsize, uio);
                                PIPE_LOCK(rpipe);
                        }
                        if (error == 0) {
                                wpipe->pipe_buffer.in += size;
                                if (wpipe->pipe_buffer.in >=
                                    wpipe->pipe_buffer.size) {
                                        KASSERT(wpipe->pipe_buffer.in ==
                                                size - segsize +
                                                wpipe->pipe_buffer.size,
                                                ("Expected wraparound bad"));
                                        wpipe->pipe_buffer.in = size - segsize;
                                }

                                wpipe->pipe_buffer.cnt += size;
                                KASSERT(wpipe->pipe_buffer.cnt <=
                                        wpipe->pipe_buffer.size,
                                        ("Pipe buffer overflow"));
                        }
                        pipeunlock(wpipe);
                        if (error != 0)
                                break;
                } else {
                        /*
                         * If the "read-side" has been blocked, wake it up now.
                         */
                        if (wpipe->pipe_state & PIPE_WANTR) {
                                wpipe->pipe_state &= ~PIPE_WANTR;
                                wakeup(wpipe);
                        }

                        /*
                         * don't block on non-blocking I/O
                         */
                        if (fp->f_flag & FNONBLOCK) {
                                error = EAGAIN;
                                pipeunlock(wpipe);
                                break;
                        }

                        /*
                         * We have no more space and have something to offer,
                         * wake up select/poll.
                         */
                        pipeselwakeup(wpipe);

                        wpipe->pipe_state |= PIPE_WANTW;
                        pipeunlock(wpipe);
                        error = msleep(wpipe, PIPE_MTX(rpipe),
                            PRIBIO | PCATCH, "pipewr", 0);
                        if (error != 0)
                                break;
                }
        }

        pipelock(wpipe, 0);
        --wpipe->pipe_busy;

        if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
                wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
                wakeup(wpipe);
        } else if (wpipe->pipe_buffer.cnt > 0) {
                /*
                 * If we have put any characters in the buffer, we wake up
                 * the reader.
                 */
                if (wpipe->pipe_state & PIPE_WANTR) {
                        wpipe->pipe_state &= ~PIPE_WANTR;
                        wakeup(wpipe);
                }
        }

        /*
         * Don't return EPIPE if any byte was written.
         * EINTR and other interrupts are handled by generic I/O layer.
         * Do not pretend that I/O succeeded for obvious user error
         * like EFAULT.
         */
        if (uio->uio_resid != orig_resid && error == EPIPE)
                error = 0;

        if (error == 0)
                vfs_timestamp(&wpipe->pipe_mtime);

        /*
         * We have something to offer,
         * wake up select/poll.
         */
        if (wpipe->pipe_buffer.cnt)
                pipeselwakeup(wpipe);

        pipeunlock(wpipe);
        PIPE_UNLOCK(rpipe);
        return (error);
}

/* ARGSUSED */
static int
pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
        struct pipe *cpipe;
        int error;

        cpipe = fp->f_data;
        if (cpipe->pipe_state & PIPE_NAMED)
                error = vnops.fo_truncate(fp, length, active_cred, td);
        else
                error = invfo_truncate(fp, length, active_cred, td);
        return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
    struct thread *td)
{
        struct pipe *mpipe = fp->f_data;
        int error;

        PIPE_LOCK(mpipe);

#ifdef MAC
        error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
        if (error) {
                PIPE_UNLOCK(mpipe);
                return (error);
        }
#endif

        error = 0;
        switch (cmd) {

        case FIONBIO:
                break;

        case FIOASYNC:
                if (*(int *)data) {
                        mpipe->pipe_state |= PIPE_ASYNC;
                } else {
                        mpipe->pipe_state &= ~PIPE_ASYNC;
                }
                break;

        case FIONREAD:
                if (!(fp->f_flag & FREAD)) {
                        *(int *)data = 0;
                        PIPE_UNLOCK(mpipe);
                        return (0);
                }
                if (mpipe->pipe_map.cnt != 0)
                        *(int *)data = mpipe->pipe_map.cnt;
                else
                        *(int *)data = mpipe->pipe_buffer.cnt;
                break;

        case FIOSETOWN:
                PIPE_UNLOCK(mpipe);
                error = fsetown(*(int *)data, &mpipe->pipe_sigio);
                goto out_unlocked;

        case FIOGETOWN:
                *(int *)data = fgetown(&mpipe->pipe_sigio);
                break;

        /* This is deprecated, FIOSETOWN should be used instead. */
        case TIOCSPGRP:
                PIPE_UNLOCK(mpipe);
                error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
                goto out_unlocked;

        /* This is deprecated, FIOGETOWN should be used instead. */
        case TIOCGPGRP:
                *(int *)data = -fgetown(&mpipe->pipe_sigio);
                break;

        default:
                error = ENOTTY;
                break;
        }
        PIPE_UNLOCK(mpipe);
out_unlocked:
        return (error);
}
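/*
 * Editor's illustrative sketch, not part of the original file: FIONREAD,
 * handled above, reports how many bytes a read would currently return,
 * whether buffered or pending in a direct write.  Guarded out so it
 * never builds into the kernel.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>

static int
pipe_pending(int fd)
{
        int n = 0;

        if (ioctl(fd, FIONREAD, &n) == -1)
                return (-1);
        return (n);
}
#endif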

static int
pipe_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
        struct pipe *rpipe;
        struct pipe *wpipe;
        int levents, revents;
#ifdef MAC
        int error;
#endif

        revents = 0;
        rpipe = fp->f_data;
        wpipe = PIPE_PEER(rpipe);
        PIPE_LOCK(rpipe);
#ifdef MAC
        error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
        if (error)
                goto locked_error;
#endif
        if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
                if (rpipe->pipe_map.cnt > 0 || rpipe->pipe_buffer.cnt > 0)
                        revents |= events & (POLLIN | POLLRDNORM);

        if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
                if (wpipe->pipe_present != PIPE_ACTIVE ||
                    (wpipe->pipe_state & PIPE_EOF) ||
                    ((wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
                     ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
                         wpipe->pipe_buffer.size == 0)))
                        revents |= events & (POLLOUT | POLLWRNORM);

        levents = events &
            (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
        if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents &&
            fp->f_pipegen == rpipe->pipe_wgen)
                events |= POLLINIGNEOF;

        if ((events & POLLINIGNEOF) == 0) {
                if (rpipe->pipe_state & PIPE_EOF) {
                        revents |= (events & (POLLIN | POLLRDNORM));
                        if (wpipe->pipe_present != PIPE_ACTIVE ||
                            (wpipe->pipe_state & PIPE_EOF))
                                revents |= POLLHUP;
                }
        }

        if (revents == 0) {
                if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) {
                        selrecord(td, &rpipe->pipe_sel);
                        if (SEL_WAITING(&rpipe->pipe_sel))
                                rpipe->pipe_state |= PIPE_SEL;
                }

                if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) {
                        selrecord(td, &wpipe->pipe_sel);
                        if (SEL_WAITING(&wpipe->pipe_sel))
                                wpipe->pipe_state |= PIPE_SEL;
                }
        }
#ifdef MAC
locked_error:
#endif
        PIPE_UNLOCK(rpipe);

        return (revents);
}
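/*
 * Editor's illustrative sketch, not part of the original file: what the
 * pipe_poll() logic above looks like to a userland poll(2) caller.
 * Guarded out so it never builds into the kernel.
 */
#if 0
#include <poll.h>

static int
wait_readable(int fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN };
        int n;

        n = poll(&pfd, 1, timeout_ms);
        if (n > 0 && (pfd.revents & (POLLIN | POLLHUP)) == POLLHUP)
                return (0);     /* writer gone and nothing left: EOF */
        return (n);             /* > 0 readable, 0 timeout, -1 error */
}
#endif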
1467
1468/*
1469 * We shouldn't need locks here as we're doing a read and this should
1470 * be a natural race.
1471 */
static int
pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
    struct thread *td)
{
        struct pipe *pipe;
#ifdef MAC
        int error;
#endif

        pipe = fp->f_data;
        PIPE_LOCK(pipe);
#ifdef MAC
        error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
        if (error) {
                PIPE_UNLOCK(pipe);
                return (error);
        }
#endif

        /* For named pipes ask the underlying filesystem. */
        if (pipe->pipe_state & PIPE_NAMED) {
                PIPE_UNLOCK(pipe);
                return (vnops.fo_stat(fp, ub, active_cred, td));
        }

        PIPE_UNLOCK(pipe);

        bzero(ub, sizeof(*ub));
        ub->st_mode = S_IFIFO;
        ub->st_blksize = PAGE_SIZE;
        if (pipe->pipe_map.cnt != 0)
                ub->st_size = pipe->pipe_map.cnt;
        else
                ub->st_size = pipe->pipe_buffer.cnt;
        ub->st_blocks = howmany(ub->st_size, ub->st_blksize);
        ub->st_atim = pipe->pipe_atime;
        ub->st_mtim = pipe->pipe_mtime;
        ub->st_ctim = pipe->pipe_ctime;
#ifndef __rtems__
        ub->st_uid = fp->f_cred->cr_uid;
        ub->st_gid = fp->f_cred->cr_gid;
        ub->st_dev = pipedev_ino;
        ub->st_ino = pipe->pipe_ino;
#else /* __rtems__ */
        ub->st_uid = BSD_DEFAULT_UID;
        ub->st_gid = BSD_DEFAULT_GID;
        ub->st_dev = rtems_filesystem_make_dev_t(0xcc494cd6U, 0x1d970b4dU);
        ub->st_ino = pipe->pipe_ino;
#endif /* __rtems__ */
        /*
         * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
         */
        return (0);
}

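/*
 * Editorial sketch, not part of the original file: the translation in
 * pipe_stat() means fstat(2) on an anonymous pipe reports a FIFO whose
 * st_size is the byte count currently buffered.  Hypothetical
 * descriptors, error handling omitted.
 *
 *      int fds[2];
 *      struct stat sb;
 *
 *      pipe(fds);
 *      write(fds[1], "hello", 5);
 *      fstat(fds[0], &sb);
 *      // S_ISFIFO(sb.st_mode) holds, sb.st_size == 5 (pipe_buffer.cnt)
 */
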
/* ARGSUSED */
static int
pipe_close(struct file *fp, struct thread *td)
{

        if (fp->f_vnode != NULL)
                return (vnops.fo_close(fp, td));
        fp->f_ops = &badfileops;
        pipe_dtor(fp->f_data);
        fp->f_data = NULL;
        return (0);
}

static int
pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
        struct pipe *cpipe;
        int error;

        cpipe = fp->f_data;
        if (cpipe->pipe_state & PIPE_NAMED)
                error = vn_chmod(fp, mode, active_cred, td);
        else
                error = invfo_chmod(fp, mode, active_cred, td);
        return (error);
}

static int
pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
        struct pipe *cpipe;
        int error;

        cpipe = fp->f_data;
        if (cpipe->pipe_state & PIPE_NAMED)
                error = vn_chown(fp, uid, gid, active_cred, td);
        else
                error = invfo_chown(fp, uid, gid, active_cred, td);
        return (error);
}

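/*
 * Editorial sketch, not part of the original file: pipe_close(),
 * pipe_chmod() and pipe_chown() above share one dispatch pattern.  A
 * named pipe (PIPE_NAMED) defers to the vnode layer, while an anonymous
 * pipe either handles the operation itself or rejects it (invfo_chmod()
 * and invfo_chown() return EINVAL).  The fifo path below is
 * hypothetical; error handling is omitted.
 *
 *      int fds[2], fd;
 *
 *      pipe(fds);
 *      fchmod(fds[0], 0600);   // EINVAL: no vnode behind the descriptor
 *
 *      mkfifo("/tmp/fifo", 0644);
 *      fd = open("/tmp/fifo", O_RDONLY | O_NONBLOCK);
 *      fchmod(fd, 0600);       // reaches vn_chmod() and the filesystem
 */
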
static int
pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
        struct pipe *pi;

        if (fp->f_type == DTYPE_FIFO)
                return (vn_fill_kinfo(fp, kif, fdp));
        kif->kf_type = KF_TYPE_PIPE;
        pi = fp->f_data;
        kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
        kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
        kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
        return (0);
}

static void
pipe_free_kmem(struct pipe *cpipe)
{

        KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
            ("pipe_free_kmem: pipe mutex locked"));

        if (cpipe->pipe_buffer.buffer != NULL) {
                atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
#ifndef __rtems__
                vm_map_remove(pipe_map,
                    (vm_offset_t)cpipe->pipe_buffer.buffer,
                    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
#else /* __rtems__ */
                free(cpipe->pipe_buffer.buffer, M_TEMP);
#endif /* __rtems__ */
                cpipe->pipe_buffer.buffer = NULL;
        }
#ifndef PIPE_NODIRECT
        cpipe->pipe_map.cnt = 0;
        cpipe->pipe_map.pos = 0;
        cpipe->pipe_map.npages = 0;
#endif
}

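/*
 * Editorial note, not part of the original file: pipe_free_kmem()
 * settles the kern.ipc.pipekva accounting.  The amountpipekva counter
 * decremented above is believed to be incremented at allocation time in
 * pipespace_new() (not shown in this excerpt); on RTEMS the buffer is
 * plain malloc()ed kernel memory rather than pageable KVA in pipe_map,
 * hence the free(9) in the __rtems__ branch.
 */
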
/*
 * Shut down the pipe.
 */
static void
pipeclose(struct pipe *cpipe)
{
        struct pipepair *pp;
        struct pipe *ppipe;

        KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));

        PIPE_LOCK(cpipe);
        pipelock(cpipe, 0);
        pp = cpipe->pipe_pair;

        pipeselwakeup(cpipe);

        /*
         * If the other side is blocked, wake it up saying that
         * we want to close it down.
         */
        cpipe->pipe_state |= PIPE_EOF;
        while (cpipe->pipe_busy) {
                wakeup(cpipe);
                cpipe->pipe_state |= PIPE_WANT;
                pipeunlock(cpipe);
                msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
                pipelock(cpipe, 0);
        }

        /*
         * Disconnect from peer, if any.
         */
        ppipe = cpipe->pipe_peer;
        if (ppipe->pipe_present == PIPE_ACTIVE) {
                pipeselwakeup(ppipe);

                ppipe->pipe_state |= PIPE_EOF;
                wakeup(ppipe);
                KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
        }

        /*
         * Mark this endpoint as free.  Release kmem resources.  We
         * don't mark this endpoint as unused until we've finished
         * doing that, or the pipe might disappear out from under
         * us.
         */
        PIPE_UNLOCK(cpipe);
        pipe_free_kmem(cpipe);
        PIPE_LOCK(cpipe);
        cpipe->pipe_present = PIPE_CLOSING;
        pipeunlock(cpipe);

        /*
         * knlist_clear() may sleep dropping the PIPE_MTX.  Set
         * PIPE_FINALIZED, which allows the other end to free the
         * pipe_pair, only after the knotes are completely dismantled.
         */
        knlist_clear(&cpipe->pipe_sel.si_note, 1);
        cpipe->pipe_present = PIPE_FINALIZED;
        seldrain(&cpipe->pipe_sel);
        knlist_destroy(&cpipe->pipe_sel.si_note);

        /*
         * If both endpoints are now closed, release the memory for the
         * pipe pair.  If not, unlock.
         */
        if (ppipe->pipe_present == PIPE_FINALIZED) {
                PIPE_UNLOCK(cpipe);
#ifdef MAC
                mac_pipe_destroy(pp);
#endif
                uma_zfree(pipe_zone, cpipe->pipe_pair);
        } else
                PIPE_UNLOCK(cpipe);
}

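/*
 * Editorial note, not part of the original file: the teardown in
 * pipeclose() above is effectively a two-phase handshake:
 *
 *      1. Set PIPE_EOF and drain the pipe_busy sleepers.
 *      2. Wake the peer so its blocked readers/writers see EOF.
 *      3. Free the buffer with the mutex dropped (pipe_free_kmem()).
 *      4. Go PIPE_CLOSING, dismantle the knotes, then PIPE_FINALIZED.
 *      5. Whichever endpoint reaches PIPE_FINALIZED second frees the
 *         shared pipepair.
 */
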
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
        struct pipe *cpipe;

        /*
         * If a filter is requested that is not supported by this file
         * descriptor, don't return an error, but also don't ever generate an
         * event.
         */
        if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
                kn->kn_fop = &pipe_nfiltops;
                return (0);
        }
        if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
                kn->kn_fop = &pipe_nfiltops;
                return (0);
        }
        cpipe = fp->f_data;
        PIPE_LOCK(cpipe);
        switch (kn->kn_filter) {
        case EVFILT_READ:
                kn->kn_fop = &pipe_rfiltops;
                break;
        case EVFILT_WRITE:
                kn->kn_fop = &pipe_wfiltops;
                if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
                        /* other end of pipe has been closed */
                        PIPE_UNLOCK(cpipe);
                        return (EPIPE);
                }
                cpipe = PIPE_PEER(cpipe);
                break;
        default:
                PIPE_UNLOCK(cpipe);
                return (EINVAL);
        }

        kn->kn_hook = cpipe;
        knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
        PIPE_UNLOCK(cpipe);
        return (0);
}

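/*
 * Editorial sketch, not part of the original file: registering with the
 * filters installed by pipe_kqfilter() above via kqueue(2).
 * Hypothetical descriptors, error handling omitted.
 *
 *      int kq, fds[2];
 *      struct kevent ev;
 *
 *      kq = kqueue();
 *      pipe(fds);
 *      EV_SET(&ev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *      kevent(kq, &ev, 1, NULL, 0, NULL);      // hooks up pipe_rfiltops
 *      write(fds[1], "x", 1);
 *      kevent(kq, NULL, 0, &ev, 1, NULL);      // filt_piperead: ev.data == 1
 */
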
static void
filt_pipedetach(struct knote *kn)
{
        struct pipe *cpipe = kn->kn_hook;

        PIPE_LOCK(cpipe);
        knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
        PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
        struct pipe *rpipe = kn->kn_hook;
        struct pipe *wpipe = rpipe->pipe_peer;
        int ret;

        PIPE_LOCK_ASSERT(rpipe, MA_OWNED);
        kn->kn_data = rpipe->pipe_buffer.cnt;
        if (kn->kn_data == 0)
                kn->kn_data = rpipe->pipe_map.cnt;

        if ((rpipe->pipe_state & PIPE_EOF) ||
            wpipe->pipe_present != PIPE_ACTIVE ||
            (wpipe->pipe_state & PIPE_EOF)) {
                kn->kn_flags |= EV_EOF;
                return (1);
        }
        ret = kn->kn_data > 0;
        return (ret);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
        struct pipe *wpipe;

        /*
         * If this end of the pipe is closed, the knote was removed from the
         * knlist and the list lock (i.e., the pipe lock) is therefore not held.
         */
        wpipe = kn->kn_hook;
        if (wpipe->pipe_present != PIPE_ACTIVE ||
            (wpipe->pipe_state & PIPE_EOF)) {
                kn->kn_data = 0;
                kn->kn_flags |= EV_EOF;
                return (1);
        }
        PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
        kn->kn_data = (wpipe->pipe_buffer.size > 0) ?
            (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF;
        if (wpipe->pipe_state & PIPE_DIRECTW)
                kn->kn_data = 0;

        return (kn->kn_data >= PIPE_BUF);
}

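/*
 * Editorial note, not part of the original file: filt_pipewrite() only
 * reports the descriptor writable once at least PIPE_BUF bytes fit,
 * matching the POSIX guarantee that writes of up to PIPE_BUF bytes are
 * atomic; while a direct (page-pinning) write is in flight, PIPE_DIRECTW
 * forces kn_data to 0 so EVFILT_WRITE stays quiet.
 */
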
static void
filt_pipedetach_notsup(struct knote *kn)
{

}

static int
filt_pipenotsup(struct knote *kn, long hint)
{

        return (0);
}