source: rtems-libbsd/freebsd/sys/vm/uma_core.c @ 2b2563d

Last change on this file since 2b2563d was 2b2563d, checked in by Sebastian Huber <sebastian.huber@…>, on 12/20/18 at 10:12:40

Update to FreeBSD head 2018-12-20

Git mirror commit 19a6ceb89dbacf74697d493e48c388767126d418.

It includes an update of wpa_supplicant to version 2.7.

It includes an update of the OpenSSL baseline to version 1.1.1a.

Update #3472.

1#include <machine/rtems-bsd-kernel-space.h>
2
3/*-
4 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
5 *
6 * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
7 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
8 * Copyright (c) 2004-2006 Robert N. M. Watson
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice unmodified, this list of conditions, and the following
16 *    disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * uma_core.c  Implementation of the Universal Memory allocator
35 *
36 * This allocator is intended to replace the multitude of similar object caches
37 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
38 * efficient.  A primary design goal is to return unused memory to the rest of
39 * the system.  This will make the system as a whole more flexible due to the
40 * ability to move memory to subsystems which most need it instead of leaving
41 * pools of reserved memory unused.
42 *
43 * The basic ideas stem from similar slab/zone based allocators whose algorithms
44 * are well known.
45 *
46 */
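/*
 * A minimal consumer-side sketch of the interface implemented here, for
 * orientation only (the zone name "foo" and the 128-byte item size are
 * invented for the example):
 *
 *	uma_zone_t zone;
 *	void *item;
 *
 *	zone = uma_zcreate("foo", 128, NULL, NULL, NULL, NULL,
 *	    UMA_ALIGN_PTR, 0);
 *	item = uma_zalloc(zone, M_WAITOK);
 *	...
 *	uma_zfree(zone, item);
 *	uma_zdestroy(zone);
 */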
47
48/*
49 * TODO:
50 *      - Improve memory usage for large allocations
51 *      - Investigate cache size adjustments
52 */
53
54#include <sys/cdefs.h>
55__FBSDID("$FreeBSD$");
56
57#include <rtems/bsd/local/opt_ddb.h>
58#include <rtems/bsd/local/opt_param.h>
59#include <rtems/bsd/local/opt_vm.h>
60
61#include <sys/param.h>
62#include <sys/systm.h>
63#include <sys/bitset.h>
64#include <sys/domainset.h>
65#include <sys/eventhandler.h>
66#include <sys/kernel.h>
67#include <sys/types.h>
68#include <sys/limits.h>
69#include <sys/queue.h>
70#include <sys/malloc.h>
71#include <sys/ktr.h>
72#include <sys/lock.h>
73#include <sys/sysctl.h>
74#include <sys/mutex.h>
75#include <sys/proc.h>
76#include <sys/random.h>
77#include <sys/rwlock.h>
78#include <sys/sbuf.h>
79#include <sys/sched.h>
80#include <sys/smp.h>
81#include <sys/taskqueue.h>
82#include <sys/vmmeter.h>
83
84#include <vm/vm.h>
85#include <vm/vm_domainset.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/vm_pageout.h>
89#include <vm/vm_param.h>
90#include <vm/vm_phys.h>
91#include <vm/vm_pagequeue.h>
92#include <vm/vm_map.h>
93#include <vm/vm_kern.h>
94#include <vm/vm_extern.h>
95#include <vm/uma.h>
96#include <vm/uma_int.h>
97#include <vm/uma_dbg.h>
98
99#include <ddb/ddb.h>
100#ifdef __rtems__
101#include <rtems/bsd/bsd.h>
102#include <rtems/malloc.h>
103#include <rtems.h>
104
105#undef CACHE_LINE_SIZE
106#define CACHE_LINE_SIZE CPU_CACHE_LINE_BYTES
107
108#ifdef RTEMS_SMP
109#include <rtems/score/smp.h>
110
111/*
112 * It is essential that we have a per-processor cache, otherwise the
113 * critical_enter()/critical_exit() protection would be insufficient.
114 */
115#undef curcpu
116#define curcpu _SMP_Get_current_processor()
117#undef mp_maxid
118#define mp_maxid (rtems_get_processor_count() - 1)
119#undef mp_ncpus
120#define mp_ncpus rtems_get_processor_count()
121#define SMP
122#endif /* RTEMS_SMP */
123#endif /* __rtems__ */
124
125#ifdef DEBUG_MEMGUARD
126#include <vm/memguard.h>
127#endif
128
129/*
130 * This is the zone and keg from which all zones are spawned.
131 */
132static uma_zone_t kegs;
133static uma_zone_t zones;
134
135/* This is the zone from which all offpage uma_slab_ts are allocated. */
136static uma_zone_t slabzone;
137
138/*
139 * The initial hash tables come out of this zone so they can be allocated
140 * prior to malloc coming up.
141 */
142static uma_zone_t hashzone;
143
144/* The boot-time adjusted value for cache line alignment. */
145int uma_align_cache = 64 - 1;
146
147static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
148
149#ifndef __rtems__
150/*
151 * Are we allowed to allocate buckets?
152 */
153static int bucketdisable = 1;
154#else /* __rtems__ */
155#define bucketdisable 0
156#endif /* __rtems__ */
157
158/* Linked list of all kegs in the system */
159static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
160
161/* Linked list of all cache-only zones in the system */
162static LIST_HEAD(,uma_zone) uma_cachezones =
163    LIST_HEAD_INITIALIZER(uma_cachezones);
164
165/* This RW lock protects the keg list */
166static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
167
168#ifndef __rtems__
169/*
170 * Pointer to, and counter of, the pool of pages preallocated at
171 * startup to bootstrap UMA.
172 */
173static char *bootmem;
174static int boot_pages;
175#endif /* __rtems__ */
176
177static struct sx uma_drain_lock;
178
179/* kmem soft limit. */
180static unsigned long uma_kmem_limit = LONG_MAX;
181static volatile unsigned long uma_kmem_total;
182
183#ifndef __rtems__
184/* Is the VM done starting up? */
185static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
186    BOOT_RUNNING } booted = BOOT_COLD;
187#endif /* __rtems__ */
188
189/*
190 * This is the handle used to schedule events that need to happen
191 * outside of the allocation fast path.
192 */
193static struct callout uma_callout;
194#define UMA_TIMEOUT     20              /* Seconds for callout interval. */
195
196/*
197 * This structure is passed as the zone ctor arg so that I don't have to create
198 * a special allocation function just for zones.
199 */
200struct uma_zctor_args {
201        const char *name;
202        size_t size;
203        uma_ctor ctor;
204        uma_dtor dtor;
205        uma_init uminit;
206        uma_fini fini;
207        uma_import import;
208        uma_release release;
209        void *arg;
210        uma_keg_t keg;
211        int align;
212        uint32_t flags;
213};
214
215struct uma_kctor_args {
216        uma_zone_t zone;
217        size_t size;
218        uma_init uminit;
219        uma_fini fini;
220        int align;
221        uint32_t flags;
222};
223
224struct uma_bucket_zone {
225        uma_zone_t      ubz_zone;
226        char            *ubz_name;
227        int             ubz_entries;    /* Number of items it can hold. */
228        int             ubz_maxsize;    /* Maximum allocation size per-item. */
229};
230
231/*
232 * Compute the actual number of bucket entries to pack them in power
233 * of two sizes for more efficient space utilization.
234 */
235#define BUCKET_SIZE(n)                                          \
236    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
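/*
 * As a concrete illustration of the macro above (assuming, purely for the
 * sake of the arithmetic, 8-byte pointers and a 24-byte struct uma_bucket
 * header): BUCKET_SIZE(16) = (8 * 16 - 24) / 8 = 13, i.e. the "16 Bucket"
 * zone below actually stores 13 item pointers, so that the header plus the
 * pointer array fit in exactly 16 pointer-sized words.
 */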
237
238#ifndef __rtems__
239#define BUCKET_MAX      BUCKET_SIZE(256)
240#else /* __rtems__ */
241#define BUCKET_MAX      BUCKET_SIZE(128)
242#endif /* __rtems__ */
243
244struct uma_bucket_zone bucket_zones[] = {
245        { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
246        { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
247        { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
248        { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
249        { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
250        { NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
251        { NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
252        { NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
253#ifndef __rtems__
254        { NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
255#endif /* __rtems__ */
256        { NULL, NULL, 0}
257};
258
259/*
260 * Flags and enumerations to be passed to internal functions.
261 */
262enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
263
264#define UMA_ANYDOMAIN   -1      /* Special value for domain search. */
265
266/* Prototypes.. */
267
268#ifndef __rtems__
269int     uma_startup_count(int);
270#endif /* __rtems__ */
271void    uma_startup(void *, int);
272#ifndef __rtems__
273void    uma_startup1(void);
274void    uma_startup2(void);
275#endif /* __rtems__ */
276
277#ifndef __rtems__
278static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
279#endif /* __rtems__ */
280static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
281#ifndef __rtems__
282static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
283static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
284#endif /* __rtems__ */
285static void page_free(void *, vm_size_t, uint8_t);
286#ifndef __rtems__
287static void pcpu_page_free(void *, vm_size_t, uint8_t);
288#endif /* __rtems__ */
289static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
290static void cache_drain(uma_zone_t);
291static void bucket_drain(uma_zone_t, uma_bucket_t);
292static void bucket_cache_drain(uma_zone_t zone);
293static int keg_ctor(void *, int, void *, int);
294static void keg_dtor(void *, int, void *);
295static int zone_ctor(void *, int, void *, int);
296static void zone_dtor(void *, int, void *);
297static int zero_init(void *, int, int);
298static void keg_small_init(uma_keg_t keg);
299static void keg_large_init(uma_keg_t keg);
300static void zone_foreach(void (*zfunc)(uma_zone_t));
301static void zone_timeout(uma_zone_t zone);
302static int hash_alloc(struct uma_hash *);
303static int hash_expand(struct uma_hash *, struct uma_hash *);
304static void hash_free(struct uma_hash *hash);
305static void uma_timeout(void *);
306static void uma_startup3(void);
307static void *zone_alloc_item(uma_zone_t, void *, int, int);
308static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
309static void bucket_enable(void);
310static void bucket_init(void);
311static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
312static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
313static void bucket_zone_drain(void);
314static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
315static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
316#ifndef __rtems__
317static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
318#endif /* __rtems__ */
319static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
320static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
321static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
322    uma_fini fini, int align, uint32_t flags);
323static int zone_import(uma_zone_t, void **, int, int, int);
324static void zone_release(uma_zone_t, void **, int);
325static void uma_zero_item(void *, uma_zone_t);
326
327void uma_print_zone(uma_zone_t);
328void uma_print_stats(void);
329static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
330static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
331
332#ifdef INVARIANTS
333static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
334static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
335static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
336static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
337
338static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
339    "Memory allocation debugging");
340
341static u_int dbg_divisor = 1;
342SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
343    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
344    "Debug & thrash every this item in memory allocator");
345
346static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
347static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
348SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
349    &uma_dbg_cnt, "memory items debugged");
350SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
351    &uma_skip_cnt, "memory items skipped, not debugged");
352#endif
353
354SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
355
356SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
357    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
358
359SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
360    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
361
362static int zone_warnings = 1;
363SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
364    "Warn when UMA zones becomes full");
365
366/* Adjust bytes under management by UMA. */
367static inline void
368uma_total_dec(unsigned long size)
369{
370
371        atomic_subtract_long(&uma_kmem_total, size);
372}
373
374static inline void
375uma_total_inc(unsigned long size)
376{
377
378        if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
379                uma_reclaim_wakeup();
380}
381
382/*
383 * This routine checks to see whether or not it's safe to enable buckets.
384 */
385static void
386bucket_enable(void)
387{
388#ifndef __rtems__
389        bucketdisable = vm_page_count_min();
390#endif /* __rtems__ */
391}
392
393/*
394 * Initialize bucket_zones, the array of zones of buckets of various sizes.
395 *
396 * For each zone, calculate the memory required for each bucket, consisting
397 * of the header and an array of pointers.
398 */
399static void
400bucket_init(void)
401{
402        struct uma_bucket_zone *ubz;
403        int size;
404
405        for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
406                size = roundup(sizeof(struct uma_bucket), sizeof(void *));
407                size += sizeof(void *) * ubz->ubz_entries;
408                ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
409                    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
410#ifndef __rtems__
411                    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
412#else /* __rtems__ */
413                    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
414#endif /* __rtems__ */
415        }
416}
417
418/*
419 * Given a desired number of entries for a bucket, return the zone from which
420 * to allocate the bucket.
421 */
422static struct uma_bucket_zone *
423bucket_zone_lookup(int entries)
424{
425        struct uma_bucket_zone *ubz;
426
427        for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
428                if (ubz->ubz_entries >= entries)
429                        return (ubz);
430        ubz--;
431        return (ubz);
432}
433
434static int
435bucket_select(int size)
436{
437        struct uma_bucket_zone *ubz;
438
439        ubz = &bucket_zones[0];
440        if (size > ubz->ubz_maxsize)
441                return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
442
443        for (; ubz->ubz_entries != 0; ubz++)
444                if (ubz->ubz_maxsize < size)
445                        break;
446        ubz--;
447        return (ubz->ubz_entries);
448}
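/*
 * A worked example of the two lookups above, using the bucket_zones[]
 * table as defined in this file and assuming 8-byte pointers:
 * bucket_select(700) walks the table until it hits the first entry whose
 * ubz_maxsize is below 700 (the "32 Bucket" zone, maxsize 512), steps back
 * one entry and returns the capacity of the "16 Bucket" zone.
 * bucket_zone_lookup(20) returns the first zone that can hold at least 20
 * items, which is the "32 Bucket" zone.  For item sizes above 4096 bytes,
 * bucket_select() instead scales the smallest bucket's capacity down in
 * proportion to the item size, but never below a single item.
 */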
449
450static uma_bucket_t
451bucket_alloc(uma_zone_t zone, void *udata, int flags)
452{
453        struct uma_bucket_zone *ubz;
454        uma_bucket_t bucket;
455
456#ifndef __rtems__
457        /*
458         * This is to stop us from allocating per cpu buckets while we're
459         * running out of vm.boot_pages.  Otherwise, we would exhaust the
460         * boot pages.  This also prevents us from allocating buckets in
461         * low memory situations.
462         */
463        if (bucketdisable)
464                return (NULL);
465#endif /* __rtems__ */
466        /*
467         * To limit bucket recursion we store the original zone flags
468         * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
469         * NOVM flag to persist even through deep recursions.  We also
470         * store ZFLAG_BUCKET once we have recursed attempting to allocate
471         * a bucket for a bucket zone so we do not allow infinite bucket
472         * recursion.  This cookie will even persist to frees of unused
473         * buckets via the allocation path or bucket allocations in the
474         * free path.
475         */
476        if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
477                udata = (void *)(uintptr_t)zone->uz_flags;
478        else {
479                if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
480                        return (NULL);
481                udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
482        }
483        if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
484                flags |= M_NOVM;
485        ubz = bucket_zone_lookup(zone->uz_count);
486        if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
487                ubz++;
488        bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
489        if (bucket) {
490#ifdef INVARIANTS
491                bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
492#endif
493                bucket->ub_cnt = 0;
494                bucket->ub_entries = ubz->ubz_entries;
495        }
496
497        return (bucket);
498}
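/*
 * To make the cookie handling above concrete: when a bucket zone itself
 * calls here to get a bucket, the first level of nesting tags udata with
 * UMA_ZFLAG_BUCKET; if that nested allocation ends up back here again, the
 * tag is already set and NULL is returned, so the caller simply proceeds
 * without a bucket instead of recursing without bound.
 */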
499
500static void
501bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
502{
503        struct uma_bucket_zone *ubz;
504
505        KASSERT(bucket->ub_cnt == 0,
506            ("bucket_free: Freeing a non free bucket."));
507        if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
508                udata = (void *)(uintptr_t)zone->uz_flags;
509        ubz = bucket_zone_lookup(bucket->ub_entries);
510        uma_zfree_arg(ubz->ubz_zone, bucket, udata);
511}
512
513static void
514bucket_zone_drain(void)
515{
516        struct uma_bucket_zone *ubz;
517
518        for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
519                zone_drain(ubz->ubz_zone);
520}
521
522static uma_bucket_t
523zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws)
524{
525        uma_bucket_t bucket;
526
527        ZONE_LOCK_ASSERT(zone);
528
529        if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
530                MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
531                LIST_REMOVE(bucket, ub_link);
532                zdom->uzd_nitems -= bucket->ub_cnt;
533                if (ws && zdom->uzd_imin > zdom->uzd_nitems)
534                        zdom->uzd_imin = zdom->uzd_nitems;
535        }
536        return (bucket);
537}
538
539static void
540zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
541    const bool ws)
542{
543
544        ZONE_LOCK_ASSERT(zone);
545
546        LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
547        zdom->uzd_nitems += bucket->ub_cnt;
548        if (ws && zdom->uzd_imax < zdom->uzd_nitems)
549                zdom->uzd_imax = zdom->uzd_nitems;
550}
551
552static void
553zone_log_warning(uma_zone_t zone)
554{
555        static const struct timeval warninterval = { 300, 0 };
556
557        if (!zone_warnings || zone->uz_warning == NULL)
558                return;
559
560        if (ratecheck(&zone->uz_ratecheck, &warninterval))
561                printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
562}
563
564static inline void
565zone_maxaction(uma_zone_t zone)
566{
567
568        if (zone->uz_maxaction.ta_func != NULL)
569                taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
570}
571
572static void
573zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
574{
575        uma_klink_t klink;
576
577        LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
578                kegfn(klink->kl_keg);
579}
580
581/*
582 * Routine called by timeout which is used to fire off some time interval
583 * based calculations.  (stats, hash size, etc.)
584 *
585 * Arguments:
586 *      arg   Unused
587 *
588 * Returns:
589 *      Nothing
590 */
591static void
592uma_timeout(void *unused)
593{
594        bucket_enable();
595        zone_foreach(zone_timeout);
596
597        /* Reschedule this event */
598        callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
599}
600
601/*
602 * Update the working set size estimate for the zone's bucket cache.
603 * The constants chosen here are somewhat arbitrary.  With an update period of
604 * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
605 * last 100s.
606 */
607static void
608zone_domain_update_wss(uma_zone_domain_t zdom)
609{
610        long wss;
611
612        MPASS(zdom->uzd_imax >= zdom->uzd_imin);
613        wss = zdom->uzd_imax - zdom->uzd_imin;
614        zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
615        zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5;
616}
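/*
 * Worked example of the estimator above: if the previous estimate uzd_wss
 * was 1000 items and the cached item count moved between uzd_imin = 200
 * and uzd_imax = 500 during the last interval, the new estimate becomes
 * (3 * 300 + 2 * 1000) / 5 = 580 items.  Weighting the newest interval by
 * 3/5 is what makes roughly the last five 20-second intervals dominate.
 */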
617
618/*
619 * Routine to perform timeout driven calculations.  This expands the
620 * hashes and does per cpu statistics aggregation.
621 *
622 *  Returns nothing.
623 */
624static void
625keg_timeout(uma_keg_t keg)
626{
627
628        KEG_LOCK(keg);
629        /*
630         * Expand the keg hash table.
631         *
632         * This is done if the number of slabs is larger than the hash size.
633         * What I'm trying to do here is completely avoid collisions.  This
634         * may be a little aggressive.  Should I allow for two collisions max?
635         */
636        if (keg->uk_flags & UMA_ZONE_HASH &&
637            keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
638                struct uma_hash newhash;
639                struct uma_hash oldhash;
640                int ret;
641
642                /*
643                 * This is so involved because allocating and freeing
644                 * while the keg lock is held will lead to deadlock.
645                 * I have to do everything in stages and check for
646                 * races.
647                 */
648                newhash = keg->uk_hash;
649                KEG_UNLOCK(keg);
650                ret = hash_alloc(&newhash);
651                KEG_LOCK(keg);
652                if (ret) {
653                        if (hash_expand(&keg->uk_hash, &newhash)) {
654                                oldhash = keg->uk_hash;
655                                keg->uk_hash = newhash;
656                        } else
657                                oldhash = newhash;
658
659                        KEG_UNLOCK(keg);
660                        hash_free(&oldhash);
661                        return;
662                }
663        }
664        KEG_UNLOCK(keg);
665}
666
667static void
668zone_timeout(uma_zone_t zone)
669{
670        int i;
671
672        zone_foreach_keg(zone, &keg_timeout);
673
674        ZONE_LOCK(zone);
675        for (i = 0; i < vm_ndomains; i++)
676                zone_domain_update_wss(&zone->uz_domain[i]);
677        ZONE_UNLOCK(zone);
678}
679
680/*
681 * Allocate and zero fill the next sized hash table from the appropriate
682 * backing store.
683 *
684 * Arguments:
685 *      hash  A new hash structure with the old hash size in uh_hashsize
686 *
687 * Returns:
688 *      1 on success and 0 on failure.
689 */
690static int
691hash_alloc(struct uma_hash *hash)
692{
693        int oldsize;
694        int alloc;
695
696        oldsize = hash->uh_hashsize;
697
698        /* We're just going to go to a power of two greater */
699        if (oldsize)  {
700                hash->uh_hashsize = oldsize * 2;
701                alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
702                hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
703                    M_UMAHASH, M_NOWAIT);
704        } else {
705                alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
706                hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
707                    UMA_ANYDOMAIN, M_WAITOK);
708                hash->uh_hashsize = UMA_HASH_SIZE_INIT;
709        }
710        if (hash->uh_slab_hash) {
711                bzero(hash->uh_slab_hash, alloc);
712                hash->uh_hashmask = hash->uh_hashsize - 1;
713                return (1);
714        }
715
716        return (0);
717}
718
719/*
720 * Expands the hash table for HASH zones.  This is done from zone_timeout
721 * to reduce collisions.  This must not be done in the regular allocation
722 * path, otherwise, we can recurse on the vm while allocating pages.
723 *
724 * Arguments:
725 *      oldhash  The hash you want to expand
726 *      newhash  The hash structure for the new table
727 *
728 * Returns:
729 *      Nothing
730 *
731 * Discussion:
732 */
733static int
734hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
735{
736        uma_slab_t slab;
737        int hval;
738        int i;
739
740        if (!newhash->uh_slab_hash)
741                return (0);
742
743        if (oldhash->uh_hashsize >= newhash->uh_hashsize)
744                return (0);
745
746        /*
747         * I need to investigate hash algorithms for resizing without a
748         * full rehash.
749         */
750
751        for (i = 0; i < oldhash->uh_hashsize; i++)
752                while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
753                        slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
754                        SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
755                        hval = UMA_HASH(newhash, slab->us_data);
756                        SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
757                            slab, us_hlink);
758                }
759
760        return (1);
761}
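/*
 * A short example of the resize driven from keg_timeout(): once a keg
 * manages at least as many slabs as it has hash buckets (initially
 * UMA_HASH_SIZE_INIT), hash_alloc() builds a table twice as large outside
 * the keg lock, and hash_expand() then walks every old chain, recomputes
 * UMA_HASH(newhash, slab->us_data) for each slab and moves it to the new
 * head, after which the old table is released via hash_free().
 */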
762
763/*
764 * Free the hash bucket array to the appropriate backing store.
765 *
766 * Arguments:
767 *      hash  The hash structure whose bucket array we're freeing;
768 *            its uh_hashsize selects the backing store to free to
769 *
770 * Returns:
771 *      Nothing
772 */
773static void
774hash_free(struct uma_hash *hash)
775{
776        if (hash->uh_slab_hash == NULL)
777                return;
778        if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
779                zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
780        else
781                free(hash->uh_slab_hash, M_UMAHASH);
782}
783
784/*
785 * Frees all outstanding items in a bucket
786 *
787 * Arguments:
788 *      zone   The zone to free to, must be unlocked.
789 *      bucket The free/alloc bucket with items, cpu queue must be locked.
790 *
791 * Returns:
792 *      Nothing
793 */
794
795static void
796bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
797{
798        int i;
799
800        if (bucket == NULL)
801                return;
802
803        if (zone->uz_fini)
804                for (i = 0; i < bucket->ub_cnt; i++)
805                        zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
806        zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
807        bucket->ub_cnt = 0;
808}
809
810/*
811 * Drains the per cpu caches for a zone.
812 *
813 * NOTE: This may only be called while the zone is being torn down, and not
814 * during normal operation.  This is necessary in order that we do not have
815 * to migrate CPUs to drain the per-CPU caches.
816 *
817 * Arguments:
818 *      zone     The zone to drain, must be unlocked.
819 *
820 * Returns:
821 *      Nothing
822 */
823static void
824cache_drain(uma_zone_t zone)
825{
826        uma_cache_t cache;
827        int cpu;
828
829        /*
830         * XXX: It is safe to not lock the per-CPU caches, because we're
831         * tearing down the zone anyway.  I.e., there will be no further use
832         * of the caches at this point.
833         *
834 * XXX: It would be good to be able to assert that the zone is being
835         * torn down to prevent improper use of cache_drain().
836         *
837         * XXX: We lock the zone before passing into bucket_cache_drain() as
838         * it is used elsewhere.  Should the tear-down path be made special
839         * there in some form?
840         */
841        CPU_FOREACH(cpu) {
842                cache = &zone->uz_cpu[cpu];
843                bucket_drain(zone, cache->uc_allocbucket);
844                bucket_drain(zone, cache->uc_freebucket);
845                if (cache->uc_allocbucket != NULL)
846                        bucket_free(zone, cache->uc_allocbucket, NULL);
847                if (cache->uc_freebucket != NULL)
848                        bucket_free(zone, cache->uc_freebucket, NULL);
849                cache->uc_allocbucket = cache->uc_freebucket = NULL;
850        }
851        ZONE_LOCK(zone);
852        bucket_cache_drain(zone);
853        ZONE_UNLOCK(zone);
854}
855
856#ifndef __rtems__
857static void
858cache_shrink(uma_zone_t zone)
859{
860
861        if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
862                return;
863
864        ZONE_LOCK(zone);
865        zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
866        ZONE_UNLOCK(zone);
867}
868
869static void
870cache_drain_safe_cpu(uma_zone_t zone)
871{
872        uma_cache_t cache;
873        uma_bucket_t b1, b2;
874        int domain;
875
876        if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
877                return;
878
879        b1 = b2 = NULL;
880        ZONE_LOCK(zone);
881        critical_enter();
882#ifndef __rtems__
883        if (zone->uz_flags & UMA_ZONE_NUMA)
884                domain = PCPU_GET(domain);
885        else
886#endif /* __rtems__ */
887                domain = 0;
888        cache = &zone->uz_cpu[curcpu];
889        if (cache->uc_allocbucket) {
890                if (cache->uc_allocbucket->ub_cnt != 0)
891                        zone_put_bucket(zone, &zone->uz_domain[domain],
892                            cache->uc_allocbucket, false);
893                else
894                        b1 = cache->uc_allocbucket;
895                cache->uc_allocbucket = NULL;
896        }
897        if (cache->uc_freebucket) {
898                if (cache->uc_freebucket->ub_cnt != 0)
899                        zone_put_bucket(zone, &zone->uz_domain[domain],
900                            cache->uc_freebucket, false);
901                else
902                        b2 = cache->uc_freebucket;
903                cache->uc_freebucket = NULL;
904        }
905        critical_exit();
906        ZONE_UNLOCK(zone);
907        if (b1)
908                bucket_free(zone, b1, NULL);
909        if (b2)
910                bucket_free(zone, b2, NULL);
911}
912
913/*
914 * Safely drain the per-CPU caches of a zone (or of all zones) into the
915 * zone's bucket cache.  This is an expensive call because it needs to
916 * bind to all CPUs one by one and enter a critical section on each of
917 * them in order to safely access their cache buckets.
918 * The zone lock must not be held when calling this function.
919 */
920static void
921cache_drain_safe(uma_zone_t zone)
922{
923        int cpu;
924
925        /*
926         * Polite bucket size shrinking was not enough, shrink aggressively.
927         */
928        if (zone)
929                cache_shrink(zone);
930        else
931                zone_foreach(cache_shrink);
932
933        CPU_FOREACH(cpu) {
934                thread_lock(curthread);
935                sched_bind(curthread, cpu);
936                thread_unlock(curthread);
937
938                if (zone)
939                        cache_drain_safe_cpu(zone);
940                else
941                        zone_foreach(cache_drain_safe_cpu);
942        }
943        thread_lock(curthread);
944        sched_unbind(curthread);
945        thread_unlock(curthread);
946}
947#endif /* __rtems__ */
948
949/*
950 * Drain the cached buckets from a zone.  Expects a locked zone on entry.
951 */
952static void
953bucket_cache_drain(uma_zone_t zone)
954{
955        uma_zone_domain_t zdom;
956        uma_bucket_t bucket;
957        int i;
958
959        /*
960         * Drain the bucket queues and free the buckets.
961         */
962        for (i = 0; i < vm_ndomains; i++) {
963                zdom = &zone->uz_domain[i];
964                while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) !=
965                    NULL) {
966                        ZONE_UNLOCK(zone);
967                        bucket_drain(zone, bucket);
968                        bucket_free(zone, bucket, NULL);
969                        ZONE_LOCK(zone);
970                }
971        }
972
973        /*
974         * Shrink further bucket sizes.  The price of a single zone lock
975         * collision is probably lower than that of a global cache drain.
976         */
977        if (zone->uz_count > zone->uz_count_min)
978                zone->uz_count--;
979}
980
981static void
982keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
983{
984        uint8_t *mem;
985        int i;
986        uint8_t flags;
987
988        CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
989            keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
990
991        mem = slab->us_data;
992        flags = slab->us_flags;
993        i = start;
994        if (keg->uk_fini != NULL) {
995                for (i--; i > -1; i--)
996#ifdef INVARIANTS
997                /*
998                 * trash_fini implies that dtor was trash_dtor. trash_fini
999                 * would check that memory hasn't been modified since free,
1000                 * which executed trash_dtor.
1001                 * That's why we need to run uma_dbg_kskip() check here,
1002                 * albeit we don't make skip check for other init/fini
1003                 * invocations.
1004                 */
1005                if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
1006                    keg->uk_fini != trash_fini)
1007#endif
1008                        keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
1009                            keg->uk_size);
1010        }
1011        if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1012                zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1013        keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
1014        uma_total_dec(PAGE_SIZE * keg->uk_ppera);
1015}
1016
1017/*
1018 * Frees pages from a keg back to the system.  This is done on demand from
1019 * the pageout daemon.
1020 *
1021 * Returns nothing.
1022 */
1023static void
1024keg_drain(uma_keg_t keg)
1025{
1026        struct slabhead freeslabs = { 0 };
1027        uma_domain_t dom;
1028        uma_slab_t slab, tmp;
1029        int i;
1030
1031        /*
1032         * We don't want to take pages from statically allocated kegs at this
1033         * time
1034         */
1035        if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
1036                return;
1037
1038        CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
1039            keg->uk_name, keg, keg->uk_free);
1040        KEG_LOCK(keg);
1041        if (keg->uk_free == 0)
1042                goto finished;
1043
1044        for (i = 0; i < vm_ndomains; i++) {
1045                dom = &keg->uk_domain[i];
1046                LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
1047#ifndef __rtems__
1048                        /* We have nowhere to free these to. */
1049                        if (slab->us_flags & UMA_SLAB_BOOT)
1050                                continue;
1051#endif /* __rtems__ */
1052
1053                        LIST_REMOVE(slab, us_link);
1054                        keg->uk_pages -= keg->uk_ppera;
1055                        keg->uk_free -= keg->uk_ipers;
1056
1057                        if (keg->uk_flags & UMA_ZONE_HASH)
1058                                UMA_HASH_REMOVE(&keg->uk_hash, slab,
1059                                    slab->us_data);
1060
1061                        SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
1062                }
1063        }
1064
1065finished:
1066        KEG_UNLOCK(keg);
1067
1068        while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
1069                SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
1070                keg_free_slab(keg, slab, keg->uk_ipers);
1071        }
1072}
1073
1074static void
1075zone_drain_wait(uma_zone_t zone, int waitok)
1076{
1077
1078        /*
1079         * Set draining to interlock with zone_dtor() so we can release our
1080         * locks as we go.  Only dtor() should do a WAITOK call since it
1081         * is the only call that knows the structure will still be available
1082         * when it wakes up.
1083         */
1084        ZONE_LOCK(zone);
1085        while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
1086                if (waitok == M_NOWAIT)
1087                        goto out;
1088                msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
1089        }
1090        zone->uz_flags |= UMA_ZFLAG_DRAINING;
1091        bucket_cache_drain(zone);
1092        ZONE_UNLOCK(zone);
1093        /*
1094         * The DRAINING flag protects us from being freed while
1095         * we're running.  Normally the uma_rwlock would protect us but we
1096         * must be able to release and acquire the right lock for each keg.
1097         */
1098        zone_foreach_keg(zone, &keg_drain);
1099        ZONE_LOCK(zone);
1100        zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
1101        wakeup(zone);
1102out:
1103        ZONE_UNLOCK(zone);
1104}
1105
1106void
1107zone_drain(uma_zone_t zone)
1108{
1109
1110        zone_drain_wait(zone, M_NOWAIT);
1111}
1112
1113/*
1114 * Allocate a new slab for a keg.  This does not insert the slab onto a list.
1115 * If the allocation was successful, the keg lock will be held upon return,
1116 * otherwise the keg will be left unlocked.
1117 *
1118 * Arguments:
1119 *      wait  Shall we wait?
1120 *
1121 * Returns:
1122 *      The slab that was allocated or NULL if there is no memory and the
1123 *      caller specified M_NOWAIT.
1124 */
1125static uma_slab_t
1126keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait)
1127{
1128        uma_alloc allocf;
1129        uma_slab_t slab;
1130        unsigned long size;
1131        uint8_t *mem;
1132        uint8_t flags;
1133        int i;
1134
1135        KASSERT(domain >= 0 && domain < vm_ndomains,
1136            ("keg_alloc_slab: domain %d out of range", domain));
1137        mtx_assert(&keg->uk_lock, MA_OWNED);
1138
1139        allocf = keg->uk_allocf;
1140        KEG_UNLOCK(keg);
1141
1142        slab = NULL;
1143        mem = NULL;
1144        if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1145                slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait);
1146                if (slab == NULL)
1147                        goto out;
1148        }
1149
1150        /*
1151         * This reproduces the old vm_zone behavior of zero filling pages the
1152         * first time they are added to a zone.
1153         *
1154         * Malloced items are zeroed in uma_zalloc.
1155         */
1156
1157        if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1158                wait |= M_ZERO;
1159        else
1160                wait &= ~M_ZERO;
1161
1162        if (keg->uk_flags & UMA_ZONE_NODUMP)
1163                wait |= M_NODUMP;
1164
1165        /* zone is passed for legacy reasons. */
1166        size = keg->uk_ppera * PAGE_SIZE;
1167        mem = allocf(zone, size, domain, &flags, wait);
1168        if (mem == NULL) {
1169                if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1170                        zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1171                slab = NULL;
1172                goto out;
1173        }
1174        uma_total_inc(size);
1175
1176        /* Point the slab into the allocated memory */
1177        if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1178                slab = (uma_slab_t )(mem + keg->uk_pgoff);
1179
1180        if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1181                for (i = 0; i < keg->uk_ppera; i++)
1182                        vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1183
1184        slab->us_keg = keg;
1185        slab->us_data = mem;
1186        slab->us_freecount = keg->uk_ipers;
1187        slab->us_flags = flags;
1188        slab->us_domain = domain;
1189        BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1190#ifdef INVARIANTS
1191        BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1192#endif
1193
1194        if (keg->uk_init != NULL) {
1195                for (i = 0; i < keg->uk_ipers; i++)
1196                        if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1197                            keg->uk_size, wait) != 0)
1198                                break;
1199                if (i != keg->uk_ipers) {
1200                        keg_free_slab(keg, slab, i);
1201                        slab = NULL;
1202                        goto out;
1203                }
1204        }
1205        KEG_LOCK(keg);
1206
1207        CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1208            slab, keg->uk_name, keg);
1209
1210        if (keg->uk_flags & UMA_ZONE_HASH)
1211                UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1212
1213        keg->uk_pages += keg->uk_ppera;
1214        keg->uk_free += keg->uk_ipers;
1215
1216out:
1217        return (slab);
1218}
1219
1220#ifndef __rtems__
1221/*
1222 * This function is intended to be used early on in place of page_alloc() so
1223 * that we may use the boot time page cache to satisfy allocations before
1224 * the VM is ready.
1225 */
1226static void *
1227startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1228    int wait)
1229{
1230        uma_keg_t keg;
1231        void *mem;
1232        int pages;
1233
1234        keg = zone_first_keg(zone);
1235
1236        /*
1237         * If we are in BOOT_BUCKETS or higher, then switch to the real
1238         * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1239         */
1240        switch (booted) {
1241                case BOOT_COLD:
1242                case BOOT_STRAPPED:
1243                        break;
1244                case BOOT_PAGEALLOC:
1245                        if (keg->uk_ppera > 1)
1246                                break;
1247                case BOOT_BUCKETS:
1248                case BOOT_RUNNING:
1249#ifdef UMA_MD_SMALL_ALLOC
1250                        keg->uk_allocf = (keg->uk_ppera > 1) ?
1251                            page_alloc : uma_small_alloc;
1252#else
1253                        keg->uk_allocf = page_alloc;
1254#endif
1255                        return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1256        }
1257
1258        /*
1259         * Check our small startup cache to see if it has pages remaining.
1260         */
1261        pages = howmany(bytes, PAGE_SIZE);
1262        KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1263        if (pages > boot_pages)
1264                panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1265#ifdef DIAGNOSTIC
1266        printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1267            boot_pages);
1268#endif
1269        mem = bootmem;
1270        boot_pages -= pages;
1271        bootmem += pages * PAGE_SIZE;
1272        *pflag = UMA_SLAB_BOOT;
1273
1274        return (mem);
1275}
1276#endif /* __rtems__ */
1277
1278/*
1279 * Allocates a number of pages from the system
1280 *
1281 * Arguments:
1282 *      bytes  The number of bytes requested
1283 *      wait  Shall we wait?
1284 *
1285 * Returns:
1286 *      A pointer to the alloced memory or possibly
1287 *      NULL if M_NOWAIT is set.
1288 */
1289static void *
1290page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1291    int wait)
1292{
1293        void *p;        /* Returned page */
1294
1295#ifndef __rtems__
1296        *pflag = UMA_SLAB_KERNEL;
1297        p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1298#else /* __rtems__ */
1299        *pflag = 0;
1300        p = rtems_bsd_page_alloc(bytes, wait);
1301#endif /* __rtems__ */
1302
1303        return (p);
1304}
1305
1306#ifndef __rtems__
1307static void *
1308pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1309    int wait)
1310{
1311        struct pglist alloctail;
1312        vm_offset_t addr, zkva;
1313        int cpu, flags;
1314        vm_page_t p, p_next;
1315#ifdef NUMA
1316        struct pcpu *pc;
1317#endif
1318
1319        MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1320
1321        TAILQ_INIT(&alloctail);
1322        flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1323            malloc2vm_flags(wait);
1324        *pflag = UMA_SLAB_KERNEL;
1325        for (cpu = 0; cpu <= mp_maxid; cpu++) {
1326                if (CPU_ABSENT(cpu)) {
1327                        p = vm_page_alloc(NULL, 0, flags);
1328                } else {
1329#ifndef NUMA
1330                        p = vm_page_alloc(NULL, 0, flags);
1331#else
1332                        pc = pcpu_find(cpu);
1333                        p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1334                        if (__predict_false(p == NULL))
1335                                p = vm_page_alloc(NULL, 0, flags);
1336#endif
1337                }
1338                if (__predict_false(p == NULL))
1339                        goto fail;
1340                TAILQ_INSERT_TAIL(&alloctail, p, listq);
1341        }
1342        if ((addr = kva_alloc(bytes)) == 0)
1343                goto fail;
1344        zkva = addr;
1345        TAILQ_FOREACH(p, &alloctail, listq) {
1346                pmap_qenter(zkva, &p, 1);
1347                zkva += PAGE_SIZE;
1348        }
1349        return ((void*)addr);
1350 fail:
1351        TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1352                vm_page_unwire(p, PQ_NONE);
1353                vm_page_free(p);
1354        }
1355        return (NULL);
1356}
1357
1358/*
1359 * Allocates a number of pages not belonging to a VM object
1360 *
1361 * Arguments:
1362 *      bytes  The number of bytes requested
1363 *      wait   Shall we wait?
1364 *
1365 * Returns:
1366 *      A pointer to the alloced memory or possibly
1367 *      NULL if M_NOWAIT is set.
1368 */
1369static void *
1370noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1371    int wait)
1372{
1373        TAILQ_HEAD(, vm_page) alloctail;
1374        u_long npages;
1375        vm_offset_t retkva, zkva;
1376        vm_page_t p, p_next;
1377        uma_keg_t keg;
1378
1379        TAILQ_INIT(&alloctail);
1380        keg = zone_first_keg(zone);
1381
1382        npages = howmany(bytes, PAGE_SIZE);
1383        while (npages > 0) {
1384                p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1385                    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1386                    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1387                    VM_ALLOC_NOWAIT));
1388                if (p != NULL) {
1389                        /*
1390                         * Since the page does not belong to an object, its
1391                         * listq is unused.
1392                         */
1393                        TAILQ_INSERT_TAIL(&alloctail, p, listq);
1394                        npages--;
1395                        continue;
1396                }
1397                /*
1398                 * Page allocation failed, free intermediate pages and
1399                 * exit.
1400                 */
1401                TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1402                        vm_page_unwire(p, PQ_NONE);
1403                        vm_page_free(p);
1404                }
1405                return (NULL);
1406        }
1407        *flags = UMA_SLAB_PRIV;
1408        zkva = keg->uk_kva +
1409            atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1410        retkva = zkva;
1411        TAILQ_FOREACH(p, &alloctail, listq) {
1412                pmap_qenter(zkva, &p, 1);
1413                zkva += PAGE_SIZE;
1414        }
1415
1416        return ((void *)retkva);
1417}
1418#endif /* __rtems__ */
1419
1420/*
1421 * Frees a number of pages to the system
1422 *
1423 * Arguments:
1424 *      mem   A pointer to the memory to be freed
1425 *      size  The size of the memory being freed
1426 *      flags The original p->us_flags field
1427 *
1428 * Returns:
1429 *      Nothing
1430 */
1431static void
1432page_free(void *mem, vm_size_t size, uint8_t flags)
1433{
1434#ifndef __rtems__
1435
1436        if ((flags & UMA_SLAB_KERNEL) == 0)
1437                panic("UMA: page_free used with invalid flags %x", flags);
1438
1439        kmem_free((vm_offset_t)mem, size);
1440#else /* __rtems__ */
1441        if (flags & UMA_SLAB_KERNEL)
1442                free(mem, M_TEMP);
1443        else
1444                rtems_bsd_page_free(mem);
1445#endif /* __rtems__ */
1446}
1447
1448#ifndef __rtems__
1449/*
1450 * Frees pcpu zone allocations
1451 *
1452 * Arguments:
1453 *      mem   A pointer to the memory to be freed
1454 *      size  The size of the memory being freed
1455 *      flags The original p->us_flags field
1456 *
1457 * Returns:
1458 *      Nothing
1459 */
1460static void
1461pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1462{
1463        vm_offset_t sva, curva;
1464        vm_paddr_t paddr;
1465        vm_page_t m;
1466
1467        MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1468        sva = (vm_offset_t)mem;
1469        for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1470                paddr = pmap_kextract(curva);
1471                m = PHYS_TO_VM_PAGE(paddr);
1472                vm_page_unwire(m, PQ_NONE);
1473                vm_page_free(m);
1474        }
1475        pmap_qremove(sva, size >> PAGE_SHIFT);
1476        kva_free(sva, size);
1477}
1478#endif /* __rtems__ */
1479
1480
1481/*
1482 * Zero fill initializer
1483 *
1484 * Arguments/Returns follow uma_init specifications
1485 */
1486static int
1487zero_init(void *mem, int size, int flags)
1488{
1489        bzero(mem, size);
1490        return (0);
1491}
1492
1493/*
1494 * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1495 *
1496 * Arguments
1497 *      keg  The keg we should initialize
1498 *
1499 * Returns
1500 *      Nothing
1501 */
1502static void
1503keg_small_init(uma_keg_t keg)
1504{
1505        u_int rsize;
1506        u_int memused;
1507        u_int wastedspace;
1508        u_int shsize;
1509        u_int slabsize;
1510
1511        if (keg->uk_flags & UMA_ZONE_PCPU) {
1512                u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1513
1514                slabsize = UMA_PCPU_ALLOC_SIZE;
1515                keg->uk_ppera = ncpus;
1516        } else {
1517                slabsize = UMA_SLAB_SIZE;
1518                keg->uk_ppera = 1;
1519        }
1520
1521        /*
1522         * Calculate the size of each allocation (rsize) according to
1523         * alignment.  If the requested size is smaller than we have
1524         * allocation bits for we round it up.
1525         */
1526        rsize = keg->uk_size;
1527        if (rsize < slabsize / SLAB_SETSIZE)
1528                rsize = slabsize / SLAB_SETSIZE;
1529        if (rsize & keg->uk_align)
1530                rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1531        keg->uk_rsize = rsize;
1532
1533        KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1534            keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1535            ("%s: size %u too large", __func__, keg->uk_rsize));
1536
1537        if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1538                shsize = 0;
1539        else
1540                shsize = SIZEOF_UMA_SLAB;
1541
1542        if (rsize <= slabsize - shsize)
1543                keg->uk_ipers = (slabsize - shsize) / rsize;
1544        else {
1545                /* Handle special case when we have 1 item per slab, so
1546                 * alignment requirement can be relaxed. */
1547                KASSERT(keg->uk_size <= slabsize - shsize,
1548                    ("%s: size %u greater than slab", __func__, keg->uk_size));
1549                keg->uk_ipers = 1;
1550        }
1551        KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1552            ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1553
1554        memused = keg->uk_ipers * rsize + shsize;
1555        wastedspace = slabsize - memused;
1556
1557        /*
1558         * We can't do OFFPAGE if we're internal or if we've been
1559         * asked to not go to the VM for buckets.  If we do this we
1560         * may end up going to the VM for slabs, which we do not
1561         * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1562         * of UMA_ZONE_VM, which clearly forbids it.
1563         */
1564        if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1565            (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1566                return;
1567
1568        /*
1569         * See if using an OFFPAGE slab will limit our waste.  Only do
1570         * this if it permits more items per-slab.
1571         *
1572         * XXX We could try growing slabsize to limit max waste as well.
1573         * Historically this was not done because the VM could not
1574         * efficiently handle contiguous allocations.
1575         */
1576        if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1577            (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1578                keg->uk_ipers = slabsize / keg->uk_rsize;
1579                KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1580                    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1581                CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1582                    "keg: %s(%p), calculated wastedspace = %d, "
1583                    "maximum wasted space allowed = %d, "
1584                    "calculated ipers = %d, "
1585                    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1586                    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1587                    slabsize - keg->uk_ipers * keg->uk_rsize);
1588                keg->uk_flags |= UMA_ZONE_OFFPAGE;
1589        }
1590
1591        if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1592            (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1593                keg->uk_flags |= UMA_ZONE_HASH;
1594}
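/*
 * A worked example of the sizing above, with purely illustrative numbers
 * (a 4096-byte slab, a 48-byte in-slab header and 8-byte alignment; the
 * real values come from UMA_SLAB_SIZE, SIZEOF_UMA_SLAB and the caller's
 * alignment): a 100-byte item is rounded up to rsize = 104, so
 * uk_ipers = (4096 - 48) / 104 = 38 and the slab wastes
 * 4096 - (38 * 104 + 48) = 96 bytes, which is below the
 * slabsize / UMA_MAX_WASTE threshold, so the keg keeps its header on-page.
 */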
1595
1596/*
1597 * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1598 * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1599 * more complicated.
1600 *
1601 * Arguments
1602 *      keg  The keg we should initialize
1603 *
1604 * Returns
1605 *      Nothing
1606 */
1607static void
1608keg_large_init(uma_keg_t keg)
1609{
1610
1611        KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1612        KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1613            ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1614        KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1615            ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1616
1617        keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1618        keg->uk_ipers = 1;
1619        keg->uk_rsize = keg->uk_size;
1620
1621        /* Check whether we have enough space to not do OFFPAGE. */
1622        if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
1623            PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < SIZEOF_UMA_SLAB) {
1624                /*
1625                 * We can't do OFFPAGE if we're internal, in which case
1626                 * we need an extra page per allocation to contain the
1627                 * slab header.
1628                 */
1629                if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1630                        keg->uk_flags |= UMA_ZONE_OFFPAGE;
1631                else
1632                        keg->uk_ppera++;
1633        }
1634
1635        if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1636            (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1637                keg->uk_flags |= UMA_ZONE_HASH;
1638}
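/*
 * For example (illustrative numbers, assuming 4096-byte pages): a
 * 5000-byte item gives uk_ppera = howmany(5000, 4096) = 2 and
 * uk_rsize = 5000, leaving 8192 - 5000 = 3192 bytes, which is ample room
 * for the in-slab header.  An 8100-byte item leaves only 92 bytes, which
 * may be smaller than SIZEOF_UMA_SLAB; in that case the keg either goes
 * OFFPAGE or, if it is an internal keg, takes a third page per slab.
 */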
1639
1640static void
1641keg_cachespread_init(uma_keg_t keg)
1642{
1643        int alignsize;
1644        int trailer;
1645        int pages;
1646        int rsize;
1647
1648        KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1649            ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1650
1651        alignsize = keg->uk_align + 1;
1652        rsize = keg->uk_size;
1653        /*
1654         * We want one item to start on every align boundary in a page.  To
1655         * do this we will span pages.  We will also extend the item by the
1656         * size of align if it is an even multiple of align.  Otherwise, it
1657         * would fall on the same boundary every time.
1658         */
1659        if (rsize & keg->uk_align)
1660                rsize = (rsize & ~keg->uk_align) + alignsize;
1661        if ((rsize & alignsize) == 0)
1662                rsize += alignsize;
1663        trailer = rsize - keg->uk_size;
1664        pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1665        pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1666        keg->uk_rsize = rsize;
1667        keg->uk_ppera = pages;
1668        keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1669        keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1670        KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1671            ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1672            keg->uk_ipers));
1673}
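/*
 * Illustrative example of the cachespread layout (assumed numbers): for
 * a 128-byte item with 64-byte alignment, rsize is bumped to 192 (an odd
 * multiple of the alignment), giving a 3-page slab with 64 items whose
 * start offsets walk through every 64-byte boundary of a 4096-byte page
 * before repeating.
 */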
1674
1675/*
1676 * Keg header ctor.  This initializes all fields, locks, etc., and inserts
1677 * the keg onto the global keg list.
1678 *
1679 * Arguments/Returns follow uma_ctor specifications
1680 *      udata  Actually uma_kctor_args
1681 */
1682static int
1683keg_ctor(void *mem, int size, void *udata, int flags)
1684{
1685        struct uma_kctor_args *arg = udata;
1686        uma_keg_t keg = mem;
1687        uma_zone_t zone;
1688
1689        bzero(keg, size);
1690        keg->uk_size = arg->size;
1691        keg->uk_init = arg->uminit;
1692        keg->uk_fini = arg->fini;
1693        keg->uk_align = arg->align;
1694        keg->uk_free = 0;
1695        keg->uk_reserve = 0;
1696        keg->uk_pages = 0;
1697        keg->uk_flags = arg->flags;
1698        keg->uk_slabzone = NULL;
1699
1700#ifndef __rtems__
1701        /*
1702         * We use a global round-robin policy by default.  Zones with
1703         * UMA_ZONE_NUMA set will use first-touch instead, in which case the
1704         * iterator is never run.
1705         */
1706        keg->uk_dr.dr_policy = DOMAINSET_RR();
1707        keg->uk_dr.dr_iter = 0;
1708#endif /* __rtems__ */
1709
1710        /*
1711         * The master zone is passed to us at keg-creation time.
1712         */
1713        zone = arg->zone;
1714        keg->uk_name = zone->uz_name;
1715
1716        if (arg->flags & UMA_ZONE_VM)
1717                keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1718
1719        if (arg->flags & UMA_ZONE_ZINIT)
1720                keg->uk_init = zero_init;
1721
1722        if (arg->flags & UMA_ZONE_MALLOC)
1723                keg->uk_flags |= UMA_ZONE_VTOSLAB;
1724
1725        if (arg->flags & UMA_ZONE_PCPU)
1726#ifdef SMP
1727                keg->uk_flags |= UMA_ZONE_OFFPAGE;
1728#else
1729                keg->uk_flags &= ~UMA_ZONE_PCPU;
1730#endif
1731
1732        if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1733                keg_cachespread_init(keg);
1734        } else {
1735                if (keg->uk_size > UMA_SLAB_SPACE)
1736                        keg_large_init(keg);
1737                else
1738                        keg_small_init(keg);
1739        }
1740
1741        if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1742                keg->uk_slabzone = slabzone;
1743
1744#ifndef __rtems__
1745        /*
1746         * If we haven't booted yet we need allocations to go through the
1747         * startup cache until the vm is ready.
1748         */
1749        if (booted < BOOT_PAGEALLOC)
1750                keg->uk_allocf = startup_alloc;
1751#ifdef UMA_MD_SMALL_ALLOC
1752        else if (keg->uk_ppera == 1)
1753                keg->uk_allocf = uma_small_alloc;
1754#endif
1755        else if (keg->uk_flags & UMA_ZONE_PCPU)
1756                keg->uk_allocf = pcpu_page_alloc;
1757        else
1758#endif /* __rtems__ */
1759                keg->uk_allocf = page_alloc;
1760#ifndef __rtems__
1761#ifdef UMA_MD_SMALL_ALLOC
1762        if (keg->uk_ppera == 1)
1763                keg->uk_freef = uma_small_free;
1764        else
1765#endif
1766        if (keg->uk_flags & UMA_ZONE_PCPU)
1767                keg->uk_freef = pcpu_page_free;
1768        else
1769#endif /* __rtems__ */
1770                keg->uk_freef = page_free;
1771
1772        /*
1773         * Initialize keg's lock
1774         */
1775        KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1776
1777        /*
1778         * If we're putting the slab header in the actual page we need to
1779         * figure out where in each page it goes.  See SIZEOF_UMA_SLAB
1780         * macro definition.
1781         */
1782        if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1783                keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - SIZEOF_UMA_SLAB;
1784                /*
1785                 * The only way the following is possible is if our
1786                 * UMA_ALIGN_PTR adjustments have made us bigger than
1787                 * UMA_SLAB_SIZE.  I haven't checked whether this is
1788                 * mathematically possible for all cases, so we make
1789                 * sure here anyway.
1790                 */
1791                KASSERT(keg->uk_pgoff + sizeof(struct uma_slab) <=
1792                    PAGE_SIZE * keg->uk_ppera,
1793                    ("zone %s ipers %d rsize %d size %d slab won't fit",
1794                    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
1795        }
1796
1797        if (keg->uk_flags & UMA_ZONE_HASH)
1798                hash_alloc(&keg->uk_hash);
1799
1800        CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1801            keg, zone->uz_name, zone,
1802            (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1803            keg->uk_free);
1804
1805        LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1806
1807        rw_wlock(&uma_rwlock);
1808        LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1809        rw_wunlock(&uma_rwlock);
1810        return (0);
1811}
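/*
 * To summarize the back-end selection above: on FreeBSD, allocations go
 * through startup_alloc() until the VM is ready, then single-page kegs
 * may use uma_small_alloc()/uma_small_free() where UMA_MD_SMALL_ALLOC is
 * available, per-CPU kegs use pcpu_page_alloc()/pcpu_page_free(), and
 * everything else falls back to page_alloc()/page_free().  On RTEMS only
 * page_alloc()/page_free() are used.
 */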
1812
1813/*
1814 * Zone header ctor.  This initializes all fields, locks, etc.
1815 *
1816 * Arguments/Returns follow uma_ctor specifications
1817 *      udata  Actually uma_zctor_args
1818 */
1819static int
1820zone_ctor(void *mem, int size, void *udata, int flags)
1821{
1822        struct uma_zctor_args *arg = udata;
1823        uma_zone_t zone = mem;
1824        uma_zone_t z;
1825        uma_keg_t keg;
1826
1827        bzero(zone, size);
1828        zone->uz_name = arg->name;
1829        zone->uz_ctor = arg->ctor;
1830        zone->uz_dtor = arg->dtor;
1831        zone->uz_slab = zone_fetch_slab;
1832        zone->uz_init = NULL;
1833        zone->uz_fini = NULL;
1834        zone->uz_allocs = 0;
1835        zone->uz_frees = 0;
1836        zone->uz_fails = 0;
1837        zone->uz_sleeps = 0;
1838        zone->uz_count = 0;
1839        zone->uz_count_min = 0;
1840        zone->uz_flags = 0;
1841        zone->uz_warning = NULL;
1842#ifndef __rtems__
1843        /* The domain structures follow the cpu structures. */
1844        zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
1845#endif /* __rtems__ */
1846        timevalclear(&zone->uz_ratecheck);
1847        keg = arg->keg;
1848
1849        ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1850
1851        /*
1852         * This is a pure cache zone, no kegs.
1853         */
1854        if (arg->import) {
1855                if (arg->flags & UMA_ZONE_VM)
1856                        arg->flags |= UMA_ZFLAG_CACHEONLY;
1857                zone->uz_flags = arg->flags;
1858                zone->uz_size = arg->size;
1859                zone->uz_import = arg->import;
1860                zone->uz_release = arg->release;
1861                zone->uz_arg = arg->arg;
1862                zone->uz_lockptr = &zone->uz_lock;
1863                rw_wlock(&uma_rwlock);
1864                LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1865                rw_wunlock(&uma_rwlock);
1866                goto out;
1867        }
1868
1869        /*
1870         * Use the regular zone/keg/slab allocator.
1871         */
1872        zone->uz_import = (uma_import)zone_import;
1873        zone->uz_release = (uma_release)zone_release;
1874        zone->uz_arg = zone;
1875
1876        if (arg->flags & UMA_ZONE_SECONDARY) {
1877                KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1878                zone->uz_init = arg->uminit;
1879                zone->uz_fini = arg->fini;
1880                zone->uz_lockptr = &keg->uk_lock;
1881                zone->uz_flags |= UMA_ZONE_SECONDARY;
1882                rw_wlock(&uma_rwlock);
1883                ZONE_LOCK(zone);
1884                LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1885                        if (LIST_NEXT(z, uz_link) == NULL) {
1886                                LIST_INSERT_AFTER(z, zone, uz_link);
1887                                break;
1888                        }
1889                }
1890                ZONE_UNLOCK(zone);
1891                rw_wunlock(&uma_rwlock);
1892        } else if (keg == NULL) {
1893                if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1894                    arg->align, arg->flags)) == NULL)
1895                        return (ENOMEM);
1896        } else {
1897                struct uma_kctor_args karg;
1898                int error;
1899
1900                /* We should only be here from uma_startup() */
1901                karg.size = arg->size;
1902                karg.uminit = arg->uminit;
1903                karg.fini = arg->fini;
1904                karg.align = arg->align;
1905                karg.flags = arg->flags;
1906                karg.zone = zone;
1907                error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1908                    flags);
1909                if (error)
1910                        return (error);
1911        }
1912
1913        /*
1914         * Link in the first keg.
1915         */
1916        zone->uz_klink.kl_keg = keg;
1917        LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1918        zone->uz_lockptr = &keg->uk_lock;
1919        zone->uz_size = keg->uk_size;
1920        zone->uz_flags |= (keg->uk_flags &
1921            (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1922
1923        /*
1924         * Some internal zones don't have room allocated for the per cpu
1925         * caches.  If we're internal, bail out here.
1926         */
1927        if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1928                KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1929                    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1930                return (0);
1931        }
1932
1933out:
1934        KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
1935            (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
1936            ("Invalid zone flag combination"));
1937        if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
1938                zone->uz_count = BUCKET_MAX;
1939        else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
1940                zone->uz_count = 0;
1941        else
1942                zone->uz_count = bucket_select(zone->uz_size);
1943        zone->uz_count_min = zone->uz_count;
1944
1945        return (0);
1946}
1947
1948/*
1949 * Keg header dtor.  This frees all data, destroys locks, frees the hash
1950 * table and removes the keg from the global list.
1951 *
1952 * Arguments/Returns follow uma_dtor specifications
1953 *      udata  unused
1954 */
1955static void
1956keg_dtor(void *arg, int size, void *udata)
1957{
1958        uma_keg_t keg;
1959
1960        keg = (uma_keg_t)arg;
1961        KEG_LOCK(keg);
1962        if (keg->uk_free != 0) {
1963                printf("Freed UMA keg (%s) was not empty (%d items). "
1964                    " Lost %d pages of memory.\n",
1965                    keg->uk_name ? keg->uk_name : "",
1966                    keg->uk_free, keg->uk_pages);
1967        }
1968        KEG_UNLOCK(keg);
1969
1970        hash_free(&keg->uk_hash);
1971
1972        KEG_LOCK_FINI(keg);
1973}
1974
1975/*
1976 * Zone header dtor.
1977 *
1978 * Arguments/Returns follow uma_dtor specifications
1979 *      udata  unused
1980 */
1981static void
1982zone_dtor(void *arg, int size, void *udata)
1983{
1984        uma_klink_t klink;
1985        uma_zone_t zone;
1986        uma_keg_t keg;
1987
1988        zone = (uma_zone_t)arg;
1989        keg = zone_first_keg(zone);
1990
1991        if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1992                cache_drain(zone);
1993
1994        rw_wlock(&uma_rwlock);
1995        LIST_REMOVE(zone, uz_link);
1996        rw_wunlock(&uma_rwlock);
1997        /*
1998         * XXX there are some races here where
1999         * the zone can be drained but the zone lock
2000         * released and then refilled before we
2001         * remove it... we don't care for now.
2002         */
2003        zone_drain_wait(zone, M_WAITOK);
2004        /*
2005         * Unlink all of our kegs.
2006         */
2007        while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
2008                klink->kl_keg = NULL;
2009                LIST_REMOVE(klink, kl_link);
2010                if (klink == &zone->uz_klink)
2011                        continue;
2012                free(klink, M_TEMP);
2013        }
2014        /*
2015         * We only destroy kegs from non secondary zones.
2016         */
2017        if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
2018                rw_wlock(&uma_rwlock);
2019                LIST_REMOVE(keg, uk_link);
2020                rw_wunlock(&uma_rwlock);
2021                zone_free_item(kegs, keg, NULL, SKIP_NONE);
2022        }
2023        ZONE_LOCK_FINI(zone);
2024}
2025
2026/*
2027 * Traverses every zone in the system and calls a callback
2028 *
2029 * Arguments:
2030 *      zfunc  A pointer to a function which accepts a zone
2031 *              as an argument.
2032 *
2033 * Returns:
2034 *      Nothing
2035 */
2036static void
2037zone_foreach(void (*zfunc)(uma_zone_t))
2038{
2039        uma_keg_t keg;
2040        uma_zone_t zone;
2041
2042        rw_rlock(&uma_rwlock);
2043        LIST_FOREACH(keg, &uma_kegs, uk_link) {
2044                LIST_FOREACH(zone, &keg->uk_zones, uz_link)
2045                        zfunc(zone);
2046        }
2047        rw_runlock(&uma_rwlock);
2048}
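/*
 * For example, the periodic UMA timeout and the reclaim path are expected
 * to walk all zones as zone_foreach(zone_timeout) and
 * zone_foreach(zone_drain), respectively (illustrative; see the callers
 * elsewhere in this file).
 */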
2049
2050#ifndef __rtems__
2051/*
2052 * Count how many pages we need to bootstrap.  VM supplies
2053 * its need in early zones in the argument; we add up our zones,
2054 * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones.  The
2055 * zone of zones and zone of kegs are accounted for separately.
2056 */
2057#define UMA_BOOT_ZONES  11
2058#endif /* __rtems__ */
2059/* Zone of zones and zone of kegs have arbitrary alignment. */
2060#define UMA_BOOT_ALIGN  32
2061#ifndef __rtems__
2062static int zsize, ksize;
2063int
2064uma_startup_count(int vm_zones)
2065{
2066        int zones, pages;
2067
2068        ksize = sizeof(struct uma_keg) +
2069            (sizeof(struct uma_domain) * vm_ndomains);
2070        zsize = sizeof(struct uma_zone) +
2071            (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2072            (sizeof(struct uma_zone_domain) * vm_ndomains);
2073
2074        /*
2075         * Memory for the zone of kegs and its keg,
2076         * and for zone of zones.
2077         */
2078        pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
2079            roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
2080
2081#ifdef  UMA_MD_SMALL_ALLOC
2082        zones = UMA_BOOT_ZONES;
2083#else
2084        zones = UMA_BOOT_ZONES + vm_zones;
2085        vm_zones = 0;
2086#endif
2087
2088        /* Memory for the rest of startup zones, UMA and VM, ... */
2089        if (zsize > UMA_SLAB_SPACE) {
2090                /* See keg_large_init(). */
2091                u_int ppera;
2092
2093                ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE);
2094                if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) <
2095                    SIZEOF_UMA_SLAB)
2096                        ppera++;
2097                pages += (zones + vm_zones) * ppera;
2098        } else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
2099                /* See keg_small_init() special case for uk_ppera = 1. */
2100                pages += zones;
2101        else
2102                pages += howmany(zones,
2103                    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
2104
2105        /* ... and their kegs. Note that zone of zones allocates a keg! */
2106        pages += howmany(zones + 1,
2107            UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
2108
2109        /*
2110         * Most of the startup zones are not going to be offpage; that's
2111         * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all
2112         * calculations.  Some large bucket zones will be offpage, and
2113         * thus will allocate hashes.  We take a conservative approach
2114         * and assume that all zones may allocate a hash.  This may give
2115         * us some positive inaccuracy, usually an extra single page.
2116         */
2117        pages += howmany(zones, UMA_SLAB_SPACE /
2118            (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
2119
2120        return (pages);
2121}
2122#endif /* __rtems__ */
2123
2124void
2125uma_startup(void *mem, int npages)
2126{
2127        struct uma_zctor_args args;
2128        uma_keg_t masterkeg;
2129        uintptr_t m;
2130#ifdef __rtems__
2131        size_t zsize, ksize, size;
2132
2133        ksize = sizeof(struct uma_keg) +
2134            (sizeof(struct uma_domain) * vm_ndomains);
2135        zsize = sizeof(struct uma_zone) +
2136            (sizeof(struct uma_cache) * (mp_maxid + 1));
2137        size = 2 * roundup(zsize, CACHE_LINE_SIZE) +
2138            roundup(ksize, CACHE_LINE_SIZE);
2139#endif /* __rtems__ */
2140
2141#ifdef DIAGNOSTIC
2142        printf("Entering %s with %d boot pages configured\n", __func__, npages);
2143#endif
2144
2145        rw_init(&uma_rwlock, "UMA lock");
2146
2147#ifndef __rtems__
2148        /* Use bootpages memory for the zone of zones and zone of kegs. */
2149        m = (uintptr_t)mem;
2150#else /* __rtems__ */
2151        m = (uintptr_t)rtems_heap_allocate_aligned_with_boundary(
2152            size, CACHE_LINE_SIZE, 0);
2153        BSD_ASSERT(m != 0);
2154        memset((void *)m, 0, size);
2155#endif /* __rtems__ */
2156        zones = (uma_zone_t)m;
2157        m += roundup(zsize, CACHE_LINE_SIZE);
2158        kegs = (uma_zone_t)m;
2159        m += roundup(zsize, CACHE_LINE_SIZE);
2160        masterkeg = (uma_keg_t)m;
2161#ifndef __rtems__
2162        m += roundup(ksize, CACHE_LINE_SIZE);
2163        m = roundup(m, PAGE_SIZE);
2164        npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2165        mem = (void *)m;
2166#endif /* __rtems__ */
2167
2168        /* "manually" create the initial zone */
2169        memset(&args, 0, sizeof(args));
2170        args.name = "UMA Kegs";
2171        args.size = ksize;
2172        args.ctor = keg_ctor;
2173        args.dtor = keg_dtor;
2174        args.uminit = zero_init;
2175        args.fini = NULL;
2176        args.keg = masterkeg;
2177        args.align = UMA_BOOT_ALIGN - 1;
2178        args.flags = UMA_ZFLAG_INTERNAL;
2179        zone_ctor(kegs, zsize, &args, M_WAITOK);
2180
2181#ifndef __rtems__
2182        bootmem = mem;
2183        boot_pages = npages;
2184#endif /* __rtems__ */
2185
2186        args.name = "UMA Zones";
2187        args.size = zsize;
2188        args.ctor = zone_ctor;
2189        args.dtor = zone_dtor;
2190        args.uminit = zero_init;
2191        args.fini = NULL;
2192        args.keg = NULL;
2193        args.align = UMA_BOOT_ALIGN - 1;
2194        args.flags = UMA_ZFLAG_INTERNAL;
2195        zone_ctor(zones, zsize, &args, M_WAITOK);
2196
2197        /* Now make a zone for slab headers */
2198        slabzone = uma_zcreate("UMA Slabs",
2199                                sizeof(struct uma_slab),
2200                                NULL, NULL, NULL, NULL,
2201                                UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2202
2203        hashzone = uma_zcreate("UMA Hash",
2204            sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2205            NULL, NULL, NULL, NULL,
2206            UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2207
2208        bucket_init();
2209
2210#ifndef __rtems__
2211        booted = BOOT_STRAPPED;
2212#endif /* __rtems__ */
2213}
2214
2215#ifndef __rtems__
2216void
2217uma_startup1(void)
2218{
2219
2220#ifdef DIAGNOSTIC
2221        printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2222#endif
2223        booted = BOOT_PAGEALLOC;
2224}
2225
2226void
2227uma_startup2(void)
2228{
2229
2230#ifdef DIAGNOSTIC
2231        printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2232#endif
2233        booted = BOOT_BUCKETS;
2234        sx_init(&uma_drain_lock, "umadrain");
2235        bucket_enable();
2236}
2237#endif /* __rtems__ */
2238
2239/*
2240 * Initialize our callout handle.
2241 */
2243static void
2244uma_startup3(void)
2245{
2246
2247#ifdef INVARIANTS
2248        TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2249        uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2250        uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2251#endif
2252        callout_init(&uma_callout, 1);
2253        callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2254#ifndef __rtems__
2255        booted = BOOT_RUNNING;
2256#endif /* __rtems__ */
2257}
2258
2259static uma_keg_t
2260uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2261                int align, uint32_t flags)
2262{
2263        struct uma_kctor_args args;
2264
2265        args.size = size;
2266        args.uminit = uminit;
2267        args.fini = fini;
2268        args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2269        args.flags = flags;
2270        args.zone = zone;
2271        return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2272}
2273
2274/* Public functions */
2275/* See uma.h */
2276void
2277uma_set_align(int align)
2278{
2279
2280        if (align != UMA_ALIGN_CACHE)
2281                uma_align_cache = align;
2282}
2283
2284/* See uma.h */
2285uma_zone_t
2286uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2287                uma_init uminit, uma_fini fini, int align, uint32_t flags)
2288
2289{
2290        struct uma_zctor_args args;
2291        uma_zone_t res;
2292#ifndef __rtems__
2293        bool locked;
2294#endif /* __rtems__ */
2295
2296        KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2297            align, name));
2298
2299        /* This stuff is essential for the zone ctor */
2300        memset(&args, 0, sizeof(args));
2301        args.name = name;
2302        args.size = size;
2303        args.ctor = ctor;
2304        args.dtor = dtor;
2305        args.uminit = uminit;
2306        args.fini = fini;
2307#ifdef  INVARIANTS
2308        /*
2309         * If a zone is being created with an empty constructor and
2310         * destructor, pass UMA constructor/destructor which checks for
2311         * memory use after free.
2312         */
2313        if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2314            ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2315                args.ctor = trash_ctor;
2316                args.dtor = trash_dtor;
2317                args.uminit = trash_init;
2318                args.fini = trash_fini;
2319        }
2320#endif
2321        args.align = align;
2322        args.flags = flags;
2323        args.keg = NULL;
2324
2325#ifndef __rtems__
2326        if (booted < BOOT_BUCKETS) {
2327                locked = false;
2328        } else {
2329#endif /* __rtems__ */
2330                sx_slock(&uma_drain_lock);
2331#ifndef __rtems__
2332                locked = true;
2333        }
2334#endif /* __rtems__ */
2335        res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2336#ifndef __rtems__
2337        if (locked)
2338#endif /* __rtems__ */
2339                sx_sunlock(&uma_drain_lock);
2340        return (res);
2341}
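/*
 * Illustrative usage sketch (not part of this allocator; "struct foo" and
 * "foo_zone" are hypothetical names):
 *
 *        static uma_zone_t foo_zone;
 *
 *        foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *            NULL, NULL, UMA_ALIGN_PTR, 0);
 *        p = uma_zalloc(foo_zone, M_WAITOK);
 *        ...
 *        uma_zfree(foo_zone, p);
 *        uma_zdestroy(foo_zone);
 */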
2342
2343/* See uma.h */
2344uma_zone_t
2345uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2346                    uma_init zinit, uma_fini zfini, uma_zone_t master)
2347{
2348        struct uma_zctor_args args;
2349        uma_keg_t keg;
2350        uma_zone_t res;
2351#ifndef __rtems__
2352        bool locked;
2353#endif /* __rtems__ */
2354
2355        keg = zone_first_keg(master);
2356        memset(&args, 0, sizeof(args));
2357        args.name = name;
2358        args.size = keg->uk_size;
2359        args.ctor = ctor;
2360        args.dtor = dtor;
2361        args.uminit = zinit;
2362        args.fini = zfini;
2363        args.align = keg->uk_align;
2364        args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2365        args.keg = keg;
2366
2367#ifndef __rtems__
2368        if (booted < BOOT_BUCKETS) {
2369                locked = false;
2370        } else {
2371#endif /* __rtems__ */
2372                sx_slock(&uma_drain_lock);
2373#ifndef __rtems__
2374                locked = true;
2375        }
2376#endif /* __rtems__ */
2377        /* XXX Attaches only one keg of potentially many. */
2378        res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2379#ifndef __rtems__
2380        if (locked)
2381#endif /* __rtems__ */
2382                sx_sunlock(&uma_drain_lock);
2383        return (res);
2384}
2385
2386/* See uma.h */
2387uma_zone_t
2388uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2389                    uma_init zinit, uma_fini zfini, uma_import zimport,
2390                    uma_release zrelease, void *arg, int flags)
2391{
2392        struct uma_zctor_args args;
2393
2394        memset(&args, 0, sizeof(args));
2395        args.name = name;
2396        args.size = size;
2397        args.ctor = ctor;
2398        args.dtor = dtor;
2399        args.uminit = zinit;
2400        args.fini = zfini;
2401        args.import = zimport;
2402        args.release = zrelease;
2403        args.arg = arg;
2404        args.align = 0;
2405        args.flags = flags;
2406
2407        return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2408}
2409
2410#ifndef __rtems__
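/*
 * Lock two zones in a consistent order (ascending zone address) so that two
 * threads locking the same pair in opposite argument order cannot deadlock.
 * MTX_DUPOK is passed for the second lock because both zone locks belong to
 * the same lock class.
 */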
2411static void
2412zone_lock_pair(uma_zone_t a, uma_zone_t b)
2413{
2414        if (a < b) {
2415                ZONE_LOCK(a);
2416                mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
2417        } else {
2418                ZONE_LOCK(b);
2419                mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
2420        }
2421}
2422
2423static void
2424zone_unlock_pair(uma_zone_t a, uma_zone_t b)
2425{
2426
2427        ZONE_UNLOCK(a);
2428        ZONE_UNLOCK(b);
2429}
2430
2431int
2432uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
2433{
2434        uma_klink_t klink;
2435        uma_klink_t kl;
2436        int error;
2437
2438        error = 0;
2439        klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
2440
2441        zone_lock_pair(zone, master);
2442        /*
2443         * zone must use vtoslab() to resolve objects and must already be
2444         * a secondary.
2445         */
2446        if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
2447            != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
2448                error = EINVAL;
2449                goto out;
2450        }
2451        /*
2452         * The new master must also use vtoslab().
2453         */
2454        if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
2455                error = EINVAL;
2456                goto out;
2457        }
2458
2459        /*
2460         * The underlying object must be the same size.  rsize
2461         * may be different.
2462         */
2463        if (master->uz_size != zone->uz_size) {
2464                error = E2BIG;
2465                goto out;
2466        }
2467        /*
2468         * Put it at the end of the list.
2469         */
2470        klink->kl_keg = zone_first_keg(master);
2471        LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
2472                if (LIST_NEXT(kl, kl_link) == NULL) {
2473                        LIST_INSERT_AFTER(kl, klink, kl_link);
2474                        break;
2475                }
2476        }
2477        klink = NULL;
2478        zone->uz_flags |= UMA_ZFLAG_MULTI;
2479        zone->uz_slab = zone_fetch_slab_multi;
2480
2481out:
2482        zone_unlock_pair(zone, master);
2483        if (klink != NULL)
2484                free(klink, M_TEMP);
2485
2486        return (error);
2487}
2488#endif /* __rtems__ */
2489
2490
2491/* See uma.h */
2492void
2493uma_zdestroy(uma_zone_t zone)
2494{
2495
2496        sx_slock(&uma_drain_lock);
2497        zone_free_item(zones, zone, NULL, SKIP_NONE);
2498        sx_sunlock(&uma_drain_lock);
2499}
2500
2501void
2502uma_zwait(uma_zone_t zone)
2503{
2504        void *item;
2505
2506        item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2507        uma_zfree(zone, item);
2508}
2509
2510void *
2511uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2512{
2513        void *item;
2514#ifdef SMP
2515        int i;
2516
2517        MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2518#endif
2519        item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
2520        if (item != NULL && (flags & M_ZERO)) {
2521#ifdef SMP
2522                for (i = 0; i <= mp_maxid; i++)
2523                        bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2524#else
2525                bzero(item, zone->uz_size);
2526#endif
2527        }
2528        return (item);
2529}
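/*
 * Note that a UMA_ZONE_PCPU zone hands out one object per CPU behind a
 * single cookie, which is why M_ZERO has to be applied to every per-CPU
 * copy via zpcpu_get_cpu() above; counter(9), for example, allocates its
 * per-CPU counters from such a zone.
 */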
2530
2531/*
2532 * A stub, used while the regular and pcpu cases are identical.
2533 */
2534void
2535uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2536{
2537
2538#ifdef SMP
2539        MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2540#endif
2541        uma_zfree_arg(zone, item, udata);
2542}
2543
2544/* See uma.h */
2545void *
2546uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2547{
2548        uma_zone_domain_t zdom;
2549        uma_bucket_t bucket;
2550        uma_cache_t cache;
2551        void *item;
2552        int cpu, domain, lockfail;
2553#ifdef INVARIANTS
2554        bool skipdbg;
2555#endif
2556
2557        /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2558        random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2559
2560        /* This is the fast path allocation */
2561        CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2562            curthread, zone->uz_name, zone, flags);
2563
2564        if (flags & M_WAITOK) {
2565                WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2566                    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2567        }
2568#ifndef __rtems__
2569        KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2570        KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2571            ("uma_zalloc_arg: called with spinlock or critical section held"));
2572        if (zone->uz_flags & UMA_ZONE_PCPU)
2573                KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2574                    "with M_ZERO passed"));
2575#endif /* __rtems__ */
2576
2577#ifdef DEBUG_MEMGUARD
2578        if (memguard_cmp_zone(zone)) {
2579                item = memguard_alloc(zone->uz_size, flags);
2580                if (item != NULL) {
2581                        if (zone->uz_init != NULL &&
2582                            zone->uz_init(item, zone->uz_size, flags) != 0)
2583                                return (NULL);
2584                        if (zone->uz_ctor != NULL &&
2585                            zone->uz_ctor(item, zone->uz_size, udata,
2586                            flags) != 0) {
2587                                zone->uz_fini(item, zone->uz_size);
2588                                return (NULL);
2589                        }
2590                        return (item);
2591                }
2592                /* This is unfortunate but should not be fatal. */
2593        }
2594#endif
2595        /*
2596         * If possible, allocate from the per-CPU cache.  There are two
2597         * requirements for safe access to the per-CPU cache: (1) the thread
2598         * accessing the cache must not be preempted or yield during access,
2599         * and (2) the thread must not migrate CPUs without switching which
2600         * cache it accesses.  We rely on a critical section to prevent
2601         * preemption and migration.  We release the critical section in
2602         * order to acquire the zone mutex if we are unable to allocate from
2603         * the current cache; when we re-acquire the critical section, we
2604         * must detect and handle migration if it has occurred.
2605         */
2606zalloc_restart:
2607        critical_enter();
2608        cpu = curcpu;
2609        cache = &zone->uz_cpu[cpu];
2610
2611zalloc_start:
2612        bucket = cache->uc_allocbucket;
2613        if (bucket != NULL && bucket->ub_cnt > 0) {
2614                bucket->ub_cnt--;
2615                item = bucket->ub_bucket[bucket->ub_cnt];
2616#ifdef INVARIANTS
2617                bucket->ub_bucket[bucket->ub_cnt] = NULL;
2618#endif
2619                KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2620                cache->uc_allocs++;
2621                critical_exit();
2622#ifdef INVARIANTS
2623                skipdbg = uma_dbg_zskip(zone, item);
2624#endif
2625                if (zone->uz_ctor != NULL &&
2626#ifdef INVARIANTS
2627                    (!skipdbg || zone->uz_ctor != trash_ctor ||
2628                    zone->uz_dtor != trash_dtor) &&
2629#endif
2630                    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2631                        atomic_add_long(&zone->uz_fails, 1);
2632                        zone_free_item(zone, item, udata, SKIP_DTOR);
2633                        return (NULL);
2634                }
2635#ifdef INVARIANTS
2636                if (!skipdbg)
2637                        uma_dbg_alloc(zone, NULL, item);
2638#endif
2639                if (flags & M_ZERO)
2640                        uma_zero_item(item, zone);
2641                return (item);
2642        }
2643
2644        /*
2645         * We have run out of items in our alloc bucket.
2646         * See if we can switch with our free bucket.
2647         */
2648        bucket = cache->uc_freebucket;
2649        if (bucket != NULL && bucket->ub_cnt > 0) {
2650                CTR2(KTR_UMA,
2651                    "uma_zalloc: zone %s(%p) swapping empty with alloc",
2652                    zone->uz_name, zone);
2653                cache->uc_freebucket = cache->uc_allocbucket;
2654                cache->uc_allocbucket = bucket;
2655                goto zalloc_start;
2656        }
2657
2658        /*
2659         * Discard any empty allocation bucket while we hold no locks.
2660         */
2661        bucket = cache->uc_allocbucket;
2662        cache->uc_allocbucket = NULL;
2663        critical_exit();
2664        if (bucket != NULL)
2665                bucket_free(zone, bucket, udata);
2666
2667#ifndef __rtems__
2668        if (zone->uz_flags & UMA_ZONE_NUMA) {
2669                domain = PCPU_GET(domain);
2670                if (VM_DOMAIN_EMPTY(domain))
2671                        domain = UMA_ANYDOMAIN;
2672        } else
2673#endif /* __rtems__ */
2674                domain = UMA_ANYDOMAIN;
2675
2676        /* Short-circuit for bucketless zones and when buckets are disabled (low memory). */
2677        if (zone->uz_count == 0 || bucketdisable)
2678                goto zalloc_item;
2679
2680        /*
2681         * The attempt to retrieve an item from the per-CPU cache has failed, so
2682         * we must go back to the zone.  This requires the zone lock, so we
2683         * must drop the critical section, then re-acquire it when we go back
2684         * to the cache.  Since the critical section is released, we may be
2685         * preempted or migrate.  As such, make sure not to maintain any
2686         * thread-local state specific to the cache from prior to releasing
2687         * the critical section.
2688         */
2689        lockfail = 0;
2690        if (ZONE_TRYLOCK(zone) == 0) {
2691                /* Record contention to size the buckets. */
2692                ZONE_LOCK(zone);
2693                lockfail = 1;
2694        }
2695        critical_enter();
2696        cpu = curcpu;
2697        cache = &zone->uz_cpu[cpu];
2698
2699        /* See if we lost the race to fill the cache. */
2700        if (cache->uc_allocbucket != NULL) {
2701                ZONE_UNLOCK(zone);
2702                goto zalloc_start;
2703        }
2704
2705        /*
2706         * Check the zone's cache of buckets.
2707         */
2708        if (domain == UMA_ANYDOMAIN)
2709                zdom = &zone->uz_domain[0];
2710        else
2711                zdom = &zone->uz_domain[domain];
2712        if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
2713                KASSERT(bucket->ub_cnt != 0,
2714                    ("uma_zalloc_arg: Returning an empty bucket."));
2715                cache->uc_allocbucket = bucket;
2716                ZONE_UNLOCK(zone);
2717                goto zalloc_start;
2718        }
2719        /* We are no longer associated with this CPU. */
2720        critical_exit();
2721
2722        /*
2723         * We bump the uz count when the cache size is insufficient to
2724         * handle the working set.
2725         */
2726        if (lockfail && zone->uz_count < BUCKET_MAX)
2727                zone->uz_count++;
2728        ZONE_UNLOCK(zone);
2729
2730        /*
2731         * Now let's just fill a bucket and put it on the free list.  If that
2732         * works we'll restart the allocation from the beginning and it
2733         * will use the just-filled bucket.
2734         */
2735        bucket = zone_alloc_bucket(zone, udata, domain, flags);
2736        CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2737            zone->uz_name, zone, bucket);
2738        if (bucket != NULL) {
2739                ZONE_LOCK(zone);
2740                critical_enter();
2741                cpu = curcpu;
2742                cache = &zone->uz_cpu[cpu];
2743
2744                /*
2745                 * See if we lost the race or were migrated.  Cache the
2746                 * initialized bucket to make this less likely or claim
2747                 * the memory directly.
2748                 */
2749#ifndef __rtems__
2750                if (cache->uc_allocbucket == NULL &&
2751                    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2752                    domain == PCPU_GET(domain))) {
2753#else /* __rtems__ */
2754                if (cache->uc_allocbucket == NULL) {
2755#endif /* __rtems__ */
2756                        cache->uc_allocbucket = bucket;
2757                        zdom->uzd_imax += bucket->ub_cnt;
2758                } else if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
2759                        critical_exit();
2760                        ZONE_UNLOCK(zone);
2761                        bucket_drain(zone, bucket);
2762                        bucket_free(zone, bucket, udata);
2763                        goto zalloc_restart;
2764                } else
2765                        zone_put_bucket(zone, zdom, bucket, false);
2766                ZONE_UNLOCK(zone);
2767                goto zalloc_start;
2768        }
2769
2770        /*
2771         * We may not be able to get a bucket so return an actual item.
2772         */
2773zalloc_item:
2774        item = zone_alloc_item(zone, udata, domain, flags);
2775
2776        return (item);
2777}
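/*
 * To recap the fast-path fallbacks above: take an item from the per-CPU
 * alloc bucket, else swap in the per-CPU free bucket, else grab a full
 * bucket from the zone's bucket cache, else fill a fresh bucket from the
 * keg layer, and as a last resort allocate a single item directly via
 * zone_alloc_item().
 */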
2778
2779void *
2780uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2781{
2782
2783        /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2784        random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2785
2786        /* This is the fast path allocation */
2787        CTR5(KTR_UMA,
2788            "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2789            curthread, zone->uz_name, zone, domain, flags);
2790
2791        if (flags & M_WAITOK) {
2792                WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2793                    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2794        }
2795        KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2796            ("uma_zalloc_domain: called with spinlock or critical section held"));
2797
2798        return (zone_alloc_item(zone, udata, domain, flags));
2799}
2800
2801/*
2802 * Find a slab with some space.  Prefer slabs that are partially used over
2803 * those that are totally free.  This helps to reduce fragmentation.
2804 *
2805 * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2806 * only 'domain'.
2807 */
2808static uma_slab_t
2809keg_first_slab(uma_keg_t keg, int domain, bool rr)
2810{
2811        uma_domain_t dom;
2812        uma_slab_t slab;
2813        int start;
2814
2815        KASSERT(domain >= 0 && domain < vm_ndomains,
2816            ("keg_first_slab: domain %d out of range", domain));
2817
2818        slab = NULL;
2819        start = domain;
2820        do {
2821                dom = &keg->uk_domain[domain];
2822                if (!LIST_EMPTY(&dom->ud_part_slab))
2823                        return (LIST_FIRST(&dom->ud_part_slab));
2824                if (!LIST_EMPTY(&dom->ud_free_slab)) {
2825                        slab = LIST_FIRST(&dom->ud_free_slab);
2826                        LIST_REMOVE(slab, us_link);
2827                        LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2828                        return (slab);
2829                }
2830#ifndef __rtems__
2831                if (rr)
2832                        domain = (domain + 1) % vm_ndomains;
2833#endif /* __rtems__ */
2834        } while (domain != start);
2835
2836        return (NULL);
2837}
2838
2839static uma_slab_t
2840keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
2841{
2842        uint32_t reserve;
2843
2844        mtx_assert(&keg->uk_lock, MA_OWNED);
2845
2846        reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
2847        if (keg->uk_free <= reserve)
2848                return (NULL);
2849        return (keg_first_slab(keg, domain, rr));
2850}
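/*
 * Example: with a reserve of 8 items configured via uma_zone_reserve(),
 * an ordinary allocation fails here once only 8 free items remain in the
 * keg, while a caller passing M_USE_RESERVE may still consume them.
 */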
2851
2852static uma_slab_t
2853keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
2854{
2855#ifndef __rtems__
2856        struct vm_domainset_iter di;
2857#endif /* __rtems__ */
2858        uma_domain_t dom;
2859        uma_slab_t slab;
2860        int aflags, domain;
2861        bool rr;
2862
2863#ifndef __rtems__
2864restart:
2865#endif /* __rtems__ */
2866        mtx_assert(&keg->uk_lock, MA_OWNED);
2867
2868        /*
2869         * Use the keg's policy if upper layers haven't already specified a
2870         * domain (as happens with first-touch zones).
2871         *
2872         * To avoid races we run the iterator with the keg lock held, but that
2873         * means that we cannot allow the vm_domainset layer to sleep.  Thus,
2874         * clear M_WAITOK and handle low memory conditions locally.
2875         */
2876#ifndef __rtems__
2877        rr = rdomain == UMA_ANYDOMAIN;
2878        if (rr) {
2879                aflags = (flags & ~M_WAITOK) | M_NOWAIT;
2880                vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
2881                    &aflags);
2882        } else {
2883                aflags = flags;
2884                domain = rdomain;
2885        }
2886#else /* __rtems__ */
2887        rr = true;
2888        aflags = flags;
2889        domain = 0;
2890#endif /* __rtems__ */
2891
2892        for (;;) {
2893                slab = keg_fetch_free_slab(keg, domain, rr, flags);
2894                if (slab != NULL) {
2895                        MPASS(slab->us_keg == keg);
2896                        return (slab);
2897                }
2898
2899                /*
2900                 * M_NOVM means don't ask at all!
2901                 */
2902                if (flags & M_NOVM)
2903                        break;
2904
2905                if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2906                        keg->uk_flags |= UMA_ZFLAG_FULL;
2907                        /*
2908                         * If this is not a multi-zone, set the FULL bit.
2909                         * Otherwise slab_multi() takes care of it.
2910                         */
2911                        if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2912                                zone->uz_flags |= UMA_ZFLAG_FULL;
2913                                zone_log_warning(zone);
2914                                zone_maxaction(zone);
2915                        }
2916                        if (flags & M_NOWAIT)
2917                                return (NULL);
2918                        zone->uz_sleeps++;
2919                        msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2920                        continue;
2921                }
2922                slab = keg_alloc_slab(keg, zone, domain, aflags);
2923                /*
2924                 * If we got a slab here it's safe to mark it partially used
2925                 * and return.  We assume that the caller is going to remove
2926                 * at least one item.
2927                 */
2928                if (slab) {
2929                        MPASS(slab->us_keg == keg);
2930                        dom = &keg->uk_domain[slab->us_domain];
2931                        LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2932                        return (slab);
2933                }
2934                KEG_LOCK(keg);
2935#ifndef __rtems__
2936                if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
2937                        if ((flags & M_WAITOK) != 0) {
2938                                KEG_UNLOCK(keg);
2939                                vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
2940                                KEG_LOCK(keg);
2941                                goto restart;
2942                        }
2943                        break;
2944                }
2945#else /* __rtems__ */
2946                return (NULL);
2947#endif /* __rtems__ */
2948        }
2949
2950        /*
2951         * We might not have been able to get a slab but another cpu
2952         * could have while we were unlocked.  Check again before we
2953         * fail.
2954         */
2955        if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
2956                MPASS(slab->us_keg == keg);
2957                return (slab);
2958        }
2959        return (NULL);
2960}
2961
2962static uma_slab_t
2963zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2964{
2965        uma_slab_t slab;
2966
2967        if (keg == NULL) {
2968                keg = zone_first_keg(zone);
2969                KEG_LOCK(keg);
2970        }
2971
2972        for (;;) {
2973                slab = keg_fetch_slab(keg, zone, domain, flags);
2974                if (slab)
2975                        return (slab);
2976                if (flags & (M_NOWAIT | M_NOVM))
2977                        break;
2978        }
2979        KEG_UNLOCK(keg);
2980        return (NULL);
2981}
2982
2983#ifndef __rtems__
2984/*
2985 * zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2986 * with the keg locked.  On NULL no lock is held.
2987 *
2988 * The last pointer is used to seed the search.  It is not required.
2989 */
2990static uma_slab_t
2991zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags)
2992{
2993        uma_klink_t klink;
2994        uma_slab_t slab;
2995        uma_keg_t keg;
2996        int flags;
2997        int empty;
2998        int full;
2999
3000        /*
3001         * Don't wait on the first pass.  This will skip limit tests
3002         * as well.  We don't want to block if we can find a provider
3003         * without blocking.
3004         */
3005        flags = (rflags & ~M_WAITOK) | M_NOWAIT;
3006        /*
3007         * Use the last slab allocated as a hint for where to start
3008         * the search.
3009         */
3010        if (last != NULL) {
3011                slab = keg_fetch_slab(last, zone, domain, flags);
3012                if (slab)
3013                        return (slab);
3014                KEG_UNLOCK(last);
3015        }
3016        /*
3017         * Loop until we have a slab in case of transient failures
3018         * while M_WAITOK is specified.  I'm not sure this is 100%
3019         * required but we've done it for so long now.
3020         */
3021        for (;;) {
3022                empty = 0;
3023                full = 0;
3024                /*
3025                 * Search the available kegs for slabs.  Be careful to hold the
3026                 * correct lock while calling into the keg layer.
3027                 */
3028                LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
3029                        keg = klink->kl_keg;
3030                        KEG_LOCK(keg);
3031                        if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
3032                                slab = keg_fetch_slab(keg, zone, domain, flags);
3033                                if (slab)
3034                                        return (slab);
3035                        }
3036                        if (keg->uk_flags & UMA_ZFLAG_FULL)
3037                                full++;
3038                        else
3039                                empty++;
3040                        KEG_UNLOCK(keg);
3041                }
3042                if (rflags & (M_NOWAIT | M_NOVM))
3043                        break;
3044                flags = rflags;
3045                /*
3046                 * All kegs are full.  XXX We can't atomically check all kegs
3047                 * and sleep so just sleep for a short period and retry.
3048                 */
3049                if (full && !empty) {
3050                        ZONE_LOCK(zone);
3051                        zone->uz_flags |= UMA_ZFLAG_FULL;
3052                        zone->uz_sleeps++;
3053                        zone_log_warning(zone);
3054                        zone_maxaction(zone);
3055                        msleep(zone, zone->uz_lockptr, PVM,
3056                            "zonelimit", hz/100);
3057                        zone->uz_flags &= ~UMA_ZFLAG_FULL;
3058                        ZONE_UNLOCK(zone);
3059                        continue;
3060                }
3061        }
3062        return (NULL);
3063}
3064#endif /* __rtems__ */
3065
3066static void *
3067slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
3068{
3069        uma_domain_t dom;
3070        void *item;
3071        uint8_t freei;
3072
3073        MPASS(keg == slab->us_keg);
3074        mtx_assert(&keg->uk_lock, MA_OWNED);
3075
3076        freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
3077        BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
3078        item = slab->us_data + (keg->uk_rsize * freei);
3079        slab->us_freecount--;
3080        keg->uk_free--;
3081
3082        /* Move this slab to the full list */
3083        if (slab->us_freecount == 0) {
3084                LIST_REMOVE(slab, us_link);
3085                dom = &keg->uk_domain[slab->us_domain];
3086                LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
3087        }
3088
3089        return (item);
3090}
3091
3092static int
3093zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
3094{
3095        uma_slab_t slab;
3096        uma_keg_t keg;
3097#ifdef NUMA
3098        int stripe;
3099#endif
3100        int i;
3101
3102        slab = NULL;
3103        keg = NULL;
3104        /* Try to keep the buckets totally full */
3105        for (i = 0; i < max; ) {
3106                if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL)
3107                        break;
3108                keg = slab->us_keg;
3109#ifdef NUMA
3110                stripe = howmany(max, vm_ndomains);
3111#endif
3112                while (slab->us_freecount && i < max) {
3113                        bucket[i++] = slab_alloc_item(keg, slab);
3114                        if (keg->uk_free <= keg->uk_reserve)
3115                                break;
3116#ifdef NUMA
3117                        /*
3118                         * If the zone is striped we pick a new slab for every
3119                         * N allocations.  Eliminating this conditional will
3120                         * instead pick a new domain for each bucket rather
3121                         * than stripe within each bucket.  The current option
3122                         * produces more fragmentation and requires more cpu
3123                         * time but yields better distribution.
3124                         */
3125                        if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
3126                            vm_ndomains > 1 && --stripe == 0)
3127                                break;
3128#endif
3129                }
3130                /* Don't block if we allocated any successfully. */
3131                flags &= ~M_WAITOK;
3132                flags |= M_NOWAIT;
3133        }
3134        if (slab != NULL)
3135                KEG_UNLOCK(keg);
3136
3137        return (i);
3138}
3139
3140static uma_bucket_t
3141zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
3142{
3143        uma_bucket_t bucket;
3144        int max;
3145
3146        CTR1(KTR_UMA, "zone_alloc_bucket: domain %d", domain);
3147
3148        /* Don't wait for buckets, preserve caller's NOVM setting. */
3149        bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
3150        if (bucket == NULL)
3151                return (NULL);
3152
3153        max = MIN(bucket->ub_entries, zone->uz_count);
3154        bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
3155            max, domain, flags);
3156
3157        /*
3158         * Initialize the memory if necessary.
3159         */
3160        if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
3161                int i;
3162
3163                for (i = 0; i < bucket->ub_cnt; i++)
3164                        if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
3165                            flags) != 0)
3166                                break;
3167                /*
3168                 * If we couldn't initialize the whole bucket, put the
3169                 * rest back onto the freelist.
3170                 */
3171                if (i != bucket->ub_cnt) {
3172                        zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
3173                            bucket->ub_cnt - i);
3174#ifdef INVARIANTS
3175                        bzero(&bucket->ub_bucket[i],
3176                            sizeof(void *) * (bucket->ub_cnt - i));
3177#endif
3178                        bucket->ub_cnt = i;
3179                }
3180        }
3181
3182        if (bucket->ub_cnt == 0) {
3183                bucket_free(zone, bucket, udata);
3184                atomic_add_long(&zone->uz_fails, 1);
3185                return (NULL);
3186        }
3187
3188        return (bucket);
3189}
3190
3191/*
3192 * Allocates a single item from a zone.
3193 *
3194 * Arguments
3195 *      zone   The zone to alloc for.
3196 *      udata  The data to be passed to the constructor.
3197 *      domain The domain to allocate from or UMA_ANYDOMAIN.
3198 *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
3199 *
3200 * Returns
3201 *      NULL if there is no memory and M_NOWAIT is set
3202 *      An item if successful
3203 */
3204
3205static void *
3206zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
3207{
3208        void *item;
3209#ifdef INVARIANTS
3210        bool skipdbg;
3211#endif
3212
3213        item = NULL;
3214
3215#ifndef __rtems__
3216        if (domain != UMA_ANYDOMAIN) {
3217                /* avoid allocs targeting empty domains */
3218                if (VM_DOMAIN_EMPTY(domain))
3219                        domain = UMA_ANYDOMAIN;
3220        }
3221#endif /* __rtems__ */
3222        if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3223                goto fail;
3224        atomic_add_long(&zone->uz_allocs, 1);
3225
3226#ifdef INVARIANTS
3227        skipdbg = uma_dbg_zskip(zone, item);
3228#endif
3229        /*
3230         * We have to call both the zone's init (not the keg's init)
3231         * and the zone's ctor.  This is because the item is going from
3232         * a keg slab directly to the user, and the user is expecting it
3233         * to be both zone-init'd as well as zone-ctor'd.
3234         */
3235        if (zone->uz_init != NULL) {
3236                if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3237                        zone_free_item(zone, item, udata, SKIP_FINI);
3238                        goto fail;
3239                }
3240        }
3241        if (zone->uz_ctor != NULL &&
3242#ifdef INVARIANTS
3243            (!skipdbg || zone->uz_ctor != trash_ctor ||
3244            zone->uz_dtor != trash_dtor) &&
3245#endif
3246            zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
3247                zone_free_item(zone, item, udata, SKIP_DTOR);
3248                goto fail;
3249        }
3250#ifdef INVARIANTS
3251        if (!skipdbg)
3252                uma_dbg_alloc(zone, NULL, item);
3253#endif
3254        if (flags & M_ZERO)
3255                uma_zero_item(item, zone);
3256
3257        CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3258            zone->uz_name, zone);
3259
3260        return (item);
3261
3262fail:
3263        CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3264            zone->uz_name, zone);
3265        atomic_add_long(&zone->uz_fails, 1);
3266        return (NULL);
3267}
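/*
 * Consumer-side sketch of the failure handling above: a constructor that
 * returns non-zero makes the corresponding uma_zalloc() fail cleanly, and
 * it receives the caller's wait flags.  The "foo" names and buffer size
 * are hypothetical; the signature is the uma_ctor type from uma(9).
 *
 *	static int
 *	foo_ctor(void *mem, int size, void *arg, int flags)
 *	{
 *		struct foo *f = mem;
 *
 *		f->f_buf = malloc(FOO_BUFSIZE, M_DEVBUF, flags);
 *		if (f->f_buf == NULL)
 *			return (ENOMEM);
 *		return (0);
 *	}
 */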
3268
3269/* See uma.h */
3270void
3271uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3272{
3273        uma_cache_t cache;
3274        uma_bucket_t bucket;
3275        uma_zone_domain_t zdom;
3276#ifndef __rtems__
3277        int cpu, domain, lockfail;
3278#else /* __rtems__ */
3279        int cpu, lockfail;
3280#endif /* __rtems__ */
3281#ifdef INVARIANTS
3282        bool skipdbg;
3283#endif
3284
3285        /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3286        random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3287
3288        CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3289            zone->uz_name);
3290
3291#ifndef __rtems__
3292        KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3293            ("uma_zfree_arg: called with spinlock or critical section held"));
3294#endif /* __rtems__ */
3295
3296        /* uma_zfree(..., NULL) does nothing, to match free(9). */
3297        if (item == NULL)
3298                return;
3299#ifdef DEBUG_MEMGUARD
3300        if (is_memguard_addr(item)) {
3301                if (zone->uz_dtor != NULL)
3302                        zone->uz_dtor(item, zone->uz_size, udata);
3303                if (zone->uz_fini != NULL)
3304                        zone->uz_fini(item, zone->uz_size);
3305                memguard_free(item);
3306                return;
3307        }
3308#endif
3309#ifdef INVARIANTS
3310        skipdbg = uma_dbg_zskip(zone, item);
3311        if (skipdbg == false) {
3312                if (zone->uz_flags & UMA_ZONE_MALLOC)
3313                        uma_dbg_free(zone, udata, item);
3314                else
3315                        uma_dbg_free(zone, NULL, item);
3316        }
3317        if (zone->uz_dtor != NULL && (!skipdbg ||
3318            zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
3319#else
3320        if (zone->uz_dtor != NULL)
3321#endif
3322                zone->uz_dtor(item, zone->uz_size, udata);
3323
3324        /*
3325         * The race here is acceptable.  If we miss it we'll just have to wait
3326         * a little longer for the limits to be reset.
3327         */
3328        if (zone->uz_flags & UMA_ZFLAG_FULL)
3329                goto zfree_item;
3330
3331        /*
3332         * If possible, free to the per-CPU cache.  There are two
3333         * requirements for safe access to the per-CPU cache: (1) the thread
3334         * accessing the cache must not be preempted or yield during access,
3335         * and (2) the thread must not migrate CPUs without switching which
3336         * cache it accesses.  We rely on a critical section to prevent
3337         * preemption and migration.  We release the critical section in
3338         * order to acquire the zone mutex if we are unable to free to the
3339         * current cache; when we re-acquire the critical section, we must
3340         * detect and handle migration if it has occurred.
3341         */
3342zfree_restart:
3343        critical_enter();
3344        cpu = curcpu;
3345        cache = &zone->uz_cpu[cpu];
3346
3347zfree_start:
3348        /*
3349         * Try to free into the allocbucket first to give LIFO ordering
3350         * for cache-hot data structures.  Spill over into the freebucket
3351         * if necessary.  Alloc will swap them if one runs dry.
3352         */
3353        bucket = cache->uc_allocbucket;
3354        if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3355                bucket = cache->uc_freebucket;
3356        if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3357                KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
3358                    ("uma_zfree: Freeing to non free bucket index."));
3359                bucket->ub_bucket[bucket->ub_cnt] = item;
3360                bucket->ub_cnt++;
3361                cache->uc_frees++;
3362                critical_exit();
3363                return;
3364        }
3365
3366        /*
3367         * We must go back to the zone, which requires acquiring the zone lock,
3368         * which in turn means we must release and re-acquire the critical
3369         * section.  Since the critical section is released, we may be
3370         * preempted or migrate.  As such, make sure not to maintain any
3371         * thread-local state specific to the cache from prior to releasing
3372         * the critical section.
3373         */
3374        critical_exit();
3375        if (zone->uz_count == 0 || bucketdisable)
3376                goto zfree_item;
3377
3378        lockfail = 0;
3379        if (ZONE_TRYLOCK(zone) == 0) {
3380                /* Record contention to size the buckets. */
3381                ZONE_LOCK(zone);
3382                lockfail = 1;
3383        }
3384        critical_enter();
3385        cpu = curcpu;
3386        cache = &zone->uz_cpu[cpu];
3387
3388        bucket = cache->uc_freebucket;
3389        if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3390                ZONE_UNLOCK(zone);
3391                goto zfree_start;
3392        }
3393        cache->uc_freebucket = NULL;
3394        /* We are no longer associated with this CPU. */
3395        critical_exit();
3396
3397#ifndef __rtems__
3398        if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3399                domain = PCPU_GET(domain);
3400                if (VM_DOMAIN_EMPTY(domain))
3401                        domain = UMA_ANYDOMAIN;
3402        } else
3403                domain = 0;
3404#endif /* __rtems__ */
3405        zdom = &zone->uz_domain[0];
3406
3407        /* Can we throw this on the zone full list? */
3408        if (bucket != NULL) {
3409                CTR3(KTR_UMA,
3410                    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3411                    zone->uz_name, zone, bucket);
3412                /* ub_cnt is pointing to the last free item */
3413                KASSERT(bucket->ub_cnt != 0,
3414                    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
3415                if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
3416                        ZONE_UNLOCK(zone);
3417                        bucket_drain(zone, bucket);
3418                        bucket_free(zone, bucket, udata);
3419                        goto zfree_restart;
3420                } else
3421                        zone_put_bucket(zone, zdom, bucket, true);
3422        }
3423
3424        /*
3425         * We bump the uz count when the cache size is insufficient to
3426         * handle the working set.
3427         */
3428        if (lockfail && zone->uz_count < BUCKET_MAX)
3429                zone->uz_count++;
3430        ZONE_UNLOCK(zone);
3431
3432        bucket = bucket_alloc(zone, udata, M_NOWAIT);
3433        CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3434            zone->uz_name, zone, bucket);
3435        if (bucket) {
3436                critical_enter();
3437                cpu = curcpu;
3438                cache = &zone->uz_cpu[cpu];
3439#ifndef __rtems__
3440                if (cache->uc_freebucket == NULL &&
3441                    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
3442                    domain == PCPU_GET(domain))) {
3443#else /* __rtems__ */
3444                if (cache->uc_freebucket == NULL) {
3445#endif /* __rtems__ */
3446                        cache->uc_freebucket = bucket;
3447                        goto zfree_start;
3448                }
3449                /*
3450                 * We lost the race, start over.  We have to drop our
3451                 * critical section to free the bucket.
3452                 */
3453                critical_exit();
3454                bucket_free(zone, bucket, udata);
3455                goto zfree_restart;
3456        }
3457
3458        /*
3459         * If nothing else caught this, we'll just do an internal free.
3460         */
3461zfree_item:
3462        zone_free_item(zone, item, udata, SKIP_DTOR);
3463
3464        return;
3465}
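/*
 * For reference, the consumer-visible pairing that drives the cache logic
 * above is plain uma_zalloc()/uma_zfree() (zone and struct names are
 * hypothetical):
 *
 *	struct foo *f;
 *
 *	f = uma_zalloc(foo_zone, M_NOWAIT | M_ZERO);
 *	if (f == NULL)
 *		return (ENOBUFS);
 *	...
 *	uma_zfree(foo_zone, f);
 *
 * Because frees go to the per-CPU allocbucket first, an item freed and
 * quickly re-allocated on the same CPU tends to stay cache-hot.
 */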
3466
3467void
3468uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3469{
3470
3471        /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3472        random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3473
3474        CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3475            zone->uz_name);
3476
3477        KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3478            ("uma_zfree_domain: called with spinlock or critical section held"));
3479
3480        /* uma_zfree(..., NULL) does nothing, to match free(9). */
3481        if (item == NULL)
3482                return;
3483        zone_free_item(zone, item, udata, SKIP_NONE);
3484}
3485
3486static void
3487slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
3488{
3489        uma_domain_t dom;
3490        uint8_t freei;
3491
3492        mtx_assert(&keg->uk_lock, MA_OWNED);
3493        MPASS(keg == slab->us_keg);
3494
3495        dom = &keg->uk_domain[slab->us_domain];
3496
3497        /* Do we need to remove from any lists? */
3498        if (slab->us_freecount+1 == keg->uk_ipers) {
3499                LIST_REMOVE(slab, us_link);
3500                LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3501        } else if (slab->us_freecount == 0) {
3502                LIST_REMOVE(slab, us_link);
3503                LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3504        }
3505
3506        /* Slab management. */
3507        freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3508        BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3509        slab->us_freecount++;
3510
3511        /* Keg statistics. */
3512        keg->uk_free++;
3513}
3514
3515static void
3516zone_release(uma_zone_t zone, void **bucket, int cnt)
3517{
3518        void *item;
3519        uma_slab_t slab;
3520        uma_keg_t keg;
3521        uint8_t *mem;
3522        int clearfull;
3523        int i;
3524
3525        clearfull = 0;
3526        keg = zone_first_keg(zone);
3527        KEG_LOCK(keg);
3528        for (i = 0; i < cnt; i++) {
3529                item = bucket[i];
3530                if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3531                        mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3532                        if (zone->uz_flags & UMA_ZONE_HASH) {
3533                                slab = hash_sfind(&keg->uk_hash, mem);
3534                        } else {
3535                                mem += keg->uk_pgoff;
3536                                slab = (uma_slab_t)mem;
3537                        }
3538                } else {
3539                        slab = vtoslab((vm_offset_t)item);
3540                        if (slab->us_keg != keg) {
3541                                KEG_UNLOCK(keg);
3542                                keg = slab->us_keg;
3543                                KEG_LOCK(keg);
3544                        }
3545                }
3546                slab_free_item(keg, slab, item);
3547                if (keg->uk_flags & UMA_ZFLAG_FULL) {
3548                        if (keg->uk_pages < keg->uk_maxpages) {
3549                                keg->uk_flags &= ~UMA_ZFLAG_FULL;
3550                                clearfull = 1;
3551                        }
3552
3553                        /*
3554                         * We can handle one more allocation. Since we're
3555                         * clearing ZFLAG_FULL, wake up all procs blocked
3556                         * on pages. This should be uncommon, so keeping this
3557                         * simple for now (rather than adding count of blocked
3558                         * threads etc).
3559                         */
3560                        wakeup(keg);
3561                }
3562        }
3563        KEG_UNLOCK(keg);
3564        if (clearfull) {
3565                ZONE_LOCK(zone);
3566                zone->uz_flags &= ~UMA_ZFLAG_FULL;
3567                wakeup(zone);
3568                ZONE_UNLOCK(zone);
3569        }
3570
3571}
3572
3573/*
3574 * Frees a single item to any zone.
3575 *
3576 * Arguments:
3577 *      zone   The zone to free to
3578 *      item   The item we're freeing
3579 *      udata  User supplied data for the dtor
3580 *      skip   Skip dtors and finis
3581 */
3582static void
3583zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3584{
3585#ifdef INVARIANTS
3586        bool skipdbg;
3587
3588        skipdbg = uma_dbg_zskip(zone, item);
3589        if (skip == SKIP_NONE && !skipdbg) {
3590                if (zone->uz_flags & UMA_ZONE_MALLOC)
3591                        uma_dbg_free(zone, udata, item);
3592                else
3593                        uma_dbg_free(zone, NULL, item);
3594        }
3595
3596        if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
3597            (!skipdbg || zone->uz_dtor != trash_dtor ||
3598            zone->uz_ctor != trash_ctor))
3599#else
3600        if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
3601#endif
3602                zone->uz_dtor(item, zone->uz_size, udata);
3603
3604        if (skip < SKIP_FINI && zone->uz_fini)
3605                zone->uz_fini(item, zone->uz_size);
3606
3607        atomic_add_long(&zone->uz_frees, 1);
3608        zone->uz_release(zone->uz_arg, &item, 1);
3609}
3610
3611/* See uma.h */
3612int
3613uma_zone_set_max(uma_zone_t zone, int nitems)
3614{
3615        uma_keg_t keg;
3616
3617        keg = zone_first_keg(zone);
3618        if (keg == NULL)
3619                return (0);
3620        KEG_LOCK(keg);
3621#ifdef __rtems__
3622#ifdef SMP
3623        /*
3624         * Ensure we have enough items to fill the per-processor caches.  This
3625         * is a heuristic approach and works not under all conditions.
3626         */
3627        nitems += 2 * BUCKET_MAX * (mp_maxid + 1);
3628#endif
3629#endif /* __rtems__ */
3630        keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
3631        if (keg->uk_maxpages * keg->uk_ipers < nitems)
3632                keg->uk_maxpages += keg->uk_ppera;
3633        nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3634        KEG_UNLOCK(keg);
3635
3636        return (nitems);
3637}
3638
3639/* See uma.h */
3640int
3641uma_zone_get_max(uma_zone_t zone)
3642{
3643        int nitems;
3644        uma_keg_t keg;
3645
3646        keg = zone_first_keg(zone);
3647        if (keg == NULL)
3648                return (0);
3649        KEG_LOCK(keg);
3650        nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3651        KEG_UNLOCK(keg);
3652
3653        return (nitems);
3654}
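/*
 * The limit is kept internally in pages, so the effective cap is rounded
 * up to whole slabs and, on RTEMS, additionally padded for the
 * per-processor caches.  A hypothetical consumer should therefore read
 * the value back rather than assume its request was taken literally:
 *
 *	(void)uma_zone_set_max(foo_zone, 1000);
 *	foo_max = uma_zone_get_max(foo_zone);
 */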
3655
3656/* See uma.h */
3657void
3658uma_zone_set_warning(uma_zone_t zone, const char *warning)
3659{
3660
3661        ZONE_LOCK(zone);
3662        zone->uz_warning = warning;
3663        ZONE_UNLOCK(zone);
3664}
3665
3666/* See uma.h */
3667void
3668uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3669{
3670
3671        ZONE_LOCK(zone);
3672        TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3673        ZONE_UNLOCK(zone);
3674}
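/*
 * Typical use of the two hooks above (names and sysctl are hypothetical):
 * the warning is logged, rate-limited, when the zone runs into its limit,
 * and the maxaction callback is queued to a taskqueue in the same case.
 * The callback signature assumed here matches uma_maxaction_t in uma.h.
 *
 *	static void	foo_reclaim(uma_zone_t, int);
 *
 *	uma_zone_set_warning(foo_zone,
 *	    "foo zone limit reached; consider raising kern.foo_max");
 *	uma_zone_set_maxaction(foo_zone, foo_reclaim);
 */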
3675
3676/* See uma.h */
3677int
3678uma_zone_get_cur(uma_zone_t zone)
3679{
3680        int64_t nitems;
3681        u_int i;
3682
3683        ZONE_LOCK(zone);
3684        nitems = zone->uz_allocs - zone->uz_frees;
3685        CPU_FOREACH(i) {
3686                /*
3687                 * See the comment in sysctl_vm_zone_stats() regarding the
3688                 * safety of accessing the per-cpu caches. With the zone lock
3689                 * held, it is safe, but can potentially result in stale data.
3690                 */
3691                nitems += zone->uz_cpu[i].uc_allocs -
3692                    zone->uz_cpu[i].uc_frees;
3693        }
3694        ZONE_UNLOCK(zone);
3695
3696        return (nitems < 0 ? 0 : nitems);
3697}
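/*
 * uma_zone_get_cur() pairs naturally with uma_zone_get_max() for rough
 * occupancy monitoring; the result is approximate because the per-CPU
 * counters are read unsynchronized, as noted above.  Hypothetical check:
 *
 *	if (uma_zone_get_cur(foo_zone) >= uma_zone_get_max(foo_zone) * 9 / 10)
 *		printf("foo zone is over 90%% full\n");
 */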
3698
3699/* See uma.h */
3700void
3701uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3702{
3703        uma_keg_t keg;
3704
3705        keg = zone_first_keg(zone);
3706        KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
3707        KEG_LOCK(keg);
3708        KASSERT(keg->uk_pages == 0,
3709            ("uma_zone_set_init on non-empty keg"));
3710        keg->uk_init = uminit;
3711        KEG_UNLOCK(keg);
3712}
3713
3714/* See uma.h */
3715void
3716uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3717{
3718        uma_keg_t keg;
3719
3720        keg = zone_first_keg(zone);
3721        KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
3722        KEG_LOCK(keg);
3723        KASSERT(keg->uk_pages == 0,
3724            ("uma_zone_set_fini on non-empty keg"));
3725        keg->uk_fini = fini;
3726        KEG_UNLOCK(keg);
3727}
3728
3729/* See uma.h */
3730void
3731uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3732{
3733
3734        ZONE_LOCK(zone);
3735        KASSERT(zone_first_keg(zone)->uk_pages == 0,
3736            ("uma_zone_set_zinit on non-empty keg"));
3737        zone->uz_init = zinit;
3738        ZONE_UNLOCK(zone);
3739}
3740
3741/* See uma.h */
3742void
3743uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3744{
3745
3746        ZONE_LOCK(zone);
3747        KASSERT(zone_first_keg(zone)->uk_pages == 0,
3748            ("uma_zone_set_zfini on non-empty keg"));
3749        zone->uz_fini = zfini;
3750        ZONE_UNLOCK(zone);
3751}
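/*
 * All four setters above require that the keg has not allocated any pages
 * yet (see the KASSERTs), so they must run right after zone creation.  A
 * sketch with hypothetical names, equivalent to passing the hooks to
 * uma_zcreate() directly:
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    NULL, NULL, UMA_ALIGN_PTR, 0);
 *	uma_zone_set_init(foo_zone, foo_init);
 *	uma_zone_set_fini(foo_zone, foo_fini);
 *
 * The zinit/zfini variants install the same kind of hooks at the zone
 * layer, which is what secondary zones sharing a keg use.
 */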
3752
3753/* See uma.h */
3754/* XXX uk_freef is not actually used with the zone locked */
3755void
3756uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3757{
3758        uma_keg_t keg;
3759
3760        keg = zone_first_keg(zone);
3761        KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3762        KEG_LOCK(keg);
3763        keg->uk_freef = freef;
3764        KEG_UNLOCK(keg);
3765}
3766
3767/* See uma.h */
3768/* XXX uk_allocf is not actually used with the zone locked */
3769void
3770uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3771{
3772        uma_keg_t keg;
3773
3774        keg = zone_first_keg(zone);
3775        KEG_LOCK(keg);
3776        keg->uk_allocf = allocf;
3777        KEG_UNLOCK(keg);
3778}
3779
3780/* See uma.h */
3781void
3782uma_zone_reserve(uma_zone_t zone, int items)
3783{
3784        uma_keg_t keg;
3785
3786        keg = zone_first_keg(zone);
3787        if (keg == NULL)
3788                return;
3789        KEG_LOCK(keg);
3790        keg->uk_reserve = items;
3791        KEG_UNLOCK(keg);
3792
3793        return;
3794}
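/*
 * The reserve configured above is only handed out to allocations that
 * explicitly ask for it, so a consumer with a must-not-fail path might do
 * (hypothetical zone and count; typically combined with uma_prealloc()
 * where that is available):
 *
 *	uma_zone_reserve(foo_zone, 8);
 *	...
 *	f = uma_zalloc(foo_zone, M_NOWAIT | M_USE_RESERVE);
 */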
3795
3796#ifndef __rtems__
3797/* See uma.h */
3798int
3799uma_zone_reserve_kva(uma_zone_t zone, int count)
3800{
3801        uma_keg_t keg;
3802        vm_offset_t kva;
3803        u_int pages;
3804
3805        keg = zone_first_keg(zone);
3806        if (keg == NULL)
3807                return (0);
3808        pages = count / keg->uk_ipers;
3809
3810        if (pages * keg->uk_ipers < count)
3811                pages++;
3812        pages *= keg->uk_ppera;
3813
3814#ifdef UMA_MD_SMALL_ALLOC
3815        if (keg->uk_ppera > 1) {
3816#else
3817        if (1) {
3818#endif
3819                kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3820                if (kva == 0)
3821                        return (0);
3822        } else
3823                kva = 0;
3824        KEG_LOCK(keg);
3825        keg->uk_kva = kva;
3826        keg->uk_offset = 0;
3827        keg->uk_maxpages = pages;
3828#ifdef UMA_MD_SMALL_ALLOC
3829        keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3830#else
3831        keg->uk_allocf = noobj_alloc;
3832#endif
3833        keg->uk_flags |= UMA_ZONE_NOFREE;
3834        KEG_UNLOCK(keg);
3835
3836        return (1);
3837}
3838
3839/* See uma.h */
3840void
3841uma_prealloc(uma_zone_t zone, int items)
3842{
3843        struct vm_domainset_iter di;
3844        uma_domain_t dom;
3845        uma_slab_t slab;
3846        uma_keg_t keg;
3847        int domain, flags, slabs;
3848
3849        keg = zone_first_keg(zone);
3850        if (keg == NULL)
3851                return;
3852        KEG_LOCK(keg);
3853        slabs = items / keg->uk_ipers;
3854        if (slabs * keg->uk_ipers < items)
3855                slabs++;
3856        flags = M_WAITOK;
3857        vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &flags);
3858        while (slabs-- > 0) {
3859                slab = keg_alloc_slab(keg, zone, domain, flags);
3860                if (slab == NULL)
3861                        return;
3862                MPASS(slab->us_keg == keg);
3863                dom = &keg->uk_domain[slab->us_domain];
3864                LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3865                if (vm_domainset_iter_policy(&di, &domain) != 0)
3866                        break;
3867        }
3868        KEG_UNLOCK(keg);
3869}
3870#endif /* __rtems__ */
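/*
 * A boot-time sketch for the routine above (hypothetical count): the slabs
 * are allocated eagerly and parked on the per-domain free-slab lists, so
 * early M_NOWAIT consumers are unlikely to miss.
 *
 *	uma_prealloc(foo_zone, 128);
 */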
3871
3872/* See uma.h */
3873static void
3874uma_reclaim_locked(bool kmem_danger)
3875{
3876
3877        CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3878        sx_assert(&uma_drain_lock, SA_XLOCKED);
3879        bucket_enable();
3880        zone_foreach(zone_drain);
3881#ifndef __rtems__
3882        if (vm_page_count_min() || kmem_danger) {
3883                cache_drain_safe(NULL);
3884                zone_foreach(zone_drain);
3885        }
3886#endif /* __rtems__ */
3887
3888        /*
3889         * Some slabs may have been freed but this zone will be visited early,
3890         * so visit it again to free pages that are empty once the other
3891         * zones are drained.  We have to do the same for buckets.
3892         */
3893        zone_drain(slabzone);
3894        bucket_zone_drain();
3895}
3896
3897void
3898uma_reclaim(void)
3899{
3900
3901        sx_xlock(&uma_drain_lock);
3902        uma_reclaim_locked(false);
3903        sx_xunlock(&uma_drain_lock);
3904}
3905
3906static volatile int uma_reclaim_needed;
3907
3908void
3909uma_reclaim_wakeup(void)
3910{
3911
3912        if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3913                wakeup(uma_reclaim);
3914}
3915
3916void
3917uma_reclaim_worker(void *arg __unused)
3918{
3919
3920        for (;;) {
3921                sx_xlock(&uma_drain_lock);
3922                while (atomic_load_int(&uma_reclaim_needed) == 0)
3923                        sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
3924                            hz);
3925#ifndef __rtems__
3926                sx_xunlock(&uma_drain_lock);
3927                EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3928                sx_xlock(&uma_drain_lock);
3929#endif /* __rtems__ */
3930                uma_reclaim_locked(true);
3931                atomic_store_int(&uma_reclaim_needed, 0);
3932                sx_xunlock(&uma_drain_lock);
3933                /* Don't fire more than once per-second. */
3934                pause("umarclslp", hz);
3935        }
3936}
3937
3938/* See uma.h */
3939int
3940uma_zone_exhausted(uma_zone_t zone)
3941{
3942        int full;
3943
3944        ZONE_LOCK(zone);
3945        full = (zone->uz_flags & UMA_ZFLAG_FULL);
3946        ZONE_UNLOCK(zone);
3947        return (full); 
3948}
3949
3950int
3951uma_zone_exhausted_nolock(uma_zone_t zone)
3952{
3953        return (zone->uz_flags & UMA_ZFLAG_FULL);
3954}
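/*
 * A caller that tolerates slightly stale information, e.g. for a
 * diagnostic, can use the lock-free variant (hypothetical message):
 *
 *	if (uma_zone_exhausted_nolock(foo_zone))
 *		log(LOG_WARNING, "foo zone exhausted, dropping request\n");
 */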
3955
3956#ifndef __rtems__
3957void *
3958uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3959{
3960        struct domainset *policy;
3961        vm_offset_t addr;
3962        uma_slab_t slab;
3963
3964        if (domain != UMA_ANYDOMAIN) {
3965                /* avoid allocs targeting empty domains */
3966                if (VM_DOMAIN_EMPTY(domain))
3967                        domain = UMA_ANYDOMAIN;
3968        }
3969        slab = zone_alloc_item(slabzone, NULL, domain, wait);
3970        if (slab == NULL)
3971                return (NULL);
3972        policy = (domain == UMA_ANYDOMAIN) ? DOMAINSET_RR() :
3973            DOMAINSET_FIXED(domain);
3974        addr = kmem_malloc_domainset(policy, size, wait);
3975        if (addr != 0) {
3976                vsetslab(addr, slab);
3977                slab->us_data = (void *)addr;
3978                slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3979                slab->us_size = size;
3980                slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3981                    pmap_kextract(addr)));
3982                uma_total_inc(size);
3983        } else {
3984                zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3985        }
3986
3987        return ((void *)addr);
3988}
3989
3990void *
3991uma_large_malloc(vm_size_t size, int wait)
3992{
3993
3994        return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
3995}
3996
3997void
3998uma_large_free(uma_slab_t slab)
3999{
4000
4001        KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
4002            ("uma_large_free:  Memory not allocated with uma_large_malloc."));
4003        kmem_free((vm_offset_t)slab->us_data, slab->us_size);
4004        uma_total_dec(slab->us_size);
4005        zone_free_item(slabzone, slab, NULL, SKIP_NONE);
4006}
4007#endif /* __rtems__ */
4008
4009static void
4010uma_zero_item(void *item, uma_zone_t zone)
4011{
4012
4013        bzero(item, zone->uz_size);
4014}
4015
4016unsigned long
4017uma_limit(void)
4018{
4019
4020        return (uma_kmem_limit);
4021}
4022
4023void
4024uma_set_limit(unsigned long limit)
4025{
4026
4027        uma_kmem_limit = limit;
4028}
4029
4030unsigned long
4031uma_size(void)
4032{
4033
4034        return (uma_kmem_total);
4035}
4036
4037long
4038uma_avail(void)
4039{
4040
4041        return (uma_kmem_limit - uma_kmem_total);
4042}
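/*
 * These accessors describe the global UMA byte budget: uma_size() is what
 * UMA currently has backed by kernel memory, uma_avail() the headroom left
 * before uma_limit() is reached.  A crude pre-flight check (hypothetical):
 *
 *	if (uma_avail() < (long)bytes_needed)
 *		return (ENOMEM);
 */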
4043
4044void
4045uma_print_stats(void)
4046{
4047        zone_foreach(uma_print_zone);
4048}
4049
4050static void
4051slab_print(uma_slab_t slab)
4052{
4053        printf("slab: keg %p, data %p, freecount %d\n",
4054                slab->us_keg, slab->us_data, slab->us_freecount);
4055}
4056
4057static void
4058cache_print(uma_cache_t cache)
4059{
4060        printf("alloc: %p(%d), free: %p(%d)\n",
4061                cache->uc_allocbucket,
4062                cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
4063                cache->uc_freebucket,
4064                cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
4065}
4066
4067static void
4068uma_print_keg(uma_keg_t keg)
4069{
4070        uma_domain_t dom;
4071        uma_slab_t slab;
4072        int i;
4073
4074        printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
4075            "out %d free %d limit %d\n",
4076            keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
4077            keg->uk_ipers, keg->uk_ppera,
4078            (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
4079            keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
4080        for (i = 0; i < vm_ndomains; i++) {
4081                dom = &keg->uk_domain[i];
4082                printf("Part slabs:\n");
4083                LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
4084                        slab_print(slab);
4085                printf("Free slabs:\n");
4086                LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
4087                        slab_print(slab);
4088                printf("Full slabs:\n");
4089                LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
4090                        slab_print(slab);
4091        }
4092}
4093
4094void
4095uma_print_zone(uma_zone_t zone)
4096{
4097        uma_cache_t cache;
4098        uma_klink_t kl;
4099        int i;
4100
4101        printf("zone: %s(%p) size %d flags %#x\n",
4102            zone->uz_name, zone, zone->uz_size, zone->uz_flags);
4103        LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
4104                uma_print_keg(kl->kl_keg);
4105        CPU_FOREACH(i) {
4106                cache = &zone->uz_cpu[i];
4107                printf("CPU %d Cache:\n", i);
4108                cache_print(cache);
4109        }
4110}
4111
4112#ifndef __rtems__
4113#ifdef DDB
4114/*
4115 * Generate statistics across both the zone and its per-cpu caches.  Return
4116 * desired statistics if the pointer is non-NULL for that statistic.
4117 *
4118 * Note: does not update the zone statistics, as it can't safely clear the
4119 * per-CPU cache statistic.
4120 *
4121 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
4122 * safe from off-CPU; we should modify the caches to track this information
4123 * directly so that we don't have to.
4124 */
4125static void
4126uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
4127    uint64_t *freesp, uint64_t *sleepsp)
4128{
4129        uma_cache_t cache;
4130        uint64_t allocs, frees, sleeps;
4131        int cachefree, cpu;
4132
4133        allocs = frees = sleeps = 0;
4134        cachefree = 0;
4135        CPU_FOREACH(cpu) {
4136                cache = &z->uz_cpu[cpu];
4137                if (cache->uc_allocbucket != NULL)
4138                        cachefree += cache->uc_allocbucket->ub_cnt;
4139                if (cache->uc_freebucket != NULL)
4140                        cachefree += cache->uc_freebucket->ub_cnt;
4141                allocs += cache->uc_allocs;
4142                frees += cache->uc_frees;
4143        }
4144        allocs += z->uz_allocs;
4145        frees += z->uz_frees;
4146        sleeps += z->uz_sleeps;
4147        if (cachefreep != NULL)
4148                *cachefreep = cachefree;
4149        if (allocsp != NULL)
4150                *allocsp = allocs;
4151        if (freesp != NULL)
4152                *freesp = frees;
4153        if (sleepsp != NULL)
4154                *sleepsp = sleeps;
4155}
4156#endif /* DDB */
4157#endif /* __rtems__ */
4158
4159static int
4160sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4161{
4162        uma_keg_t kz;
4163        uma_zone_t z;
4164        int count;
4165
4166        count = 0;
4167        rw_rlock(&uma_rwlock);
4168        LIST_FOREACH(kz, &uma_kegs, uk_link) {
4169                LIST_FOREACH(z, &kz->uk_zones, uz_link)
4170                        count++;
4171        }
4172        rw_runlock(&uma_rwlock);
4173        return (sysctl_handle_int(oidp, &count, 0, req));
4174}
4175
4176static int
4177sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4178{
4179        struct uma_stream_header ush;
4180        struct uma_type_header uth;
4181        struct uma_percpu_stat *ups;
4182        uma_zone_domain_t zdom;
4183        struct sbuf sbuf;
4184        uma_cache_t cache;
4185        uma_klink_t kl;
4186        uma_keg_t kz;
4187        uma_zone_t z;
4188        uma_keg_t k;
4189        int count, error, i;
4190
4191        error = sysctl_wire_old_buffer(req, 0);
4192        if (error != 0)
4193                return (error);
4194        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
4195        sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
4196        ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
4197
4198        count = 0;
4199        rw_rlock(&uma_rwlock);
4200        LIST_FOREACH(kz, &uma_kegs, uk_link) {
4201                LIST_FOREACH(z, &kz->uk_zones, uz_link)
4202                        count++;
4203        }
4204
4205        /*
4206         * Insert stream header.
4207         */
4208        bzero(&ush, sizeof(ush));
4209        ush.ush_version = UMA_STREAM_VERSION;
4210        ush.ush_maxcpus = (mp_maxid + 1);
4211        ush.ush_count = count;
4212        (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
4213
4214        LIST_FOREACH(kz, &uma_kegs, uk_link) {
4215                LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4216                        bzero(&uth, sizeof(uth));
4217                        ZONE_LOCK(z);
4218                        strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4219                        uth.uth_align = kz->uk_align;
4220                        uth.uth_size = kz->uk_size;
4221                        uth.uth_rsize = kz->uk_rsize;
4222                        LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
4223                                k = kl->kl_keg;
4224                                uth.uth_maxpages += k->uk_maxpages;
4225                                uth.uth_pages += k->uk_pages;
4226                                uth.uth_keg_free += k->uk_free;
4227                                uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
4228                                    * k->uk_ipers;
4229                        }
4230
4231                        /*
4232                         * A zone is secondary if it is not the first entry
4233                         * on the keg's zone list.
4234                         */
4235                        if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
4236                            (LIST_FIRST(&kz->uk_zones) != z))
4237                                uth.uth_zone_flags = UTH_ZONE_SECONDARY;
4238
4239                        for (i = 0; i < vm_ndomains; i++) {
4240                                zdom = &z->uz_domain[i];
4241                                uth.uth_zone_free += zdom->uzd_nitems;
4242                        }
4243                        uth.uth_allocs = z->uz_allocs;
4244                        uth.uth_frees = z->uz_frees;
4245                        uth.uth_fails = z->uz_fails;
4246                        uth.uth_sleeps = z->uz_sleeps;
4247                        /*
4248                         * While it is not normally safe to access the cache
4249                         * bucket pointers while not on the CPU that owns the
4250                         * cache, we only allow the pointers to be exchanged
4251                         * without the zone lock held, not invalidated, so
4252                         * accept the possible race associated with bucket
4253                         * exchange during monitoring.
4254                         */
4255                        for (i = 0; i < mp_maxid + 1; i++) {
4256                                bzero(&ups[i], sizeof(*ups));
4257                                if (kz->uk_flags & UMA_ZFLAG_INTERNAL ||
4258                                    CPU_ABSENT(i))
4259                                        continue;
4260                                cache = &z->uz_cpu[i];
4261                                if (cache->uc_allocbucket != NULL)
4262                                        ups[i].ups_cache_free +=
4263                                            cache->uc_allocbucket->ub_cnt;
4264                                if (cache->uc_freebucket != NULL)
4265                                        ups[i].ups_cache_free +=
4266                                            cache->uc_freebucket->ub_cnt;
4267                                ups[i].ups_allocs = cache->uc_allocs;
4268                                ups[i].ups_frees = cache->uc_frees;
4269                        }
4270                        ZONE_UNLOCK(z);
4271                        (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4272                        for (i = 0; i < mp_maxid + 1; i++)
4273                                (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4274                }
4275        }
4276        rw_runlock(&uma_rwlock);
4277        error = sbuf_finish(&sbuf);
4278        sbuf_delete(&sbuf);
4279        free(ups, M_TEMP);
4280        return (error);
4281}
4282
4283int
4284sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4285{
4286        uma_zone_t zone = *(uma_zone_t *)arg1;
4287        int error, max;
4288
4289        max = uma_zone_get_max(zone);
4290        error = sysctl_handle_int(oidp, &max, 0, req);
4291        if (error || !req->newptr)
4292                return (error);
4293
4294        uma_zone_set_max(zone, max);
4295
4296        return (0);
4297}
4298
4299int
4300sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4301{
4302        uma_zone_t zone = *(uma_zone_t *)arg1;
4303        int cur;
4304
4305        cur = uma_zone_get_cur(zone);
4306        return (sysctl_handle_int(oidp, &cur, 0, req));
4307}
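/*
 * The two handlers above are meant to be wired up with SYSCTL_PROC();
 * arg1 must point at the uma_zone_t variable itself.  A hypothetical
 * consumer could export its zone like this:
 *
 *	static uma_zone_t foo_zone;
 *
 *	SYSCTL_PROC(_kern, OID_AUTO, foo_max,
 *	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_max, "I", "Maximum number of foo items");
 *	SYSCTL_PROC(_kern, OID_AUTO, foo_cur,
 *	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_cur, "I", "Current number of foo items");
 */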
4308
4309#ifdef INVARIANTS
4310static uma_slab_t
4311uma_dbg_getslab(uma_zone_t zone, void *item)
4312{
4313        uma_slab_t slab;
4314        uma_keg_t keg;
4315        uint8_t *mem;
4316
4317        mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4318        if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4319                slab = vtoslab((vm_offset_t)mem);
4320        } else {
4321                /*
4322                 * It is safe to return the slab here even though the
4323                 * zone is unlocked because the item's allocation state
4324                 * essentially holds a reference.
4325                 */
4326                ZONE_LOCK(zone);
4327                keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
4328                if (keg->uk_flags & UMA_ZONE_HASH)
4329                        slab = hash_sfind(&keg->uk_hash, mem);
4330                else
4331                        slab = (uma_slab_t)(mem + keg->uk_pgoff);
4332                ZONE_UNLOCK(zone);
4333        }
4334
4335        return (slab);
4336}
4337
4338static bool
4339uma_dbg_zskip(uma_zone_t zone, void *mem)
4340{
4341        uma_keg_t keg;
4342
4343        if ((keg = zone_first_keg(zone)) == NULL)
4344                return (true);
4345
4346        return (uma_dbg_kskip(keg, mem));
4347}
4348
4349static bool
4350uma_dbg_kskip(uma_keg_t keg, void *mem)
4351{
4352        uintptr_t idx;
4353
4354        if (dbg_divisor == 0)
4355                return (true);
4356
4357        if (dbg_divisor == 1)
4358                return (false);
4359
4360        idx = (uintptr_t)mem >> PAGE_SHIFT;
4361        if (keg->uk_ipers > 1) {
4362                idx *= keg->uk_ipers;
4363                idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4364        }
4365
4366        if ((idx / dbg_divisor) * dbg_divisor != idx) {
4367                counter_u64_add(uma_skip_cnt, 1);
4368                return (true);
4369        }
4370        counter_u64_add(uma_dbg_cnt, 1);
4371
4372        return (false);
4373}
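/*
 * In other words, with dbg_divisor set to N only every Nth item (by its
 * index within the keg's item space) gets the trash and use-after-free
 * checks; a divisor of 3 checks item indices 0, 3, 6, ... and skips the
 * rest, trading coverage for allocation speed.
 */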
4374
4375/*
4376 * Set up the slab's freei data such that uma_dbg_free can function.
4377 *
4378 */
4379static void
4380uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4381{
4382        uma_keg_t keg;
4383        int freei;
4384
4385        if (slab == NULL) {
4386                slab = uma_dbg_getslab(zone, item);
4387                if (slab == NULL)
4388                        panic("uma: item %p did not belong to zone %s\n",
4389                            item, zone->uz_name);
4390        }
4391        keg = slab->us_keg;
4392        freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4393
4394        if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4395                panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4396                    item, zone, zone->uz_name, slab, freei);
4397        BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4398
4399        return;
4400}
4401
4402/*
4403 * Verifies freed addresses.  Checks for alignment, valid slab membership
4404 * and duplicate frees.
4405 *
4406 */
4407static void
4408uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4409{
4410        uma_keg_t keg;
4411        int freei;
4412
4413        if (slab == NULL) {
4414                slab = uma_dbg_getslab(zone, item);
4415                if (slab == NULL)
4416                        panic("uma: Freed item %p did not belong to zone %s\n",
4417                            item, zone->uz_name);
4418        }
4419        keg = slab->us_keg;
4420        freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4421
4422        if (freei >= keg->uk_ipers)
4423                panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4424                    item, zone, zone->uz_name, slab, freei);
4425
4426        if (((freei * keg->uk_rsize) + slab->us_data) != item)
4427                panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4428                    item, zone, zone->uz_name, slab, freei);
4429
4430        if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4431                panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4432                    item, zone, zone->uz_name, slab, freei);
4433
4434        BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4435}
4436#endif /* INVARIANTS */
4437
4438#ifndef __rtems__
4439#ifdef DDB
4440DB_SHOW_COMMAND(uma, db_show_uma)
4441{
4442        uma_keg_t kz;
4443        uma_zone_t z;
4444        uint64_t allocs, frees, sleeps;
4445        long cachefree;
4446        int i;
4447
4448        db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
4449            "Free", "Requests", "Sleeps", "Bucket");
4450        LIST_FOREACH(kz, &uma_kegs, uk_link) {
4451                LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4452                        if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4453                                allocs = z->uz_allocs;
4454                                frees = z->uz_frees;
4455                                sleeps = z->uz_sleeps;
4456                                cachefree = 0;
4457                        } else
4458                                uma_zone_sumstat(z, &cachefree, &allocs,
4459                                    &frees, &sleeps);
4460                        if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4461                            (LIST_FIRST(&kz->uk_zones) != z)))
4462                                cachefree += kz->uk_free;
4463                        for (i = 0; i < vm_ndomains; i++)
4464                                cachefree += z->uz_domain[i].uzd_nitems;
4465
4466                        db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u\n",
4467                            z->uz_name, (uintmax_t)kz->uk_size,
4468                            (intmax_t)(allocs - frees), cachefree,
4469                            (uintmax_t)allocs, sleeps, z->uz_count);
4470                        if (db_pager_quit)
4471                                return;
4472                }
4473        }
4474}
4475
4476DB_SHOW_COMMAND(umacache, db_show_umacache)
4477{
4478        uma_zone_t z;
4479        uint64_t allocs, frees;
4480        long cachefree;
4481        int i;
4482
4483        db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4484            "Requests", "Bucket");
4485        LIST_FOREACH(z, &uma_cachezones, uz_link) {
4486                uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
4487                for (i = 0; i < vm_ndomains; i++)
4488                        cachefree += z->uz_domain[i].uzd_nitems;
4489                db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
4490                    z->uz_name, (uintmax_t)z->uz_size,
4491                    (intmax_t)(allocs - frees), cachefree,
4492                    (uintmax_t)allocs, z->uz_count);
4493                if (db_pager_quit)
4494                        return;
4495        }
4496}
4497#endif  /* DDB */
4498#endif /* __rtems__ */
4499#ifdef __rtems__
4500static void
4501rtems_bsd_uma_startup(void *unused)
4502{
4503        (void) unused;
4504
4505        uma_kmem_limit = rtems_bsd_get_allocator_domain_size(
4506            RTEMS_BSD_ALLOCATOR_DOMAIN_PAGE);
4507        sx_init_flags(&uma_drain_lock, "umadrain", SX_RECURSE);
4508        uma_startup(NULL, 0);
4509}
4510
4511SYSINIT(rtems_bsd_uma_startup, SI_SUB_VM, SI_ORDER_SECOND,
4512    rtems_bsd_uma_startup, NULL);
4513
4514/*
4515 * This is a helper routine for test programs.  The uma_timeout() may need some
4516 * dynamic memory.  This could disturb out-of-memory tests.
4517 */
4518void
4519rtems_uma_drain_timeout(void)
4520{
4521
4522        callout_drain(&uma_callout);
4523}
4524#endif /* __rtems__ */