1 | #include <machine/rtems-bsd-kernel-space.h> |
---|
2 | |
---|
3 | /*- |
---|
4 | * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa |
---|
5 | * All rights reserved |
---|
6 | * |
---|
7 | * Redistribution and use in source and binary forms, with or without |
---|
8 | * modification, are permitted provided that the following conditions |
---|
9 | * are met: |
---|
10 | * 1. Redistributions of source code must retain the above copyright |
---|
11 | * notice, this list of conditions and the following disclaimer. |
---|
12 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
13 | * notice, this list of conditions and the following disclaimer in the |
---|
14 | * documentation and/or other materials provided with the distribution. |
---|
15 | * |
---|
16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
---|
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
---|
20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
---|
21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
---|
22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
---|
23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
---|
24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
---|
25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
---|
26 | * SUCH DAMAGE. |
---|
27 | */ |
---|
28 | |
---|
29 | /* |
---|
30 | * Dummynet portions related to packet handling. |
---|
31 | */ |
---|
32 | #include <sys/cdefs.h> |
---|
33 | __FBSDID("$FreeBSD$"); |
---|
34 | |
---|
35 | #include <rtems/bsd/local/opt_inet6.h> |
---|
36 | |
---|
37 | #include <rtems/bsd/sys/param.h> |
---|
38 | #include <sys/systm.h> |
---|
39 | #include <sys/malloc.h> |
---|
40 | #include <sys/mbuf.h> |
---|
41 | #include <sys/kernel.h> |
---|
42 | #include <rtems/bsd/sys/lock.h> |
---|
43 | #include <sys/module.h> |
---|
44 | #include <sys/priv.h> |
---|
45 | #include <sys/proc.h> |
---|
46 | #include <sys/rwlock.h> |
---|
47 | #include <sys/socket.h> |
---|
48 | #include <rtems/bsd/sys/time.h> |
---|
49 | #include <sys/sysctl.h> |
---|
50 | |
---|
51 | #include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ |
---|
52 | #include <net/netisr.h> |
---|
53 | #include <net/vnet.h> |
---|
54 | |
---|
55 | #include <netinet/in.h> |
---|
56 | #include <netinet/ip.h> /* ip_len, ip_off */ |
---|
57 | #include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ |
---|
58 | #include <netinet/ip_fw.h> |
---|
59 | #include <netinet/ip_dummynet.h> |
---|
60 | #include <netinet/if_ether.h> /* various ether_* routines */ |
---|
61 | #include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ |
---|
62 | #include <netinet6/ip6_var.h> |
---|
63 | |
---|
64 | #include <netpfil/ipfw/ip_fw_private.h> |
---|
65 | #include <netpfil/ipfw/dn_heap.h> |
---|
66 | #include <netpfil/ipfw/ip_dn_private.h> |
---|
67 | #include <netpfil/ipfw/dn_sched.h> |
---|
68 | |
---|
69 | /* |
---|
70 | * We keep a private variable for the simulation time, but we could |
---|
71 | * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) |
---|
72 | * instead of dn_cfg.curr_time |
---|
73 | */ |
---|
74 | |
---|
75 | struct dn_parms dn_cfg; |
---|
76 | //VNET_DEFINE(struct dn_parms, _base_dn_cfg); |
---|
77 | |
---|
78 | static long tick_last; /* Last tick duration (usec). */ |
---|
79 | static long tick_delta; /* Last vs standard tick diff (usec). */ |
---|
80 | static long tick_delta_sum; /* Accumulated tick difference (usec).*/ |
---|
81 | static long tick_adjustment; /* Tick adjustments done. */ |
---|
82 | static long tick_lost; /* Lost(coalesced) ticks number. */ |
---|
83 | /* Adjusted vs non-adjusted curr_time difference (ticks). */ |
---|
84 | static long tick_diff; |
---|
85 | |
---|
86 | static unsigned long io_pkt; |
---|
87 | static unsigned long io_pkt_fast; |
---|
88 | static unsigned long io_pkt_drop; |
---|
89 | |
---|
90 | /* |
---|
91 | * We use a heap to store entities for which we have pending timer events. |
---|
92 | * The heap is checked at every tick and all entities with expired events |
---|
93 | * are extracted. |
---|
94 | */ |
---|
95 | |
---|
96 | MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); |
---|
97 | |
---|
98 | extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); |
---|
99 | |
---|
100 | #ifdef SYSCTL_NODE |
---|
101 | |
---|
102 | SYSBEGIN(f4) |
---|
103 | |
---|
104 | SYSCTL_DECL(_net_inet); |
---|
105 | SYSCTL_DECL(_net_inet_ip); |
---|
106 | static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); |
---|
107 | |
---|
108 | /* wrapper to pass dn_cfg fields to SYSCTL_* */ |
---|
109 | //#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) |
---|
110 | #define DC(x) (&(dn_cfg.x)) |
---|
111 | /* parameters */ |
---|
112 | |
---|
113 | static int |
---|
114 | sysctl_hash_size(SYSCTL_HANDLER_ARGS) |
---|
115 | { |
---|
116 | int error, value; |
---|
117 | |
---|
118 | value = dn_cfg.hash_size; |
---|
119 | error = sysctl_handle_int(oidp, &value, 0, req); |
---|
120 | if (error != 0 || req->newptr == NULL) |
---|
121 | return (error); |
---|
122 | if (value < 16 || value > 65536) |
---|
123 | return (EINVAL); |
---|
124 | dn_cfg.hash_size = value; |
---|
125 | return (0); |
---|
126 | } |
---|
127 | |
---|
128 | SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size, |
---|
129 | CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size, |
---|
130 | "I", "Default hash table size"); |
---|
131 | |
---|
132 | static int |
---|
133 | sysctl_limits(SYSCTL_HANDLER_ARGS) |
---|
134 | { |
---|
135 | int error; |
---|
136 | long value; |
---|
137 | |
---|
138 | if (arg2 != 0) |
---|
139 | value = dn_cfg.slot_limit; |
---|
140 | else |
---|
141 | value = dn_cfg.byte_limit; |
---|
142 | error = sysctl_handle_long(oidp, &value, 0, req); |
---|
143 | |
---|
144 | if (error != 0 || req->newptr == NULL) |
---|
145 | return (error); |
---|
146 | if (arg2 != 0) { |
---|
147 | if (value < 1) |
---|
148 | return (EINVAL); |
---|
149 | dn_cfg.slot_limit = value; |
---|
150 | } else { |
---|
151 | if (value < 1500) |
---|
152 | return (EINVAL); |
---|
153 | dn_cfg.byte_limit = value; |
---|
154 | } |
---|
155 | return (0); |
---|
156 | } |
---|
157 | |
---|
158 | SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, |
---|
159 | CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits, |
---|
160 | "L", "Upper limit in slots for pipe queue."); |
---|
161 | SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, |
---|
162 | CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits, |
---|
163 | "L", "Upper limit in bytes for pipe queue."); |
---|
164 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, |
---|
165 | CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); |
---|
166 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, |
---|
167 | CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); |
---|
168 | |
---|
169 | /* RED parameters */ |
---|
170 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, |
---|
171 | CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); |
---|
172 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, |
---|
173 | CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); |
---|
174 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, |
---|
175 | CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); |
---|
176 | |
---|
177 | /* time adjustment */ |
---|
178 | SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, |
---|
179 | CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); |
---|
180 | SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, |
---|
181 | CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); |
---|
182 | SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, |
---|
183 | CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); |
---|
184 | SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, |
---|
185 | CTLFLAG_RD, &tick_diff, 0, |
---|
186 | "Adjusted vs non-adjusted curr_time difference (ticks)."); |
---|
187 | SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, |
---|
188 | CTLFLAG_RD, &tick_lost, 0, |
---|
189 | "Number of ticks coalesced by dummynet taskqueue."); |
---|
190 | |
---|
191 | /* Drain parameters */ |
---|
192 | SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, |
---|
193 | CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); |
---|
194 | SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, |
---|
195 | CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); |
---|
196 | |
---|
197 | /* statistics */ |
---|
198 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, |
---|
199 | CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); |
---|
200 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, |
---|
201 | CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); |
---|
202 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, |
---|
203 | CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); |
---|
204 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, |
---|
205 | CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); |
---|
206 | SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, |
---|
207 | CTLFLAG_RD, &io_pkt, 0, |
---|
208 | "Number of packets passed to dummynet."); |
---|
209 | SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, |
---|
210 | CTLFLAG_RD, &io_pkt_fast, 0, |
---|
211 | "Number of packets bypassed dummynet scheduler."); |
---|
212 | SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, |
---|
213 | CTLFLAG_RD, &io_pkt_drop, 0, |
---|
214 | "Number of packets dropped by dummynet."); |
---|
215 | #undef DC |
---|
216 | SYSEND |
---|
217 | |
---|
218 | #endif |
---|
219 | |
---|
220 | static void dummynet_send(struct mbuf *); |
---|
221 | |
---|
/*
 * Packets processed by dummynet have an mbuf tag associated with
 * them that carries their dummynet state.
 * Outside dummynet, only the 'rule' field is relevant, and it must
 * be at the beginning of the structure.
 */
struct dn_pkt_tag {
	struct ipfw_rule_ref rule;	/* matching rule, must be first */

	/* second part, dummynet specific */
	int dn_dir;			/* action when packet comes out.*/
					/* see ip_fw_private.h */
	uint64_t output_time;		/* when the pkt is due for delivery*/
	struct ifnet *ifp;		/* interface, for ip_output */
	struct _ip6dn_args ip6opt;	/* XXX ipv6 options */
};
---|
238 | |
---|
/*
 * Return the mbuf tag holding the dummynet state (it should
 * be the first one on the list).
 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
	struct m_tag *mtag = m_tag_first(m);
	/* A packet on a dummynet queue must carry the dummynet tag first. */
	KASSERT(mtag != NULL &&
	    mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
	    mtag->m_tag_id == PACKET_TAG_DUMMYNET,
	    ("packet on dummynet queue w/o dummynet tag!"));
	/* The tag payload immediately follows the m_tag header. */
	return (struct dn_pkt_tag *)(mtag+1);
}
---|
253 | |
---|
254 | static inline void |
---|
255 | mq_append(struct mq *q, struct mbuf *m) |
---|
256 | { |
---|
257 | if (q->head == NULL) |
---|
258 | q->head = m; |
---|
259 | else |
---|
260 | q->tail->m_nextpkt = m; |
---|
261 | q->tail = m; |
---|
262 | m->m_nextpkt = NULL; |
---|
263 | } |
---|
264 | |
---|
265 | /* |
---|
266 | * Dispose a list of packet. Use a functions so if we need to do |
---|
267 | * more work, this is a central point to do it. |
---|
268 | */ |
---|
269 | void dn_free_pkts(struct mbuf *mnext) |
---|
270 | { |
---|
271 | struct mbuf *m; |
---|
272 | |
---|
273 | while ((m = mnext) != NULL) { |
---|
274 | mnext = m->m_nextpkt; |
---|
275 | FREE_PKT(m); |
---|
276 | } |
---|
277 | } |
---|
278 | |
---|
/*
 * Apply the RED/Gentle-RED drop policy to queue 'q' for an incoming
 * packet of 'len' bytes. Returns 1 if the packet must be dropped,
 * 0 to accept it. Called from dn_enqueue() under the dummynet lock.
 */
static int
red_drops (struct dn_queue *q, int len)
{
	/*
	 * RED algorithm
	 *
	 * RED calculates the average queue size (avg) using a low-pass filter
	 * with an exponential weighted (w_q) moving average:
	 * 	avg <- (1-w_q) * avg + w_q * q_size
	 * where q_size is the queue length (measured in bytes or * packets).
	 *
	 * If q_size == 0, we compute the idle time for the link, and set
	 *	avg = (1 - w_q)^(idle/s)
	 * where s is the time needed for transmitting a medium-sized packet.
	 *
	 * Now, if avg < min_th the packet is enqueued.
	 * If avg > max_th the packet is dropped. Otherwise, the packet is
	 * dropped with probability P function of avg.
	 */

	struct dn_fsk *fs = q->fs;
	int64_t p_b = 0;

	/* Queue in bytes or packets? */
	uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
	    q->ni.len_bytes : q->ni.length;

	/* Average queue size estimation. */
	if (q_size != 0) {
		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
		int diff = SCALE(q_size) - q->avg;
		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);

		q->avg += (int)v;
	} else {
		/*
		 * Queue is empty, find for how long the queue has been
		 * empty and use a lookup table for computing
		 * (1 - * w_q)^(idle_time/s) where s is the time to send a
		 * (small) packet.
		 * XXX check wraps...
		 */
		if (q->avg) {
			u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);

			/* Past the end of the table the average decays to 0. */
			q->avg = (t < fs->lookup_depth) ?
			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
		}
	}

	/* Should i drop? */
	if (q->avg < fs->min_th) {
		/* Below min threshold: always accept, reset drop counter. */
		q->count = -1;
		return (0); /* accept packet */
	}
	if (q->avg >= fs->max_th) { /* average queue >= max threshold */
		if (fs->fs.flags & DN_IS_GENTLE_RED) {
			/*
			 * According to Gentle-RED, if avg is greater than
			 * max_th the packet is dropped with a probability
			 *	 p_b = c_3 * avg - c_4
			 * where c_3 = (1 - max_p) / max_th
			 * 	 c_4 = 1 - 2 * max_p
			 */
			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
			    fs->c_4;
		} else {
			/* Plain RED: unconditional drop above max_th. */
			q->count = -1;
			return (1);
		}
	} else if (q->avg > fs->min_th) {
		/*
		 * We compute p_b using the linear dropping function
		 *	 p_b = c_1 * avg - c_2
		 * where c_1 = max_p / (max_th - min_th)
		 * 	 c_2 = max_p * min_th / (max_th - min_th)
		 */
		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
	}

	/* Scale drop probability by packet size when queueing in bytes. */
	if (fs->fs.flags & DN_QSIZE_BYTES)
		p_b = div64((p_b * len) , fs->max_pkt_size);
	if (++q->count == 0)
		q->random = random() & 0xffff;
	else {
		/*
		 * q->count counts packets arrived since last drop, so a greater
		 * value of q->count means a greater packet drop probability.
		 */
		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
			q->count = 0;
			/* After a drop we calculate a new random value. */
			q->random = random() & 0xffff;
			return (1); /* drop */
		}
	}
	/* End of RED algorithm. */

	return (0); /* accept */

}
---|
380 | |
---|
/*
 * Enqueue a packet in q, subject to space and queue management policy
 * (whose parameters are in q->fs).
 * Update stats for the queue and the scheduler.
 * Return 0 on success, 1 on drop. The packet is consumed anyways.
 */
int
dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
{
	struct dn_fs *f;
	struct dn_flow *ni;	/* stats for scheduler instance */
	uint64_t len;

	/* Sanity check: the queue must be bound to a flowset and a
	 * scheduler instance; otherwise drop the packet. */
	if (q->fs == NULL || q->_si == NULL) {
		printf("%s fs %p si %p, dropping\n",
			__FUNCTION__, q->fs, q->_si);
		FREE_PKT(m);
		return 1;
	}
	f = &(q->fs->fs);
	ni = &q->_si->ni;
	len = m->m_pkthdr.len;
	/* Update statistics, then check reasons to drop pkt. */
	q->ni.tot_bytes += len;
	q->ni.tot_pkts++;
	ni->tot_bytes += len;
	ni->tot_pkts++;
	if (drop)
		goto drop;	/* caller requested the drop */
	if (f->plr && random() < f->plr)
		goto drop;	/* random loss (packet loss rate emulation) */
	if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len))
		goto drop;	/* dropped by RED/GRED policy */
	/* Enforce the configured queue size limit (bytes or packets). */
	if (f->flags & DN_QSIZE_BYTES) {
		if (q->ni.len_bytes > f->qsize)
			goto drop;
	} else if (q->ni.length >= f->qsize) {
		goto drop;
	}
	/* Accepted: append and update queue/instance occupancy counters. */
	mq_append(&q->mq, m);
	q->ni.length++;
	q->ni.len_bytes += len;
	ni->length++;
	ni->len_bytes += len;
	return 0;

drop:
	io_pkt_drop++;
	q->ni.drops++;
	ni->drops++;
	FREE_PKT(m);
	return 1;
}
---|
434 | |
---|
/*
 * Fetch packets from the delay line which are due now. If there are
 * leftover packets, reinsert the delay line in the heap.
 * Runs under scheduler lock.
 */
static void
transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
{
	struct mbuf *m;
	struct dn_pkt_tag *pkt = NULL;

	dline->oid.subtype = 0; /* not in heap */
	/* Move every packet whose output_time has arrived onto q. */
	while ((m = dline->mq.head) != NULL) {
		pkt = dn_tag_get(m);
		if (!DN_KEY_LEQ(pkt->output_time, now))
			break;
		dline->mq.head = m->m_nextpkt;
		mq_append(q, m);
	}
	/*
	 * If a packet remains, 'pkt' is the tag of the first not-yet-due
	 * one; reschedule the delay line at that packet's output time.
	 */
	if (m != NULL) {
		dline->oid.subtype = 1; /* in heap */
		heap_insert(&dn_cfg.evheap, pkt->output_time, dline);
	}
}
---|
459 | |
---|
/*
 * Convert the additional MAC overheads/delays into an equivalent
 * number of bits for the given data rate. The samples are
 * in milliseconds so we need to divide by 1000.
 */
static uint64_t
extra_bits(struct mbuf *m, struct dn_schk *s)
{
	int index;
	uint64_t bits;
	struct dn_profile *pf = s->profile;

	/* No profile configured: no extra overhead to charge. */
	if (!pf || pf->samples_no == 0)
		return 0;
	/* Pick a random sample from the empirical profile. */
	index  = random() % pf->samples_no;
	bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
	/* Samples at or above loss_level model a lost frame: mark the
	 * packet so dummynet_send() will drop it. */
	if (index >= pf->loss_level) {
		struct dn_pkt_tag *dt = dn_tag_get(m);
		if (dt)
			dt->dn_dir = DIR_DROP;
	}
	return bits;
}
---|
483 | |
---|
/*
 * Send traffic from a scheduler instance due by 'now'.
 * Return a pointer to the head of the queue.
 */
static struct mbuf *
serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
{
	struct mq def_q;
	struct dn_schk *s = si->sched;
	struct mbuf *m = NULL;
	int delay_line_idle = (si->dline.mq.head == NULL);
	int done, bw;

	/* If the caller did not supply an output queue, use a local one. */
	if (q == NULL) {
		q = &def_q;
		q->head = NULL;
	}

	bw = s->link.bandwidth;
	si->kflags &= ~DN_ACTIVE;

	/* Accrue transmit credit for the elapsed time; bw == 0 means an
	 * unlimited link, for which credit is irrelevant. */
	if (bw > 0)
		si->credit += (now - si->sched_time) * bw;
	else
		si->credit = 0;
	si->sched_time = now;
	done = 0;
	/* Dequeue packets while credit lasts, charging each one its
	 * (scaled) transmission cost and moving it to the delay line. */
	while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
		uint64_t len_scaled;

		done++;
		len_scaled = (bw == 0) ? 0 : hz *
			(m->m_pkthdr.len * 8 + extra_bits(m, s));
		si->credit -= len_scaled;
		/* Move packet in the delay line */
		dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ;
		mq_append(&si->dline.mq, m);
	}

	/*
	 * If credit >= 0 the instance is idle, mark time.
	 * Otherwise put back in the heap, and adjust the output
	 * time of the last inserted packet, m, which was too early.
	 */
	if (si->credit >= 0) {
		si->idle_time = now;
	} else {
		uint64_t t;
		KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
		/* Ticks needed to earn back the negative credit (round up). */
		t = div64(bw - 1 - si->credit, bw);
		if (m)
			dn_tag_get(m)->output_time += t;
		si->kflags |= DN_ACTIVE;
		heap_insert(&dn_cfg.evheap, now + t, si);
	}
	/* If the delay line was idle before we fed it, it is not in the
	 * event heap, so drain its due packets right away. */
	if (delay_line_idle && done)
		transmit_event(q, &si->dline, now);
	return q->head;
}
---|
543 | |
---|
/*
 * The timer handler for dummynet. Time is computed in ticks, but
 * the code is tolerant to the actual rate at which this is called.
 * Once complete, the function reschedules itself for the next tick.
 */
void
dummynet_task(void *context, int pending)
{
	struct timeval t;
	struct mq q = { NULL, NULL }; /* queue to accumulate results */

	CURVNET_SET((struct vnet *)context);

	DN_BH_WLOCK();

	/* Update number of lost(coalesced) ticks. */
	tick_lost += pending - 1;

	getmicrouptime(&t);
	/* Last tick duration (usec). */
	tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
	(t.tv_usec - dn_cfg.prev_t.tv_usec);
	/* Last tick vs standard tick difference (usec). */
	tick_delta = (tick_last * hz - 1000000) / hz;
	/* Accumulated tick difference (usec). */
	tick_delta_sum += tick_delta;

	dn_cfg.prev_t = t;

	/*
	 * Adjust curr_time if the accumulated tick difference is
	 * greater than the 'standard' tick. Since curr_time should
	 * be monotonically increasing, we do positive adjustments
	 * as required, and throttle curr_time in case of negative
	 * adjustment.
	 */
	dn_cfg.curr_time++;
	if (tick_delta_sum - tick >= 0) {
		/* Running late: jump curr_time forward by whole ticks. */
		int diff = tick_delta_sum / tick;

		dn_cfg.curr_time += diff;
		tick_diff += diff;
		tick_delta_sum %= tick;
		tick_adjustment++;
	} else if (tick_delta_sum + tick <= 0) {
		/* Running early: hold curr_time back by one tick. */
		dn_cfg.curr_time--;
		tick_diff--;
		tick_delta_sum += tick;
		tick_adjustment++;
	}

	/* serve pending events, accumulate in q */
	for (;;) {
		struct dn_id *p;    /* generic parameter to handler */

		/* Stop when the heap is empty or the next event is in
		 * the future. */
		if (dn_cfg.evheap.elements == 0 ||
		    DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
			break;
		p = HEAP_TOP(&dn_cfg.evheap)->object;
		heap_extract(&dn_cfg.evheap, NULL);

		/* A heap entry is either a scheduler instance or a delay
		 * line; dispatch accordingly. */
		if (p->type == DN_SCH_I) {
			serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
		} else { /* extracted a delay line */
			transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
		}
	}
	/* Periodically reclaim idle queues and scheduler instances. */
	if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
		dn_cfg.expire_cycle = 0;
		dn_drain_scheduler();
		dn_drain_queue();
	}

	DN_BH_WUNLOCK();
	dn_reschedule();
	/* Deliver the accumulated packets outside the lock. */
	if (q.head != NULL)
		dummynet_send(q.head);
	CURVNET_RESTORE();
}
---|
623 | |
---|
/*
 * Forward a chain of packets to the proper destination, as recorded
 * in each packet's dummynet tag (dn_dir).
 * This runs outside the dummynet lock.
 */
static void
dummynet_send(struct mbuf *m)
{
	struct mbuf *n;

	for (; m != NULL; m = n) {
		struct ifnet *ifp = NULL;	/* gcc 3.4.6 complains */
		struct m_tag *tag;
		int dst;

		/* Detach the packet from the chain before dispatching it. */
		n = m->m_nextpkt;
		m->m_nextpkt = NULL;
		tag = m_tag_first(m);
		if (tag == NULL) { /* should not happen */
			dst = DIR_DROP;
		} else {
			struct dn_pkt_tag *pkt = dn_tag_get(m);
			/* extract the dummynet info, rename the tag
			 * to carry reinject info.
			 */
			dst = pkt->dn_dir;
			ifp = pkt->ifp;
			tag->m_tag_cookie = MTAG_IPFW_RULE;
			tag->m_tag_id = 0;
		}

		switch (dst) {
		case DIR_OUT:
			SET_HOST_IPLEN(mtod(m, struct ip *));
			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
			break ;

		case DIR_IN :
			/* put header in network format for ip_input() */
			//SET_NET_IPLEN(mtod(m, struct ip *));
			netisr_dispatch(NETISR_IP, m);
			break;

#ifdef INET6
		case DIR_IN | PROTO_IPV6:
			netisr_dispatch(NETISR_IPV6, m);
			break;

		case DIR_OUT | PROTO_IPV6:
			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
			break;
#endif

		case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
			/* Hand the packet to if_bridge, if it is loaded. */
			if (bridge_dn_p != NULL)
				((*bridge_dn_p)(m, ifp));
			else
				printf("dummynet: if_bridge not loaded\n");

			break;

		case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
			/*
			 * The Ethernet code assumes the Ethernet header is
			 * contiguous in the first mbuf header.
			 * Insure this is true.
			 */
			if (m->m_len < ETHER_HDR_LEN &&
			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
				printf("dummynet/ether: pullup failed, "
				    "dropping packet\n");
				break;
			}
			ether_demux(m->m_pkthdr.rcvif, m);
			break;

		case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
			ether_output_frame(ifp, m);
			break;

		case DIR_DROP:
			/* drop the packet after some time */
			FREE_PKT(m);
			break;

		default:
			printf("dummynet: bad switch %d!\n", dst);
			FREE_PKT(m);
			break;
		}
	}
}
---|
715 | |
---|
/*
 * Attach a dummynet state tag (struct dn_pkt_tag) to mbuf 'm',
 * recording the matching rule, direction and output interface.
 * Returns 0 on success, 1 if the tag could not be allocated.
 */
static inline int
tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
{
	struct dn_pkt_tag *dt;
	struct m_tag *mtag;

	mtag = m_tag_get(PACKET_TAG_DUMMYNET,
		    sizeof(*dt), M_NOWAIT | M_ZERO);
	if (mtag == NULL)
		return 1;		/* Cannot allocate packet header. */
	m_tag_prepend(m, mtag);		/* Attach to mbuf chain. */
	dt = (struct dn_pkt_tag *)(mtag + 1);
	dt->rule = fwa->rule;
	dt->rule.info &= IPFW_ONEPASS;	/* only keep this info */
	dt->dn_dir = dir;
	dt->ifp = fwa->oif;
	/* dt->output_time is updated as we move through */
	dt->output_time = dn_cfg.curr_time;
	return 0;
}
---|
736 | |
---|
737 | |
---|
/*
 * dummynet hook for packets.
 * We use the argument to locate the flowset fs and the sched_set sch
 * associated to it. The we apply flow_mask and sched_mask to
 * determine the queue and scheduler instances.
 *
 * dir		where shall we send the packet after dummynet.
 * *m0		the mbuf with the packet
 * ifp		the 'ifp' parameter from the caller.
 *		NULL in ip_input, destination interface in ip_output,
 *
 * Returns 0 on success (packet consumed or sent back to the caller);
 * on drop, returns ENOBUFS unless the flowset has DN_NOERROR set.
 */
int
dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
{
	struct mbuf *m = *m0;
	struct dn_fsk *fs = NULL;
	struct dn_sch_inst *si;
	struct dn_queue *q = NULL;	/* default */

	/* Pipes are stored past all queue ids in fshash, hence the offset. */
	int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
		((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
	DN_BH_WLOCK();
	io_pkt++;
	/* we could actually tag outside the lock, but who cares... */
	if (tag_mbuf(m, dir, fwa))
		goto dropit;	/* could not allocate the state tag */
	if (dn_cfg.busy) {
		/* if the upper half is busy doing something expensive,
		 * lets queue the packet and move forward
		 */
		mq_append(&dn_cfg.pending, m);
		m = *m0 = NULL; /* consumed */
		goto done; /* already active, nothing to do */
	}
	/* XXX locate_flowset could be optimised with a direct ref. */
	fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
	if (fs == NULL)
		goto dropit;	/* This queue/pipe does not exist! */
	if (fs->sched == NULL)	/* should not happen */
		goto dropit;
	/* find scheduler instance, possibly applying sched_mask */
	si = ipdn_si_find(fs->sched, &(fwa->f_id));
	if (si == NULL)
		goto dropit;
	/*
	 * If the scheduler supports multiple queues, find the right one
	 * (otherwise it will be ignored by enqueue).
	 */
	if (fs->sched->fp->flags & DN_MULTIQUEUE) {
		q = ipdn_q_find(fs, si, &(fwa->f_id));
		if (q == NULL)
			goto dropit;
	}
	if (fs->sched->fp->enqueue(si, q, m)) {
		/* packet was dropped by enqueue() */
		m = *m0 = NULL;
		goto dropit;
	}

	if (si->kflags & DN_ACTIVE) {
		m = *m0 = NULL; /* consumed */
		goto done; /* already active, nothing to do */
	}

	/* compute the initial allowance */
	if (si->idle_time < dn_cfg.curr_time) {
		/* Do this only on the first packet on an idle pipe */
		struct dn_link *p = &fs->sched->link;

		si->sched_time = dn_cfg.curr_time;
		si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
		if (p->burst) {
			/* Credit for idle time, capped at the burst size. */
			uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
			if (burst > p->burst)
				burst = p->burst;
			si->credit += burst;
		}
	}
	/* pass through scheduler and delay line */
	m = serve_sched(NULL, si, dn_cfg.curr_time);

	/* optimization -- pass it back to ipfw for immediate send */
	/* XXX Don't call dummynet_send() if scheduler return the packet
	 * just enqueued. This avoid a lock order reversal.
	 *
	 */
	if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
		/* fast io, rename the tag * to carry reinject info. */
		struct m_tag *tag = m_tag_first(m);

		tag->m_tag_cookie = MTAG_IPFW_RULE;
		tag->m_tag_id = 0;
		io_pkt_fast++;
		if (m->m_nextpkt != NULL) {
			printf("dummynet: fast io: pkt chain detected!\n");
			m->m_nextpkt = NULL;
		}
		m = NULL;	/* caller (ipfw) will send *m0 itself */
	} else {
		*m0 = NULL;	/* consumed by dummynet */
	}
done:
	DN_BH_WUNLOCK();
	/* Packets already due are delivered outside the lock. */
	if (m)
		dummynet_send(m);
	return 0;

dropit:
	io_pkt_drop++;
	DN_BH_WUNLOCK();
	if (m)
		FREE_PKT(m);
	*m0 = NULL;
	return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
}
---|