1 | #include <machine/rtems-bsd-config.h> |
---|
2 | |
---|
3 | /*- |
---|
4 | * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 |
---|
5 | * The Regents of the University of California. All rights reserved. |
---|
6 | * |
---|
7 | * Redistribution and use in source and binary forms, with or without |
---|
8 | * modification, are permitted provided that the following conditions |
---|
9 | * are met: |
---|
10 | * 1. Redistributions of source code must retain the above copyright |
---|
11 | * notice, this list of conditions and the following disclaimer. |
---|
12 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
13 | * notice, this list of conditions and the following disclaimer in the |
---|
14 | * documentation and/or other materials provided with the distribution. |
---|
15 | * 4. Neither the name of the University nor the names of its contributors |
---|
16 | * may be used to endorse or promote products derived from this software |
---|
17 | * without specific prior written permission. |
---|
18 | * |
---|
19 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
---|
20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
---|
23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
---|
24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
---|
25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
---|
26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
---|
27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
---|
28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
---|
29 | * SUCH DAMAGE. |
---|
30 | * |
---|
31 | * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 |
---|
32 | */ |
---|
33 | |
---|
34 | #include <sys/cdefs.h> |
---|
35 | __FBSDID("$FreeBSD$"); |
---|
36 | |
---|
37 | #include <rtems/bsd/local/opt_compat.h> |
---|
38 | #include <rtems/bsd/local/opt_inet.h> |
---|
39 | #include <rtems/bsd/local/opt_inet6.h> |
---|
40 | #include <rtems/bsd/local/opt_ipsec.h> |
---|
41 | #include <rtems/bsd/local/opt_tcpdebug.h> |
---|
42 | |
---|
43 | #include <rtems/bsd/sys/param.h> |
---|
44 | #include <sys/systm.h> |
---|
45 | #include <sys/callout.h> |
---|
46 | #include <sys/kernel.h> |
---|
47 | #include <sys/sysctl.h> |
---|
48 | #include <sys/jail.h> |
---|
49 | #include <sys/malloc.h> |
---|
50 | #include <sys/mbuf.h> |
---|
51 | #ifdef INET6 |
---|
52 | #include <sys/domain.h> |
---|
53 | #endif |
---|
54 | #include <sys/priv.h> |
---|
55 | #include <sys/proc.h> |
---|
56 | #include <sys/socket.h> |
---|
57 | #include <sys/socketvar.h> |
---|
58 | #include <sys/protosw.h> |
---|
59 | #include <sys/random.h> |
---|
60 | |
---|
61 | #include <vm/uma.h> |
---|
62 | |
---|
63 | #include <net/route.h> |
---|
64 | #include <net/if.h> |
---|
65 | #include <net/vnet.h> |
---|
66 | |
---|
67 | #include <netinet/in.h> |
---|
68 | #include <netinet/in_systm.h> |
---|
69 | #include <netinet/ip.h> |
---|
70 | #ifdef INET6 |
---|
71 | #include <netinet/ip6.h> |
---|
72 | #endif |
---|
73 | #include <netinet/in_pcb.h> |
---|
74 | #ifdef INET6 |
---|
75 | #include <netinet6/in6_pcb.h> |
---|
76 | #endif |
---|
77 | #include <netinet/in_var.h> |
---|
78 | #include <netinet/ip_var.h> |
---|
79 | #ifdef INET6 |
---|
80 | #include <netinet6/ip6_var.h> |
---|
81 | #include <netinet6/scope6_var.h> |
---|
82 | #include <netinet6/nd6.h> |
---|
83 | #endif |
---|
84 | #include <netinet/ip_icmp.h> |
---|
85 | #include <netinet/tcp.h> |
---|
86 | #include <netinet/tcp_fsm.h> |
---|
87 | #include <netinet/tcp_seq.h> |
---|
88 | #include <netinet/tcp_timer.h> |
---|
89 | #include <netinet/tcp_var.h> |
---|
90 | #include <netinet/tcp_syncache.h> |
---|
91 | #include <netinet/tcp_offload.h> |
---|
92 | #ifdef INET6 |
---|
93 | #include <netinet6/tcp6_var.h> |
---|
94 | #endif |
---|
95 | #include <netinet/tcpip.h> |
---|
96 | #ifdef TCPDEBUG |
---|
97 | #include <netinet/tcp_debug.h> |
---|
98 | #endif |
---|
99 | #include <netinet6/ip6protosw.h> |
---|
100 | |
---|
101 | #ifdef IPSEC |
---|
102 | #include <netipsec/ipsec.h> |
---|
103 | #include <netipsec/xform.h> |
---|
104 | #ifdef INET6 |
---|
105 | #include <netipsec/ipsec6.h> |
---|
106 | #endif |
---|
107 | #include <netipsec/key.h> |
---|
108 | #include <sys/syslog.h> |
---|
109 | #endif /*IPSEC*/ |
---|
110 | |
---|
111 | #include <machine/in_cksum.h> |
---|
112 | #include <sys/md5.h> |
---|
113 | |
---|
114 | #include <security/mac/mac_framework.h> |
---|
115 | |
---|
116 | VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; |
---|
117 | #ifdef INET6 |
---|
118 | VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; |
---|
119 | #endif |
---|
120 | |
---|
121 | static int |
---|
122 | sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) |
---|
123 | { |
---|
124 | int error, new; |
---|
125 | |
---|
126 | new = V_tcp_mssdflt; |
---|
127 | error = sysctl_handle_int(oidp, &new, 0, req); |
---|
128 | if (error == 0 && req->newptr) { |
---|
129 | if (new < TCP_MINMSS) |
---|
130 | error = EINVAL; |
---|
131 | else |
---|
132 | V_tcp_mssdflt = new; |
---|
133 | } |
---|
134 | return (error); |
---|
135 | } |
---|
136 | |
---|
137 | SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, |
---|
138 | CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, |
---|
139 | &sysctl_net_inet_tcp_mss_check, "I", |
---|
140 | "Default TCP Maximum Segment Size"); |
---|
141 | |
---|
142 | #ifdef INET6 |
---|
143 | static int |
---|
144 | sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) |
---|
145 | { |
---|
146 | int error, new; |
---|
147 | |
---|
148 | new = V_tcp_v6mssdflt; |
---|
149 | error = sysctl_handle_int(oidp, &new, 0, req); |
---|
150 | if (error == 0 && req->newptr) { |
---|
151 | if (new < TCP_MINMSS) |
---|
152 | error = EINVAL; |
---|
153 | else |
---|
154 | V_tcp_v6mssdflt = new; |
---|
155 | } |
---|
156 | return (error); |
---|
157 | } |
---|
158 | |
---|
159 | SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, |
---|
160 | CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, |
---|
161 | &sysctl_net_inet_tcp_mss_v6_check, "I", |
---|
162 | "Default TCP Maximum Segment Size for IPv6"); |
---|
163 | #endif |
---|
164 | |
---|
165 | static int |
---|
166 | vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) |
---|
167 | { |
---|
168 | |
---|
169 | VNET_SYSCTL_ARG(req, arg1); |
---|
170 | return (sysctl_msec_to_ticks(oidp, arg1, arg2, req)); |
---|
171 | } |
---|
172 | |
---|
173 | /* |
---|
174 | * Minimum MSS we accept and use. This prevents DoS attacks where |
---|
175 | * we are forced to a ridiculous low MSS like 20 and send hundreds |
---|
176 | * of packets instead of one. The effect scales with the available |
---|
177 | * bandwidth and quickly saturates the CPU and network interface |
---|
178 | * with packet generation and sending. Set to zero to disable MINMSS |
---|
179 | * checking. This setting prevents us from sending too small packets. |
---|
180 | */ |
---|
181 | VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; |
---|
182 | SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, |
---|
183 | &VNET_NAME(tcp_minmss), 0, |
---|
184 | "Minmum TCP Maximum Segment Size"); |
---|
185 | |
---|
186 | VNET_DEFINE(int, tcp_do_rfc1323) = 1; |
---|
187 | SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, |
---|
188 | &VNET_NAME(tcp_do_rfc1323), 0, |
---|
189 | "Enable rfc1323 (high performance TCP) extensions"); |
---|
190 | |
---|
191 | static int tcp_log_debug = 0; |
---|
192 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, |
---|
193 | &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); |
---|
194 | |
---|
195 | static int tcp_tcbhashsize = 0; |
---|
196 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, |
---|
197 | &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); |
---|
198 | |
---|
199 | static int do_tcpdrain = 1; |
---|
200 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, |
---|
201 | "Enable tcp_drain routine for extra help when low on mbufs"); |
---|
202 | |
---|
203 | SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, |
---|
204 | &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); |
---|
205 | |
---|
206 | static VNET_DEFINE(int, icmp_may_rst) = 1; |
---|
207 | #define V_icmp_may_rst VNET(icmp_may_rst) |
---|
208 | SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, |
---|
209 | &VNET_NAME(icmp_may_rst), 0, |
---|
210 | "Certain ICMP unreachable messages may abort connections in SYN_SENT"); |
---|
211 | |
---|
212 | static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; |
---|
213 | #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) |
---|
214 | SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, |
---|
215 | &VNET_NAME(tcp_isn_reseed_interval), 0, |
---|
216 | "Seconds between reseeding of ISN secret"); |
---|
217 | |
---|
218 | /* |
---|
219 | * TCP bandwidth limiting sysctls. Note that the default lower bound of |
---|
220 | * 1024 exists only for debugging. A good production default would be |
---|
221 | * something like 6100. |
---|
222 | */ |
---|
223 | SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, |
---|
224 | "TCP inflight data limiting"); |
---|
225 | |
---|
226 | static VNET_DEFINE(int, tcp_inflight_enable) = 0; |
---|
227 | #define V_tcp_inflight_enable VNET(tcp_inflight_enable) |
---|
228 | SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, |
---|
229 | &VNET_NAME(tcp_inflight_enable), 0, |
---|
230 | "Enable automatic TCP inflight data limiting"); |
---|
231 | |
---|
232 | static int tcp_inflight_debug = 0; |
---|
233 | SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, |
---|
234 | &tcp_inflight_debug, 0, |
---|
235 | "Debug TCP inflight calculations"); |
---|
236 | |
---|
237 | static VNET_DEFINE(int, tcp_inflight_rttthresh); |
---|
238 | #define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh) |
---|
239 | SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, |
---|
240 | CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0, |
---|
241 | vnet_sysctl_msec_to_ticks, "I", |
---|
242 | "RTT threshold below which inflight will deactivate itself"); |
---|
243 | |
---|
244 | static VNET_DEFINE(int, tcp_inflight_min) = 6144; |
---|
245 | #define V_tcp_inflight_min VNET(tcp_inflight_min) |
---|
246 | SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, |
---|
247 | &VNET_NAME(tcp_inflight_min), 0, |
---|
248 | "Lower-bound for TCP inflight window"); |
---|
249 | |
---|
250 | static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
---|
251 | #define V_tcp_inflight_max VNET(tcp_inflight_max) |
---|
252 | SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, |
---|
253 | &VNET_NAME(tcp_inflight_max), 0, |
---|
254 | "Upper-bound for TCP inflight window"); |
---|
255 | |
---|
256 | static VNET_DEFINE(int, tcp_inflight_stab) = 20; |
---|
257 | #define V_tcp_inflight_stab VNET(tcp_inflight_stab) |
---|
258 | SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, |
---|
259 | &VNET_NAME(tcp_inflight_stab), 0, |
---|
260 | "Inflight Algorithm Stabilization 20 = 2 packets"); |
---|
261 | |
---|
262 | VNET_DEFINE(uma_zone_t, sack_hole_zone); |
---|
263 | #define V_sack_hole_zone VNET(sack_hole_zone) |
---|
264 | |
---|
265 | static struct inpcb *tcp_notify(struct inpcb *, int); |
---|
266 | static void tcp_isn_tick(void *); |
---|
267 | static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, |
---|
268 | void *ip4hdr, const void *ip6hdr); |
---|
269 | |
---|
/*
 * Default number of buckets requested for the TCP PCB hash tables.
 * The value must be a power of two.
 *
 * It may be overridden at build time by predefining TCBHASHSIZE, and at
 * boot time through the kernel environment variable
 * net.inet.tcp.tcbhashsize.
 */
#if !defined(TCBHASHSIZE)
#define	TCBHASHSIZE	512
#endif
---|
279 | |
---|
280 | /* |
---|
281 | * XXX |
---|
282 | * Callouts should be moved into struct tcp directly. They are currently |
---|
283 | * separate because the tcpcb structure is exported to userland for sysctl |
---|
284 | * parsing purposes, which do not know about callouts. |
---|
285 | */ |
---|
286 | struct tcpcb_mem { |
---|
287 | struct tcpcb tcb; |
---|
288 | struct tcp_timer tt; |
---|
289 | }; |
---|
290 | |
---|
291 | static VNET_DEFINE(uma_zone_t, tcpcb_zone); |
---|
292 | #define V_tcpcb_zone VNET(tcpcb_zone) |
---|
293 | |
---|
294 | MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); |
---|
295 | struct callout isn_callout; |
---|
296 | static struct mtx isn_mtx; |
---|
297 | |
---|
298 | #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) |
---|
299 | #define ISN_LOCK() mtx_lock(&isn_mtx) |
---|
300 | #define ISN_UNLOCK() mtx_unlock(&isn_mtx) |
---|
301 | |
---|
302 | /* |
---|
303 | * TCP initialization. |
---|
304 | */ |
---|
305 | static void |
---|
306 | tcp_zone_change(void *tag) |
---|
307 | { |
---|
308 | |
---|
309 | uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); |
---|
310 | uma_zone_set_max(V_tcpcb_zone, maxsockets); |
---|
311 | tcp_tw_zone_change(); |
---|
312 | } |
---|
313 | |
---|
/*
 * UMA init hook for the "tcp_inpcb" zone (passed to uma_zcreate() in
 * tcp_init()): sets up the lock of the inpcb stored at mem.
 * size and flags are unused.  Always returns 0.
 */
static int
tcp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_INIT(pcb, "inp", "tcpinp");

	return (0);
}
---|
322 | |
---|
323 | void |
---|
324 | tcp_init(void) |
---|
325 | { |
---|
326 | int hashsize; |
---|
327 | |
---|
328 | INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); |
---|
329 | LIST_INIT(&V_tcb); |
---|
330 | #ifdef VIMAGE |
---|
331 | V_tcbinfo.ipi_vnet = curvnet; |
---|
332 | #endif |
---|
333 | V_tcbinfo.ipi_listhead = &V_tcb; |
---|
334 | hashsize = TCBHASHSIZE; |
---|
335 | TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); |
---|
336 | if (!powerof2(hashsize)) { |
---|
337 | printf("WARNING: TCB hash size not a power of 2\n"); |
---|
338 | hashsize = 512; /* safe default */ |
---|
339 | } |
---|
340 | V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, |
---|
341 | &V_tcbinfo.ipi_hashmask); |
---|
342 | V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, |
---|
343 | &V_tcbinfo.ipi_porthashmask); |
---|
344 | V_tcbinfo.ipi_zone = uma_zcreate("tcp_inpcb", sizeof(struct inpcb), |
---|
345 | NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); |
---|
346 | uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); |
---|
347 | V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; |
---|
348 | |
---|
349 | /* |
---|
350 | * These have to be type stable for the benefit of the timers. |
---|
351 | */ |
---|
352 | V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), |
---|
353 | NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); |
---|
354 | uma_zone_set_max(V_tcpcb_zone, maxsockets); |
---|
355 | |
---|
356 | tcp_tw_init(); |
---|
357 | syncache_init(); |
---|
358 | tcp_hc_init(); |
---|
359 | tcp_reass_init(); |
---|
360 | |
---|
361 | TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); |
---|
362 | V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), |
---|
363 | NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); |
---|
364 | |
---|
365 | /* Skip initialization of globals for non-default instances. */ |
---|
366 | if (!IS_DEFAULT_VNET(curvnet)) |
---|
367 | return; |
---|
368 | |
---|
369 | /* XXX virtualize those bellow? */ |
---|
370 | tcp_delacktime = TCPTV_DELACK; |
---|
371 | tcp_keepinit = TCPTV_KEEP_INIT; |
---|
372 | tcp_keepidle = TCPTV_KEEP_IDLE; |
---|
373 | tcp_keepintvl = TCPTV_KEEPINTVL; |
---|
374 | tcp_maxpersistidle = TCPTV_KEEP_IDLE; |
---|
375 | tcp_msl = TCPTV_MSL; |
---|
376 | tcp_rexmit_min = TCPTV_MIN; |
---|
377 | if (tcp_rexmit_min < 1) |
---|
378 | tcp_rexmit_min = 1; |
---|
379 | tcp_rexmit_slop = TCPTV_CPU_VAR; |
---|
380 | tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; |
---|
381 | tcp_tcbhashsize = hashsize; |
---|
382 | |
---|
383 | #ifdef INET6 |
---|
384 | #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) |
---|
385 | #else /* INET6 */ |
---|
386 | #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) |
---|
387 | #endif /* INET6 */ |
---|
388 | if (max_protohdr < TCP_MINPROTOHDR) |
---|
389 | max_protohdr = TCP_MINPROTOHDR; |
---|
390 | if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) |
---|
391 | panic("tcp_init"); |
---|
392 | #undef TCP_MINPROTOHDR |
---|
393 | |
---|
394 | ISN_LOCK_INIT(); |
---|
395 | callout_init(&isn_callout, CALLOUT_MPSAFE); |
---|
396 | callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); |
---|
397 | EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, |
---|
398 | SHUTDOWN_PRI_DEFAULT); |
---|
399 | EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, |
---|
400 | EVENTHANDLER_PRI_ANY); |
---|
401 | } |
---|
402 | |
---|
403 | #ifdef VIMAGE |
---|
404 | void |
---|
405 | tcp_destroy(void) |
---|
406 | { |
---|
407 | |
---|
408 | tcp_reass_destroy(); |
---|
409 | tcp_hc_destroy(); |
---|
410 | syncache_destroy(); |
---|
411 | tcp_tw_destroy(); |
---|
412 | |
---|
413 | /* XXX check that hashes are empty! */ |
---|
414 | hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, |
---|
415 | V_tcbinfo.ipi_hashmask); |
---|
416 | hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, |
---|
417 | V_tcbinfo.ipi_porthashmask); |
---|
418 | |
---|
419 | uma_zdestroy(V_sack_hole_zone); |
---|
420 | uma_zdestroy(V_tcpcb_zone); |
---|
421 | uma_zdestroy(V_tcbinfo.ipi_zone); |
---|
422 | |
---|
423 | INP_INFO_LOCK_DESTROY(&V_tcbinfo); |
---|
424 | } |
---|
425 | #endif |
---|
426 | |
---|
427 | void |
---|
428 | tcp_fini(void *xtp) |
---|
429 | { |
---|
430 | |
---|
431 | callout_stop(&isn_callout); |
---|
432 | } |
---|
433 | |
---|
434 | /* |
---|
435 | * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. |
---|
436 | * tcp_template used to store this data in mbufs, but we now recopy it out |
---|
437 | * of the tcpcb each time to conserve mbufs. |
---|
438 | */ |
---|
439 | void |
---|
440 | tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) |
---|
441 | { |
---|
442 | struct tcphdr *th = (struct tcphdr *)tcp_ptr; |
---|
443 | |
---|
444 | INP_WLOCK_ASSERT(inp); |
---|
445 | |
---|
446 | #ifdef INET6 |
---|
447 | if ((inp->inp_vflag & INP_IPV6) != 0) { |
---|
448 | struct ip6_hdr *ip6; |
---|
449 | |
---|
450 | ip6 = (struct ip6_hdr *)ip_ptr; |
---|
451 | ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | |
---|
452 | (inp->inp_flow & IPV6_FLOWINFO_MASK); |
---|
453 | ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | |
---|
454 | (IPV6_VERSION & IPV6_VERSION_MASK); |
---|
455 | ip6->ip6_nxt = IPPROTO_TCP; |
---|
456 | ip6->ip6_plen = htons(sizeof(struct tcphdr)); |
---|
457 | ip6->ip6_src = inp->in6p_laddr; |
---|
458 | ip6->ip6_dst = inp->in6p_faddr; |
---|
459 | } else |
---|
460 | #endif |
---|
461 | { |
---|
462 | struct ip *ip; |
---|
463 | |
---|
464 | ip = (struct ip *)ip_ptr; |
---|
465 | ip->ip_v = IPVERSION; |
---|
466 | ip->ip_hl = 5; |
---|
467 | ip->ip_tos = inp->inp_ip_tos; |
---|
468 | ip->ip_len = 0; |
---|
469 | ip->ip_id = 0; |
---|
470 | ip->ip_off = 0; |
---|
471 | ip->ip_ttl = inp->inp_ip_ttl; |
---|
472 | ip->ip_sum = 0; |
---|
473 | ip->ip_p = IPPROTO_TCP; |
---|
474 | ip->ip_src = inp->inp_laddr; |
---|
475 | ip->ip_dst = inp->inp_faddr; |
---|
476 | } |
---|
477 | th->th_sport = inp->inp_lport; |
---|
478 | th->th_dport = inp->inp_fport; |
---|
479 | th->th_seq = 0; |
---|
480 | th->th_ack = 0; |
---|
481 | th->th_x2 = 0; |
---|
482 | th->th_off = 5; |
---|
483 | th->th_flags = 0; |
---|
484 | th->th_win = 0; |
---|
485 | th->th_urp = 0; |
---|
486 | th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ |
---|
487 | } |
---|
488 | |
---|
489 | /* |
---|
490 | * Create template to be used to send tcp packets on a connection. |
---|
491 | * Allocates an mbuf and fills in a skeletal tcp/ip header. The only |
---|
492 | * use for this function is in keepalives, which use tcp_respond. |
---|
493 | */ |
---|
494 | struct tcptemp * |
---|
495 | tcpip_maketemplate(struct inpcb *inp) |
---|
496 | { |
---|
497 | struct tcptemp *t; |
---|
498 | |
---|
499 | t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); |
---|
500 | if (t == NULL) |
---|
501 | return (NULL); |
---|
502 | tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); |
---|
503 | return (t); |
---|
504 | } |
---|
505 | |
---|
506 | /* |
---|
507 | * Send a single message to the TCP at address specified by |
---|
508 | * the given TCP/IP header. If m == NULL, then we make a copy |
---|
509 | * of the tcpiphdr at ti and send directly to the addressed host. |
---|
510 | * This is used to force keep alive messages out using the TCP |
---|
511 | * template for a connection. If flags are given then we send |
---|
512 | * a message back to the TCP which originated the * segment ti, |
---|
513 | * and discard the mbuf containing it and any other attached mbufs. |
---|
514 | * |
---|
515 | * In any case the ack and sequence number of the transmitted |
---|
516 | * segment are as specified by the parameters. |
---|
517 | * |
---|
518 | * NOTE: If m != NULL, then ti must point to *inside* the mbuf. |
---|
519 | */ |
---|
520 | void |
---|
521 | tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, |
---|
522 | tcp_seq ack, tcp_seq seq, int flags) |
---|
523 | { |
---|
524 | int tlen; |
---|
525 | int win = 0; |
---|
526 | struct ip *ip; |
---|
527 | struct tcphdr *nth; |
---|
528 | #ifdef INET6 |
---|
529 | struct ip6_hdr *ip6; |
---|
530 | int isipv6; |
---|
531 | #endif /* INET6 */ |
---|
532 | int ipflags = 0; |
---|
533 | struct inpcb *inp; |
---|
534 | |
---|
535 | KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); |
---|
536 | |
---|
537 | #ifdef INET6 |
---|
538 | isipv6 = ((struct ip *)ipgen)->ip_v == 6; |
---|
539 | ip6 = ipgen; |
---|
540 | #endif /* INET6 */ |
---|
541 | ip = ipgen; |
---|
542 | |
---|
543 | if (tp != NULL) { |
---|
544 | inp = tp->t_inpcb; |
---|
545 | KASSERT(inp != NULL, ("tcp control block w/o inpcb")); |
---|
546 | INP_WLOCK_ASSERT(inp); |
---|
547 | } else |
---|
548 | inp = NULL; |
---|
549 | |
---|
550 | if (tp != NULL) { |
---|
551 | if (!(flags & TH_RST)) { |
---|
552 | win = sbspace(&inp->inp_socket->so_rcv); |
---|
553 | if (win > (long)TCP_MAXWIN << tp->rcv_scale) |
---|
554 | win = (long)TCP_MAXWIN << tp->rcv_scale; |
---|
555 | } |
---|
556 | } |
---|
557 | if (m == NULL) { |
---|
558 | m = m_gethdr(M_DONTWAIT, MT_DATA); |
---|
559 | if (m == NULL) |
---|
560 | return; |
---|
561 | tlen = 0; |
---|
562 | m->m_data += max_linkhdr; |
---|
563 | #ifdef INET6 |
---|
564 | if (isipv6) { |
---|
565 | bcopy((caddr_t)ip6, mtod(m, caddr_t), |
---|
566 | sizeof(struct ip6_hdr)); |
---|
567 | ip6 = mtod(m, struct ip6_hdr *); |
---|
568 | nth = (struct tcphdr *)(ip6 + 1); |
---|
569 | } else |
---|
570 | #endif /* INET6 */ |
---|
571 | { |
---|
572 | bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); |
---|
573 | ip = mtod(m, struct ip *); |
---|
574 | nth = (struct tcphdr *)(ip + 1); |
---|
575 | } |
---|
576 | bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); |
---|
577 | flags = TH_ACK; |
---|
578 | } else { |
---|
579 | /* |
---|
580 | * reuse the mbuf. |
---|
581 | * XXX MRT We inherrit the FIB, which is lucky. |
---|
582 | */ |
---|
583 | m_freem(m->m_next); |
---|
584 | m->m_next = NULL; |
---|
585 | m->m_data = (caddr_t)ipgen; |
---|
586 | /* m_len is set later */ |
---|
587 | tlen = 0; |
---|
588 | #define xchg(a,b,type) { type t; t=a; a=b; b=t; } |
---|
589 | #ifdef INET6 |
---|
590 | if (isipv6) { |
---|
591 | xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); |
---|
592 | nth = (struct tcphdr *)(ip6 + 1); |
---|
593 | } else |
---|
594 | #endif /* INET6 */ |
---|
595 | { |
---|
596 | xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); |
---|
597 | nth = (struct tcphdr *)(ip + 1); |
---|
598 | } |
---|
599 | if (th != nth) { |
---|
600 | /* |
---|
601 | * this is usually a case when an extension header |
---|
602 | * exists between the IPv6 header and the |
---|
603 | * TCP header. |
---|
604 | */ |
---|
605 | nth->th_sport = th->th_sport; |
---|
606 | nth->th_dport = th->th_dport; |
---|
607 | } |
---|
608 | xchg(nth->th_dport, nth->th_sport, uint16_t); |
---|
609 | #undef xchg |
---|
610 | } |
---|
611 | #ifdef INET6 |
---|
612 | if (isipv6) { |
---|
613 | ip6->ip6_flow = 0; |
---|
614 | ip6->ip6_vfc = IPV6_VERSION; |
---|
615 | ip6->ip6_nxt = IPPROTO_TCP; |
---|
616 | ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + |
---|
617 | tlen)); |
---|
618 | tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); |
---|
619 | } else |
---|
620 | #endif |
---|
621 | { |
---|
622 | tlen += sizeof (struct tcpiphdr); |
---|
623 | ip->ip_len = tlen; |
---|
624 | ip->ip_ttl = V_ip_defttl; |
---|
625 | if (V_path_mtu_discovery) |
---|
626 | ip->ip_off |= IP_DF; |
---|
627 | } |
---|
628 | m->m_len = tlen; |
---|
629 | m->m_pkthdr.len = tlen; |
---|
630 | m->m_pkthdr.rcvif = NULL; |
---|
631 | #ifdef MAC |
---|
632 | if (inp != NULL) { |
---|
633 | /* |
---|
634 | * Packet is associated with a socket, so allow the |
---|
635 | * label of the response to reflect the socket label. |
---|
636 | */ |
---|
637 | INP_WLOCK_ASSERT(inp); |
---|
638 | mac_inpcb_create_mbuf(inp, m); |
---|
639 | } else { |
---|
640 | /* |
---|
641 | * Packet is not associated with a socket, so possibly |
---|
642 | * update the label in place. |
---|
643 | */ |
---|
644 | mac_netinet_tcp_reply(m); |
---|
645 | } |
---|
646 | #endif |
---|
647 | nth->th_seq = htonl(seq); |
---|
648 | nth->th_ack = htonl(ack); |
---|
649 | nth->th_x2 = 0; |
---|
650 | nth->th_off = sizeof (struct tcphdr) >> 2; |
---|
651 | nth->th_flags = flags; |
---|
652 | if (tp != NULL) |
---|
653 | nth->th_win = htons((u_short) (win >> tp->rcv_scale)); |
---|
654 | else |
---|
655 | nth->th_win = htons((u_short)win); |
---|
656 | nth->th_urp = 0; |
---|
657 | #ifdef INET6 |
---|
658 | if (isipv6) { |
---|
659 | nth->th_sum = 0; |
---|
660 | nth->th_sum = in6_cksum(m, IPPROTO_TCP, |
---|
661 | sizeof(struct ip6_hdr), |
---|
662 | tlen - sizeof(struct ip6_hdr)); |
---|
663 | ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : |
---|
664 | NULL, NULL); |
---|
665 | } else |
---|
666 | #endif /* INET6 */ |
---|
667 | { |
---|
668 | nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, |
---|
669 | htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); |
---|
670 | m->m_pkthdr.csum_flags = CSUM_TCP; |
---|
671 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); |
---|
672 | } |
---|
673 | #ifdef TCPDEBUG |
---|
674 | if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) |
---|
675 | tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); |
---|
676 | #endif |
---|
677 | #ifdef INET6 |
---|
678 | if (isipv6) |
---|
679 | (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); |
---|
680 | else |
---|
681 | #endif /* INET6 */ |
---|
682 | (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); |
---|
683 | } |
---|
684 | |
---|
685 | /* |
---|
686 | * Create a new TCP control block, making an |
---|
687 | * empty reassembly queue and hooking it to the argument |
---|
688 | * protocol control block. The `inp' parameter must have |
---|
689 | * come from the zone allocator set up in tcp_init(). |
---|
690 | */ |
---|
691 | struct tcpcb * |
---|
692 | tcp_newtcpcb(struct inpcb *inp) |
---|
693 | { |
---|
694 | struct tcpcb_mem *tm; |
---|
695 | struct tcpcb *tp; |
---|
696 | #ifdef INET6 |
---|
697 | int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; |
---|
698 | #endif /* INET6 */ |
---|
699 | |
---|
700 | tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); |
---|
701 | if (tm == NULL) |
---|
702 | return (NULL); |
---|
703 | tp = &tm->tcb; |
---|
704 | #ifdef VIMAGE |
---|
705 | tp->t_vnet = inp->inp_vnet; |
---|
706 | #endif |
---|
707 | tp->t_timers = &tm->tt; |
---|
708 | /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ |
---|
709 | tp->t_maxseg = tp->t_maxopd = |
---|
710 | #ifdef INET6 |
---|
711 | isipv6 ? V_tcp_v6mssdflt : |
---|
712 | #endif /* INET6 */ |
---|
713 | V_tcp_mssdflt; |
---|
714 | |
---|
715 | /* Set up our timeouts. */ |
---|
716 | callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); |
---|
717 | callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); |
---|
718 | callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); |
---|
719 | callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); |
---|
720 | callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); |
---|
721 | |
---|
722 | if (V_tcp_do_rfc1323) |
---|
723 | tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); |
---|
724 | if (V_tcp_do_sack) |
---|
725 | tp->t_flags |= TF_SACK_PERMIT; |
---|
726 | TAILQ_INIT(&tp->snd_holes); |
---|
727 | tp->t_inpcb = inp; /* XXX */ |
---|
728 | /* |
---|
729 | * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no |
---|
730 | * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives |
---|
731 | * reasonable initial retransmit time. |
---|
732 | */ |
---|
733 | tp->t_srtt = TCPTV_SRTTBASE; |
---|
734 | tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; |
---|
735 | tp->t_rttmin = tcp_rexmit_min; |
---|
736 | tp->t_rxtcur = TCPTV_RTOBASE; |
---|
737 | tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
---|
738 | tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
---|
739 | tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
---|
740 | tp->t_rcvtime = ticks; |
---|
741 | tp->t_bw_rtttime = ticks; |
---|
742 | /* |
---|
743 | * IPv4 TTL initialization is necessary for an IPv6 socket as well, |
---|
744 | * because the socket may be bound to an IPv6 wildcard address, |
---|
745 | * which may match an IPv4-mapped IPv6 address. |
---|
746 | */ |
---|
747 | inp->inp_ip_ttl = V_ip_defttl; |
---|
748 | inp->inp_ppcb = tp; |
---|
749 | return (tp); /* XXX */ |
---|
750 | } |
---|
751 | |
---|
752 | /* |
---|
753 | * Drop a TCP connection, reporting |
---|
754 | * the specified error. If connection is synchronized, |
---|
755 | * then send a RST to peer. |
---|
756 | */ |
---|
757 | struct tcpcb * |
---|
758 | tcp_drop(struct tcpcb *tp, int errno) |
---|
759 | { |
---|
760 | struct socket *so = tp->t_inpcb->inp_socket; |
---|
761 | |
---|
762 | INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
---|
763 | INP_WLOCK_ASSERT(tp->t_inpcb); |
---|
764 | |
---|
765 | if (TCPS_HAVERCVDSYN(tp->t_state)) { |
---|
766 | tp->t_state = TCPS_CLOSED; |
---|
767 | (void) tcp_output_reset(tp); |
---|
768 | TCPSTAT_INC(tcps_drops); |
---|
769 | } else |
---|
770 | TCPSTAT_INC(tcps_conndrops); |
---|
771 | if (errno == ETIMEDOUT && tp->t_softerror) |
---|
772 | errno = tp->t_softerror; |
---|
773 | so->so_error = errno; |
---|
774 | return (tcp_close(tp)); |
---|
775 | } |
---|
776 | |
---|
777 | void |
---|
778 | tcp_discardcb(struct tcpcb *tp) |
---|
779 | { |
---|
780 | struct inpcb *inp = tp->t_inpcb; |
---|
781 | struct socket *so = inp->inp_socket; |
---|
782 | #ifdef INET6 |
---|
783 | int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; |
---|
784 | #endif /* INET6 */ |
---|
785 | |
---|
786 | INP_WLOCK_ASSERT(inp); |
---|
787 | |
---|
788 | /* |
---|
789 | * Make sure that all of our timers are stopped before we |
---|
790 | * delete the PCB. |
---|
791 | */ |
---|
792 | callout_stop(&tp->t_timers->tt_rexmt); |
---|
793 | callout_stop(&tp->t_timers->tt_persist); |
---|
794 | callout_stop(&tp->t_timers->tt_keep); |
---|
795 | callout_stop(&tp->t_timers->tt_2msl); |
---|
796 | callout_stop(&tp->t_timers->tt_delack); |
---|
797 | |
---|
798 | /* |
---|
799 | * If we got enough samples through the srtt filter, |
---|
800 | * save the rtt and rttvar in the routing entry. |
---|
801 | * 'Enough' is arbitrarily defined as 4 rtt samples. |
---|
802 | * 4 samples is enough for the srtt filter to converge |
---|
803 | * to within enough % of the correct value; fewer samples |
---|
804 | * and we could save a bogus rtt. The danger is not high |
---|
805 | * as tcp quickly recovers from everything. |
---|
806 | * XXX: Works very well but needs some more statistics! |
---|
807 | */ |
---|
808 | if (tp->t_rttupdated >= 4) { |
---|
809 | struct hc_metrics_lite metrics; |
---|
810 | u_long ssthresh; |
---|
811 | |
---|
812 | bzero(&metrics, sizeof(metrics)); |
---|
813 | /* |
---|
814 | * Update the ssthresh always when the conditions below |
---|
815 | * are satisfied. This gives us better new start value |
---|
816 | * for the congestion avoidance for new connections. |
---|
817 | * ssthresh is only set if packet loss occured on a session. |
---|
818 | * |
---|
819 | * XXXRW: 'so' may be NULL here, and/or socket buffer may be |
---|
820 | * being torn down. Ideally this code would not use 'so'. |
---|
821 | */ |
---|
822 | ssthresh = tp->snd_ssthresh; |
---|
823 | if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { |
---|
824 | /* |
---|
825 | * convert the limit from user data bytes to |
---|
826 | * packets then to packet data bytes. |
---|
827 | */ |
---|
828 | ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; |
---|
829 | if (ssthresh < 2) |
---|
830 | ssthresh = 2; |
---|
831 | ssthresh *= (u_long)(tp->t_maxseg + |
---|
832 | #ifdef INET6 |
---|
833 | (isipv6 ? sizeof (struct ip6_hdr) + |
---|
834 | sizeof (struct tcphdr) : |
---|
835 | #endif |
---|
836 | sizeof (struct tcpiphdr) |
---|
837 | #ifdef INET6 |
---|
838 | ) |
---|
839 | #endif |
---|
840 | ); |
---|
841 | } else |
---|
842 | ssthresh = 0; |
---|
843 | metrics.rmx_ssthresh = ssthresh; |
---|
844 | |
---|
845 | metrics.rmx_rtt = tp->t_srtt; |
---|
846 | metrics.rmx_rttvar = tp->t_rttvar; |
---|
847 | /* XXX: This wraps if the pipe is more than 4 Gbit per second */ |
---|
848 | metrics.rmx_bandwidth = tp->snd_bandwidth; |
---|
849 | metrics.rmx_cwnd = tp->snd_cwnd; |
---|
850 | metrics.rmx_sendpipe = 0; |
---|
851 | metrics.rmx_recvpipe = 0; |
---|
852 | |
---|
853 | tcp_hc_update(&inp->inp_inc, &metrics); |
---|
854 | } |
---|
855 | |
---|
856 | /* free the reassembly queue, if any */ |
---|
857 | tcp_reass_flush(tp); |
---|
858 | /* Disconnect offload device, if any. */ |
---|
859 | tcp_offload_detach(tp); |
---|
860 | |
---|
861 | tcp_free_sackholes(tp); |
---|
862 | inp->inp_ppcb = NULL; |
---|
863 | tp->t_inpcb = NULL; |
---|
864 | uma_zfree(V_tcpcb_zone, tp); |
---|
865 | } |
---|
866 | |
---|
867 | /* |
---|
868 | * Attempt to close a TCP control block, marking it as dropped, and freeing |
---|
869 | * the socket if we hold the only reference. |
---|
870 | */ |
---|
871 | struct tcpcb * |
---|
872 | tcp_close(struct tcpcb *tp) |
---|
873 | { |
---|
874 | struct inpcb *inp = tp->t_inpcb; |
---|
875 | struct socket *so; |
---|
876 | |
---|
877 | INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
---|
878 | INP_WLOCK_ASSERT(inp); |
---|
879 | |
---|
880 | /* Notify any offload devices of listener close */ |
---|
881 | if (tp->t_state == TCPS_LISTEN) |
---|
882 | tcp_offload_listen_close(tp); |
---|
883 | in_pcbdrop(inp); |
---|
884 | TCPSTAT_INC(tcps_closed); |
---|
885 | KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); |
---|
886 | so = inp->inp_socket; |
---|
887 | soisdisconnected(so); |
---|
888 | if (inp->inp_flags & INP_SOCKREF) { |
---|
889 | KASSERT(so->so_state & SS_PROTOREF, |
---|
890 | ("tcp_close: !SS_PROTOREF")); |
---|
891 | inp->inp_flags &= ~INP_SOCKREF; |
---|
892 | INP_WUNLOCK(inp); |
---|
893 | ACCEPT_LOCK(); |
---|
894 | SOCK_LOCK(so); |
---|
895 | so->so_state &= ~SS_PROTOREF; |
---|
896 | sofree(so); |
---|
897 | return (NULL); |
---|
898 | } |
---|
899 | return (tp); |
---|
900 | } |
---|
901 | |
---|
902 | void |
---|
903 | tcp_drain(void) |
---|
904 | { |
---|
905 | VNET_ITERATOR_DECL(vnet_iter); |
---|
906 | |
---|
907 | if (!do_tcpdrain) |
---|
908 | return; |
---|
909 | |
---|
910 | VNET_LIST_RLOCK_NOSLEEP(); |
---|
911 | VNET_FOREACH(vnet_iter) { |
---|
912 | CURVNET_SET(vnet_iter); |
---|
913 | struct inpcb *inpb; |
---|
914 | struct tcpcb *tcpb; |
---|
915 | |
---|
916 | /* |
---|
917 | * Walk the tcpbs, if existing, and flush the reassembly queue, |
---|
918 | * if there is one... |
---|
919 | * XXX: The "Net/3" implementation doesn't imply that the TCP |
---|
920 | * reassembly queue should be flushed, but in a situation |
---|
921 | * where we're really low on mbufs, this is potentially |
---|
922 | * usefull. |
---|
923 | */ |
---|
924 | INP_INFO_RLOCK(&V_tcbinfo); |
---|
925 | LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { |
---|
926 | if (inpb->inp_flags & INP_TIMEWAIT) |
---|
927 | continue; |
---|
928 | INP_WLOCK(inpb); |
---|
929 | if ((tcpb = intotcpcb(inpb)) != NULL) { |
---|
930 | tcp_reass_flush(tcpb); |
---|
931 | tcp_clean_sackreport(tcpb); |
---|
932 | } |
---|
933 | INP_WUNLOCK(inpb); |
---|
934 | } |
---|
935 | INP_INFO_RUNLOCK(&V_tcbinfo); |
---|
936 | CURVNET_RESTORE(); |
---|
937 | } |
---|
938 | VNET_LIST_RUNLOCK_NOSLEEP(); |
---|
939 | } |
---|
940 | |
---|
941 | /* |
---|
942 | * Notify a tcp user of an asynchronous error; |
---|
943 | * store error as soft error, but wake up user |
---|
944 | * (for now, won't do anything until can select for soft error). |
---|
945 | * |
---|
946 | * Do not wake up user since there currently is no mechanism for |
---|
947 | * reporting soft errors (yet - a kqueue filter may be added). |
---|
948 | */ |
---|
949 | static struct inpcb * |
---|
950 | tcp_notify(struct inpcb *inp, int error) |
---|
951 | { |
---|
952 | struct tcpcb *tp; |
---|
953 | |
---|
954 | INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
---|
955 | INP_WLOCK_ASSERT(inp); |
---|
956 | |
---|
957 | if ((inp->inp_flags & INP_TIMEWAIT) || |
---|
958 | (inp->inp_flags & INP_DROPPED)) |
---|
959 | return (inp); |
---|
960 | |
---|
961 | tp = intotcpcb(inp); |
---|
962 | KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); |
---|
963 | |
---|
964 | /* |
---|
965 | * Ignore some errors if we are hooked up. |
---|
966 | * If connection hasn't completed, has retransmitted several times, |
---|
967 | * and receives a second error, give up now. This is better |
---|
968 | * than waiting a long time to establish a connection that |
---|
969 | * can never complete. |
---|
970 | */ |
---|
971 | if (tp->t_state == TCPS_ESTABLISHED && |
---|
972 | (error == EHOSTUNREACH || error == ENETUNREACH || |
---|
973 | error == EHOSTDOWN)) { |
---|
974 | return (inp); |
---|
975 | } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && |
---|
976 | tp->t_softerror) { |
---|
977 | tp = tcp_drop(tp, error); |
---|
978 | if (tp != NULL) |
---|
979 | return (inp); |
---|
980 | else |
---|
981 | return (NULL); |
---|
982 | } else { |
---|
983 | tp->t_softerror = error; |
---|
984 | return (inp); |
---|
985 | } |
---|
986 | #if 0 |
---|
987 | wakeup( &so->so_timeo); |
---|
988 | sorwakeup(so); |
---|
989 | sowwakeup(so); |
---|
990 | #endif |
---|
991 | } |
---|
992 | |
---|
993 | static int |
---|
994 | tcp_pcblist(SYSCTL_HANDLER_ARGS) |
---|
995 | { |
---|
996 | int error, i, m, n, pcb_count; |
---|
997 | struct inpcb *inp, **inp_list; |
---|
998 | inp_gen_t gencnt; |
---|
999 | struct xinpgen xig; |
---|
1000 | |
---|
1001 | /* |
---|
1002 | * The process of preparing the TCB list is too time-consuming and |
---|
1003 | * resource-intensive to repeat twice on every request. |
---|
1004 | */ |
---|
1005 | if (req->oldptr == NULL) { |
---|
1006 | n = V_tcbinfo.ipi_count + syncache_pcbcount(); |
---|
1007 | n += imax(n / 8, 10); |
---|
1008 | req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); |
---|
1009 | return (0); |
---|
1010 | } |
---|
1011 | |
---|
1012 | if (req->newptr != NULL) |
---|
1013 | return (EPERM); |
---|
1014 | |
---|
1015 | /* |
---|
1016 | * OK, now we're committed to doing something. |
---|
1017 | */ |
---|
1018 | INP_INFO_RLOCK(&V_tcbinfo); |
---|
1019 | gencnt = V_tcbinfo.ipi_gencnt; |
---|
1020 | n = V_tcbinfo.ipi_count; |
---|
1021 | INP_INFO_RUNLOCK(&V_tcbinfo); |
---|
1022 | |
---|
1023 | m = syncache_pcbcount(); |
---|
1024 | |
---|
1025 | error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) |
---|
1026 | + (n + m) * sizeof(struct xtcpcb)); |
---|
1027 | if (error != 0) |
---|
1028 | return (error); |
---|
1029 | |
---|
1030 | xig.xig_len = sizeof xig; |
---|
1031 | xig.xig_count = n + m; |
---|
1032 | xig.xig_gen = gencnt; |
---|
1033 | xig.xig_sogen = so_gencnt; |
---|
1034 | error = SYSCTL_OUT(req, &xig, sizeof xig); |
---|
1035 | if (error) |
---|
1036 | return (error); |
---|
1037 | |
---|
1038 | error = syncache_pcblist(req, m, &pcb_count); |
---|
1039 | if (error) |
---|
1040 | return (error); |
---|
1041 | |
---|
1042 | inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); |
---|
1043 | if (inp_list == NULL) |
---|
1044 | return (ENOMEM); |
---|
1045 | |
---|
1046 | INP_INFO_RLOCK(&V_tcbinfo); |
---|
1047 | for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; |
---|
1048 | inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { |
---|
1049 | INP_WLOCK(inp); |
---|
1050 | if (inp->inp_gencnt <= gencnt) { |
---|
1051 | /* |
---|
1052 | * XXX: This use of cr_cansee(), introduced with |
---|
1053 | * TCP state changes, is not quite right, but for |
---|
1054 | * now, better than nothing. |
---|
1055 | */ |
---|
1056 | if (inp->inp_flags & INP_TIMEWAIT) { |
---|
1057 | if (intotw(inp) != NULL) |
---|
1058 | error = cr_cansee(req->td->td_ucred, |
---|
1059 | intotw(inp)->tw_cred); |
---|
1060 | else |
---|
1061 | error = EINVAL; /* Skip this inp. */ |
---|
1062 | } else |
---|
1063 | error = cr_canseeinpcb(req->td->td_ucred, inp); |
---|
1064 | if (error == 0) { |
---|
1065 | in_pcbref(inp); |
---|
1066 | inp_list[i++] = inp; |
---|
1067 | } |
---|
1068 | } |
---|
1069 | INP_WUNLOCK(inp); |
---|
1070 | } |
---|
1071 | INP_INFO_RUNLOCK(&V_tcbinfo); |
---|
1072 | n = i; |
---|
1073 | |
---|
1074 | error = 0; |
---|
1075 | for (i = 0; i < n; i++) { |
---|
1076 | inp = inp_list[i]; |
---|
1077 | INP_RLOCK(inp); |
---|
1078 | if (inp->inp_gencnt <= gencnt) { |
---|
1079 | struct xtcpcb xt; |
---|
1080 | void *inp_ppcb; |
---|
1081 | |
---|
1082 | bzero(&xt, sizeof(xt)); |
---|
1083 | xt.xt_len = sizeof xt; |
---|
1084 | /* XXX should avoid extra copy */ |
---|
1085 | bcopy(inp, &xt.xt_inp, sizeof *inp); |
---|
1086 | inp_ppcb = inp->inp_ppcb; |
---|
1087 | if (inp_ppcb == NULL) |
---|
1088 | bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); |
---|
1089 | else if (inp->inp_flags & INP_TIMEWAIT) { |
---|
1090 | bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); |
---|
1091 | xt.xt_tp.t_state = TCPS_TIME_WAIT; |
---|
1092 | } else |
---|
1093 | bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); |
---|
1094 | if (inp->inp_socket != NULL) |
---|
1095 | sotoxsocket(inp->inp_socket, &xt.xt_socket); |
---|
1096 | else { |
---|
1097 | bzero(&xt.xt_socket, sizeof xt.xt_socket); |
---|
1098 | xt.xt_socket.xso_protocol = IPPROTO_TCP; |
---|
1099 | } |
---|
1100 | xt.xt_inp.inp_gencnt = inp->inp_gencnt; |
---|
1101 | INP_RUNLOCK(inp); |
---|
1102 | error = SYSCTL_OUT(req, &xt, sizeof xt); |
---|
1103 | } else |
---|
1104 | INP_RUNLOCK(inp); |
---|
1105 | } |
---|
1106 | INP_INFO_WLOCK(&V_tcbinfo); |
---|
1107 | for (i = 0; i < n; i++) { |
---|
1108 | inp = inp_list[i]; |
---|
1109 | INP_WLOCK(inp); |
---|
1110 | if (!in_pcbrele(inp)) |
---|
1111 | INP_WUNLOCK(inp); |
---|
1112 | } |
---|
1113 | INP_INFO_WUNLOCK(&V_tcbinfo); |
---|
1114 | |
---|
1115 | if (!error) { |
---|
1116 | /* |
---|
1117 | * Give the user an updated idea of our state. |
---|
1118 | * If the generation differs from what we told |
---|
1119 | * her before, she knows that something happened |
---|
1120 | * while we were processing this request, and it |
---|
1121 | * might be necessary to retry. |
---|
1122 | */ |
---|
1123 | INP_INFO_RLOCK(&V_tcbinfo); |
---|
1124 | xig.xig_gen = V_tcbinfo.ipi_gencnt; |
---|
1125 | xig.xig_sogen = so_gencnt; |
---|
1126 | xig.xig_count = V_tcbinfo.ipi_count + pcb_count; |
---|
1127 | INP_INFO_RUNLOCK(&V_tcbinfo); |
---|
1128 | error = SYSCTL_OUT(req, &xig, sizeof xig); |
---|
1129 | } |
---|
1130 | free(inp_list, M_TEMP); |
---|
1131 | return (error); |
---|
1132 | } |
---|
1133 | |
---|
1134 | SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, |
---|
1135 | tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); |
---|
1136 | |
---|
1137 | static int |
---|
1138 | tcp_getcred(SYSCTL_HANDLER_ARGS) |
---|
1139 | { |
---|
1140 | struct xucred xuc; |
---|
1141 | struct sockaddr_in addrs[2]; |
---|
1142 | struct inpcb *inp; |
---|
1143 | int error; |
---|
1144 | |
---|
1145 | error = priv_check(req->td, PRIV_NETINET_GETCRED); |
---|
1146 | if (error) |
---|
1147 | return (error); |
---|
1148 | error = SYSCTL_IN(req, addrs, sizeof(addrs)); |
---|
1149 | if (error) |
---|
1150 | return (error); |
---|
1151 | INP_INFO_RLOCK(&V_tcbinfo); |
---|
1152 | inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, |
---|
1153 | addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); |
---|
1154 | if (inp != NULL) { |
---|
1155 | INP_RLOCK(inp); |
---|
1156 | INP_INFO_RUNLOCK(&V_tcbinfo); |
---|
1157 | if (inp->inp_socket == NULL) |
---|
1158 | error = ENOENT; |
---|
1159 | if (error == 0) |
---|
1160 | error = cr_canseeinpcb(req->td->td_ucred, inp); |
---|
1161 | if (error == 0) |
---|
1162 | cru2x(inp->inp_cred, &xuc); |
---|
1163 | INP_RUNLOCK(inp); |
---|
1164 | } else { |
---|
1165 | INP_INFO_RUNLOCK(&V_tcbinfo); |
---|
1166 | error = ENOENT; |
---|
1167 | } |
---|
1168 | if (error == 0) |
---|
1169 | error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); |
---|
1170 | return (error); |
---|
1171 | } |
---|
1172 | |
---|
1173 | SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, |
---|
1174 | CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, |
---|
1175 | tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); |
---|
1176 | |
---|
#ifdef INET6
/*
 * Sysctl handler returning the credentials of an IPv6 TCP connection.
 *
 * The request supplies two struct sockaddr_in6 values, used for the PCB
 * lookup in the order addrs[1], addrs[0].  When addrs[0] is an
 * IPv4-mapped address, addrs[1] must be one as well (EINVAL otherwise)
 * and the lookup is done in the IPv4 hash using the embedded IPv4
 * addresses.  Returns ENOENT when no matching inpcb exists or it has no
 * socket.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, mapped = 0;

	error = priv_check(req->td, PRIV_NETINET_GETCRED);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);

	error = sa6_embedscope(&addrs[0], V_ip6_use_defzone);
	if (error != 0)
		return (error);
	error = sa6_embedscope(&addrs[1], V_ip6_use_defzone);
	if (error != 0)
		return (error);

	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (!IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			return (EINVAL);
		mapped = 1;
	}

	INP_INFO_RLOCK(&V_tcbinfo);
	if (mapped == 1) {
		/* The IPv4 address occupies the last four address bytes. */
		inp = in_pcblookup_hash(&V_tcbinfo,
		    *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
		    addrs[1].sin6_port,
		    *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
		    addrs[0].sin6_port,
		    0, NULL);
	} else {
		inp = in6_pcblookup_hash(&V_tcbinfo,
		    &addrs[1].sin6_addr, addrs[1].sin6_port,
		    &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
	}
	if (inp == NULL) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
		return (ENOENT);
	}

	/* Hand over from the pcbinfo lock to the inpcb lock. */
	INP_RLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	if (inp->inp_socket == NULL)
		error = ENOENT;
	else
		error = cr_canseeinpcb(req->td->td_ucred, inp);
	if (error == 0)
		cru2x(inp->inp_cred, &xuc);
	INP_RUNLOCK(inp);

	if (error != 0)
		return (error);
	return (SYSCTL_OUT(req, &xuc, sizeof(struct xucred)));
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
#endif
---|
1238 | |
---|
1239 | |
---|
1240 | void |
---|
1241 | tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) |
---|
1242 | { |
---|
1243 | struct ip *ip = vip; |
---|
1244 | struct tcphdr *th; |
---|
1245 | struct in_addr faddr; |
---|
1246 | struct inpcb *inp; |
---|
1247 | struct tcpcb *tp; |
---|
1248 | struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; |
---|
1249 | struct icmp *icp; |
---|
1250 | struct in_conninfo inc; |
---|
1251 | tcp_seq icmp_tcp_seq; |
---|
1252 | int mtu; |
---|
1253 | |
---|
1254 | faddr = ((struct sockaddr_in *)sa)->sin_addr; |
---|
1255 | if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) |
---|
1256 | return; |
---|
1257 | |
---|
1258 | if (cmd == PRC_MSGSIZE) |
---|
1259 | notify = tcp_mtudisc; |
---|
1260 | else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || |
---|
1261 | cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) |
---|
1262 | notify = tcp_drop_syn_sent; |
---|
1263 | /* |
---|
1264 | * Redirects don't need to be handled up here. |
---|
1265 | */ |
---|
1266 | else if (PRC_IS_REDIRECT(cmd)) |
---|
1267 | return; |
---|
1268 | /* |
---|
1269 | * Source quench is depreciated. |
---|
1270 | */ |
---|
1271 | else if (cmd == PRC_QUENCH) |
---|
1272 | return; |
---|
1273 | /* |
---|
1274 | * Hostdead is ugly because it goes linearly through all PCBs. |
---|
1275 | * XXX: We never get this from ICMP, otherwise it makes an |
---|
1276 | * excellent DoS attack on machines with many connections. |
---|
1277 | */ |
---|
1278 | else if (cmd == PRC_HOSTDEAD) |
---|
1279 | ip = NULL; |
---|
1280 | else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) |
---|
1281 | return; |
---|
1282 | if (ip != NULL) { |
---|
1283 | icp = (struct icmp *)((caddr_t)ip |
---|
1284 | - offsetof(struct icmp, icmp_ip)); |
---|
1285 | th = (struct tcphdr *)((caddr_t)ip |
---|
1286 | + (ip->ip_hl << 2)); |
---|
1287 | INP_INFO_WLOCK(&V_tcbinfo); |
---|
1288 | inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, |
---|
1289 | ip->ip_src, th->th_sport, 0, NULL); |
---|
1290 | if (inp != NULL) { |
---|
1291 | INP_WLOCK(inp); |
---|
1292 | if (!(inp->inp_flags & INP_TIMEWAIT) && |
---|
1293 | !(inp->inp_flags & INP_DROPPED) && |
---|
1294 | !(inp->inp_socket == NULL)) { |
---|
1295 | icmp_tcp_seq = htonl(th->th_seq); |
---|
1296 | tp = intotcpcb(inp); |
---|
1297 | if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && |
---|
1298 | SEQ_LT(icmp_tcp_seq, tp->snd_max)) { |
---|
1299 | if (cmd == PRC_MSGSIZE) { |
---|
1300 | /* |
---|
1301 | * MTU discovery: |
---|
1302 | * If we got a needfrag set the MTU |
---|
1303 | * in the route to the suggested new |
---|
1304 | * value (if given) and then notify. |
---|
1305 | */ |
---|
1306 | bzero(&inc, sizeof(inc)); |
---|
1307 | inc.inc_faddr = faddr; |
---|
1308 | inc.inc_fibnum = |
---|
1309 | inp->inp_inc.inc_fibnum; |
---|
1310 | |
---|
1311 | mtu = ntohs(icp->icmp_nextmtu); |
---|
1312 | /* |
---|
1313 | * If no alternative MTU was |
---|
1314 | * proposed, try the next smaller |
---|
1315 | * one. ip->ip_len has already |
---|
1316 | * been swapped in icmp_input(). |
---|
1317 | */ |
---|
1318 | if (!mtu) |
---|
1319 | mtu = ip_next_mtu(ip->ip_len, |
---|
1320 | 1); |
---|
1321 | if (mtu < V_tcp_minmss |
---|
1322 | + sizeof(struct tcpiphdr)) |
---|
1323 | mtu = V_tcp_minmss |
---|
1324 | + sizeof(struct tcpiphdr); |
---|
1325 | /* |
---|
1326 | * Only cache the the MTU if it |
---|
1327 | * is smaller than the interface |
---|
1328 | * or route MTU. tcp_mtudisc() |
---|
1329 | * will do right thing by itself. |
---|
1330 | */ |
---|
1331 | if (mtu <= tcp_maxmtu(&inc, NULL)) |
---|
1332 | tcp_hc_updatemtu(&inc, mtu); |
---|
1333 | } |
---|
1334 | |
---|
1335 | inp = (*notify)(inp, inetctlerrmap[cmd]); |
---|
1336 | } |
---|
1337 | } |
---|
1338 | if (inp != NULL) |
---|
1339 | INP_WUNLOCK(inp); |
---|
1340 | } else { |
---|
1341 | bzero(&inc, sizeof(inc)); |
---|
1342 | inc.inc_fport = th->th_dport; |
---|
1343 | inc.inc_lport = th->th_sport; |
---|
1344 | inc.inc_faddr = faddr; |
---|
1345 | inc.inc_laddr = ip->ip_src; |
---|
1346 | syncache_unreach(&inc, th); |
---|
1347 | } |
---|
1348 | INP_INFO_WUNLOCK(&V_tcbinfo); |
---|
1349 | } else |
---|
1350 | in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); |
---|
1351 | } |
---|
1352 | |
---|
#ifdef INET6
/*
 * Process a control-input notification for IPv6 TCP.
 *
 * 'cmd' is a PRC_* code, 'sa' must be a struct sockaddr_in6 and 'd', when
 * non-NULL, is a struct ip6ctlparam describing the offending packet.
 * With a packet header available the TCP ports are read from the mbuf and
 * both the matching PCBs and the syncache are notified; otherwise the
 * notification is delivered with wildcard ports.
 */
void
tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
	struct tcphdr th;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6 = NULL;
	struct mbuf *m = NULL;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = &sa6_any;
	struct in_conninfo inc;
	int off = 0;
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
	    ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;
	/* Source quench is deprecated. */
	else if (cmd == PRC_QUENCH)
		return;

	/* If the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	}

	if (ip6 == NULL) {
		in6_pcbnotify(&V_tcbinfo, sa, 0,
		    (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
		return;
	}

	/*
	 * XXX: We assume that when IPV6 is non NULL,
	 * M and OFF are valid.
	 */

	/* Check if we can safely examine src and dst ports. */
	if (m->m_pkthdr.len < off + sizeof(*thp))
		return;

	/* Only the two port fields are copied; the rest stays zeroed. */
	bzero(&th, sizeof(th));
	m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

	in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
	    (struct sockaddr *)ip6cp->ip6c_src,
	    th.th_sport, cmd, NULL, notify);

	bzero(&inc, sizeof(inc));
	inc.inc_fport = th.th_dport;
	inc.inc_lport = th.th_sport;
	inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
	inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
	inc.inc_flags |= INC_ISIPV6;
	INP_INFO_WLOCK(&V_tcbinfo);
	syncache_unreach(&inc, &th);
	INP_INFO_WUNLOCK(&V_tcbinfo);
}
#endif /* INET6 */
---|
1428 | |
---|
1429 | |
---|
1430 | /* |
---|
1431 | * Following is where TCP initial sequence number generation occurs. |
---|
1432 | * |
---|
1433 | * There are two places where we must use initial sequence numbers: |
---|
1434 | * 1. In SYN-ACK packets. |
---|
1435 | * 2. In SYN packets. |
---|
1436 | * |
---|
1437 | * All ISNs for SYN-ACK packets are generated by the syncache. See |
---|
1438 | * tcp_syncache.c for details. |
---|
1439 | * |
---|
1440 | * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling |
---|
1441 | * depends on this property. In addition, these ISNs should be |
---|
1442 | * unguessable so as to prevent connection hijacking. To satisfy |
---|
1443 | * the requirements of this situation, the algorithm outlined in |
---|
1444 | * RFC 1948 is used, with only small modifications. |
---|
1445 | * |
---|
1446 | * Implementation details: |
---|
1447 | * |
---|
1448 | * Time is based off the system timer, and is corrected so that it |
---|
1449 | * increases by one megabyte per second. This allows for proper |
---|
1450 | * recycling on high speed LANs while still leaving over an hour |
---|
1451 | * before rollover. |
---|
1452 | * |
---|
1453 | * As reading the *exact* system time is too expensive to be done |
---|
1454 | * whenever setting up a TCP connection, we increment the time |
---|
1455 | * offset in two ways. First, a small random positive increment |
---|
1456 | * is added to isn_offset for each connection that is set up. |
---|
1457 | * Second, the function tcp_isn_tick fires once per clock tick |
---|
1458 | * and increments isn_offset as necessary so that sequence numbers |
---|
1459 | * are incremented at approximately ISN_BYTES_PER_SECOND. The |
---|
1460 | * random positive increments serve only to ensure that the same |
---|
1461 | * exact sequence number is never sent out twice (as could otherwise |
---|
1462 | * happen when a port is recycled in less than the system tick |
---|
1463 | * interval.) |
---|
1464 | * |
---|
1465 | * net.inet.tcp.isn_reseed_interval controls the number of seconds |
---|
1466 | * between seeding of isn_secret. This is normally set to zero, |
---|
1467 | * as reseeding should not be necessary. |
---|
1468 | * |
---|
1469 | * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, |
---|
1470 | * and isn_offset_old is performed using ISN_LOCK()/ISN_UNLOCK(). The MD5 |
---|
1471 | * context isn_ctx is a local variable of tcp_new_isn() and needs no locking. |
---|
1472 | */ |
---|
1473 | |
---|
1474 | #define ISN_BYTES_PER_SECOND 1048576 |
---|
1475 | #define ISN_STATIC_INCREMENT 4096 |
---|
1476 | #define ISN_RANDOM_INCREMENT (4096 - 1) |
---|
1477 | |
---|
1478 | static VNET_DEFINE(u_char, isn_secret[32]); |
---|
1479 | static VNET_DEFINE(int, isn_last_reseed); |
---|
1480 | static VNET_DEFINE(u_int32_t, isn_offset); |
---|
1481 | static VNET_DEFINE(u_int32_t, isn_offset_old); |
---|
1482 | |
---|
1483 | #define V_isn_secret VNET(isn_secret) |
---|
1484 | #define V_isn_last_reseed VNET(isn_last_reseed) |
---|
1485 | #define V_isn_offset VNET(isn_offset) |
---|
1486 | #define V_isn_offset_old VNET(isn_offset_old) |
---|
1487 | |
---|
1488 | tcp_seq |
---|
1489 | tcp_new_isn(struct tcpcb *tp) |
---|
1490 | { |
---|
1491 | MD5_CTX isn_ctx; |
---|
1492 | u_int32_t md5_buffer[4]; |
---|
1493 | tcp_seq new_isn; |
---|
1494 | |
---|
1495 | INP_WLOCK_ASSERT(tp->t_inpcb); |
---|
1496 | |
---|
1497 | ISN_LOCK(); |
---|
1498 | /* Seed if this is the first use, reseed if requested. */ |
---|
1499 | if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && |
---|
1500 | (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) |
---|
1501 | < (u_int)ticks))) { |
---|
1502 | read_random(&V_isn_secret, sizeof(V_isn_secret)); |
---|
1503 | V_isn_last_reseed = ticks; |
---|
1504 | } |
---|
1505 | |
---|
1506 | /* Compute the md5 hash and return the ISN. */ |
---|
1507 | MD5Init(&isn_ctx); |
---|
1508 | MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); |
---|
1509 | MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); |
---|
1510 | #ifdef INET6 |
---|
1511 | if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { |
---|
1512 | MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, |
---|
1513 | sizeof(struct in6_addr)); |
---|
1514 | MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, |
---|
1515 | sizeof(struct in6_addr)); |
---|
1516 | } else |
---|
1517 | #endif |
---|
1518 | { |
---|
1519 | MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, |
---|
1520 | sizeof(struct in_addr)); |
---|
1521 | MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, |
---|
1522 | sizeof(struct in_addr)); |
---|
1523 | } |
---|
1524 | MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); |
---|
1525 | MD5Final((u_char *) &md5_buffer, &isn_ctx); |
---|
1526 | new_isn = (tcp_seq) md5_buffer[0]; |
---|
1527 | V_isn_offset += ISN_STATIC_INCREMENT + |
---|
1528 | (arc4random() & ISN_RANDOM_INCREMENT); |
---|
1529 | new_isn += V_isn_offset; |
---|
1530 | ISN_UNLOCK(); |
---|
1531 | return (new_isn); |
---|
1532 | } |
---|
1533 | |
---|
1534 | /* |
---|
1535 | * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary |
---|
1536 | * to keep time flowing at a relatively constant rate. If the random |
---|
1537 | * increments have already pushed us past the projected offset, do nothing. |
---|
1538 | */ |
---|
1539 | static void |
---|
1540 | tcp_isn_tick(void *xtp) |
---|
1541 | { |
---|
1542 | VNET_ITERATOR_DECL(vnet_iter); |
---|
1543 | u_int32_t projected_offset; |
---|
1544 | |
---|
1545 | VNET_LIST_RLOCK_NOSLEEP(); |
---|
1546 | ISN_LOCK(); |
---|
1547 | VNET_FOREACH(vnet_iter) { |
---|
1548 | CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */ |
---|
1549 | projected_offset = |
---|
1550 | V_isn_offset_old + ISN_BYTES_PER_SECOND / 100; |
---|
1551 | |
---|
1552 | if (SEQ_GT(projected_offset, V_isn_offset)) |
---|
1553 | V_isn_offset = projected_offset; |
---|
1554 | |
---|
1555 | V_isn_offset_old = V_isn_offset; |
---|
1556 | CURVNET_RESTORE(); |
---|
1557 | } |
---|
1558 | ISN_UNLOCK(); |
---|
1559 | VNET_LIST_RUNLOCK_NOSLEEP(); |
---|
1560 | callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); |
---|
1561 | } |
---|
1562 | |
---|
1563 | /* |
---|
1564 | * When a specific ICMP unreachable message is received and the |
---|
1565 | * connection state is SYN-SENT, drop the connection. This behavior |
---|
1566 | * is controlled by the icmp_may_rst sysctl. |
---|
1567 | */ |
---|
1568 | struct inpcb * |
---|
1569 | tcp_drop_syn_sent(struct inpcb *inp, int errno) |
---|
1570 | { |
---|
1571 | struct tcpcb *tp; |
---|
1572 | |
---|
1573 | INP_INFO_WLOCK_ASSERT(&V_tcbinfo); |
---|
1574 | INP_WLOCK_ASSERT(inp); |
---|
1575 | |
---|
1576 | if ((inp->inp_flags & INP_TIMEWAIT) || |
---|
1577 | (inp->inp_flags & INP_DROPPED)) |
---|
1578 | return (inp); |
---|
1579 | |
---|
1580 | tp = intotcpcb(inp); |
---|
1581 | if (tp->t_state != TCPS_SYN_SENT) |
---|
1582 | return (inp); |
---|
1583 | |
---|
1584 | tp = tcp_drop(tp, errno); |
---|
1585 | if (tp != NULL) |
---|
1586 | return (inp); |
---|
1587 | else |
---|
1588 | return (NULL); |
---|
1589 | } |
---|
1590 | |
---|
1591 | /* |
---|
1592 | * When `need fragmentation' ICMP is received, update our idea of the MSS |
---|
1593 | * based on the new value in the route. Also nudge TCP to send something, |
---|
1594 | * since we know the packet we just sent was dropped. |
---|
1595 | * This duplicates some code in the tcp_mss() function in tcp_input.c. |
---|
1596 | */ |
---|
1597 | struct inpcb * |
---|
1598 | tcp_mtudisc(struct inpcb *inp, int errno) |
---|
1599 | { |
---|
1600 | struct tcpcb *tp; |
---|
1601 | struct socket *so; |
---|
1602 | |
---|
1603 | INP_WLOCK_ASSERT(inp); |
---|
1604 | if ((inp->inp_flags & INP_TIMEWAIT) || |
---|
1605 | (inp->inp_flags & INP_DROPPED)) |
---|
1606 | return (inp); |
---|
1607 | |
---|
1608 | tp = intotcpcb(inp); |
---|
1609 | KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); |
---|
1610 | |
---|
1611 | tcp_mss_update(tp, -1, NULL, NULL); |
---|
1612 | |
---|
1613 | so = inp->inp_socket; |
---|
1614 | SOCKBUF_LOCK(&so->so_snd); |
---|
1615 | /* If the mss is larger than the socket buffer, decrease the mss. */ |
---|
1616 | if (so->so_snd.sb_hiwat < tp->t_maxseg) |
---|
1617 | tp->t_maxseg = so->so_snd.sb_hiwat; |
---|
1618 | SOCKBUF_UNLOCK(&so->so_snd); |
---|
1619 | |
---|
1620 | TCPSTAT_INC(tcps_mturesent); |
---|
1621 | tp->t_rtttime = 0; |
---|
1622 | tp->snd_nxt = tp->snd_una; |
---|
1623 | tcp_free_sackholes(tp); |
---|
1624 | tp->snd_recover = tp->snd_max; |
---|
1625 | if (tp->t_flags & TF_SACK_PERMIT) |
---|
1626 | EXIT_FASTRECOVERY(tp); |
---|
1627 | tcp_output_send(tp); |
---|
1628 | return (inp); |
---|
1629 | } |
---|
1630 | |
---|
1631 | /* |
---|
1632 | * Look-up the routing entry to the peer of this inpcb. If no route |
---|
1633 | * is found and it cannot be allocated, then return 0. This routine |
---|
1634 | * is called by TCP routines that access the rmx structure and by |
---|
1635 | * tcp_mss_update to get the peer/interface MTU. |
---|
1636 | */ |
---|
1637 | u_long |
---|
1638 | tcp_maxmtu(struct in_conninfo *inc, int *flags) |
---|
1639 | { |
---|
1640 | struct route sro; |
---|
1641 | struct sockaddr_in *dst; |
---|
1642 | struct ifnet *ifp; |
---|
1643 | u_long maxmtu = 0; |
---|
1644 | |
---|
1645 | KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); |
---|
1646 | |
---|
1647 | bzero(&sro, sizeof(sro)); |
---|
1648 | if (inc->inc_faddr.s_addr != INADDR_ANY) { |
---|
1649 | dst = (struct sockaddr_in *)&sro.ro_dst; |
---|
1650 | dst->sin_family = AF_INET; |
---|
1651 | dst->sin_len = sizeof(*dst); |
---|
1652 | dst->sin_addr = inc->inc_faddr; |
---|
1653 | in_rtalloc_ign(&sro, 0, inc->inc_fibnum); |
---|
1654 | } |
---|
1655 | if (sro.ro_rt != NULL) { |
---|
1656 | ifp = sro.ro_rt->rt_ifp; |
---|
1657 | if (sro.ro_rt->rt_rmx.rmx_mtu == 0) |
---|
1658 | maxmtu = ifp->if_mtu; |
---|
1659 | else |
---|
1660 | maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); |
---|
1661 | |
---|
1662 | /* Report additional interface capabilities. */ |
---|
1663 | if (flags != NULL) { |
---|
1664 | if (ifp->if_capenable & IFCAP_TSO4 && |
---|
1665 | ifp->if_hwassist & CSUM_TSO) |
---|
1666 | *flags |= CSUM_TSO; |
---|
1667 | } |
---|
1668 | RTFREE(sro.ro_rt); |
---|
1669 | } |
---|
1670 | return (maxmtu); |
---|
1671 | } |
---|
1672 | |
---|
1673 | #ifdef INET6 |
---|
1674 | u_long |
---|
1675 | tcp_maxmtu6(struct in_conninfo *inc, int *flags) |
---|
1676 | { |
---|
1677 | struct route_in6 sro6; |
---|
1678 | struct ifnet *ifp; |
---|
1679 | u_long maxmtu = 0; |
---|
1680 | |
---|
1681 | KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); |
---|
1682 | |
---|
1683 | bzero(&sro6, sizeof(sro6)); |
---|
1684 | if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { |
---|
1685 | sro6.ro_dst.sin6_family = AF_INET6; |
---|
1686 | sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); |
---|
1687 | sro6.ro_dst.sin6_addr = inc->inc6_faddr; |
---|
1688 | rtalloc_ign((struct route *)&sro6, 0); |
---|
1689 | } |
---|
1690 | if (sro6.ro_rt != NULL) { |
---|
1691 | ifp = sro6.ro_rt->rt_ifp; |
---|
1692 | if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) |
---|
1693 | maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); |
---|
1694 | else |
---|
1695 | maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, |
---|
1696 | IN6_LINKMTU(sro6.ro_rt->rt_ifp)); |
---|
1697 | |
---|
1698 | /* Report additional interface capabilities. */ |
---|
1699 | if (flags != NULL) { |
---|
1700 | if (ifp->if_capenable & IFCAP_TSO6 && |
---|
1701 | ifp->if_hwassist & CSUM_TSO) |
---|
1702 | *flags |= CSUM_TSO; |
---|
1703 | } |
---|
1704 | RTFREE(sro6.ro_rt); |
---|
1705 | } |
---|
1706 | |
---|
1707 | return (maxmtu); |
---|
1708 | } |
---|
1709 | #endif /* INET6 */ |
---|
1710 | |
---|
1711 | #ifdef IPSEC |
---|
1712 | /* compute ESP/AH header size for TCP, including outer IP header. */ |
---|
1713 | size_t |
---|
1714 | ipsec_hdrsiz_tcp(struct tcpcb *tp) |
---|
1715 | { |
---|
1716 | struct inpcb *inp; |
---|
1717 | struct mbuf *m; |
---|
1718 | size_t hdrsiz; |
---|
1719 | struct ip *ip; |
---|
1720 | #ifdef INET6 |
---|
1721 | struct ip6_hdr *ip6; |
---|
1722 | #endif |
---|
1723 | struct tcphdr *th; |
---|
1724 | |
---|
1725 | if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) |
---|
1726 | return (0); |
---|
1727 | MGETHDR(m, M_DONTWAIT, MT_DATA); |
---|
1728 | if (!m) |
---|
1729 | return (0); |
---|
1730 | |
---|
1731 | #ifdef INET6 |
---|
1732 | if ((inp->inp_vflag & INP_IPV6) != 0) { |
---|
1733 | ip6 = mtod(m, struct ip6_hdr *); |
---|
1734 | th = (struct tcphdr *)(ip6 + 1); |
---|
1735 | m->m_pkthdr.len = m->m_len = |
---|
1736 | sizeof(struct ip6_hdr) + sizeof(struct tcphdr); |
---|
1737 | tcpip_fillheaders(inp, ip6, th); |
---|
1738 | hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); |
---|
1739 | } else |
---|
1740 | #endif /* INET6 */ |
---|
1741 | { |
---|
1742 | ip = mtod(m, struct ip *); |
---|
1743 | th = (struct tcphdr *)(ip + 1); |
---|
1744 | m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); |
---|
1745 | tcpip_fillheaders(inp, ip, th); |
---|
1746 | hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); |
---|
1747 | } |
---|
1748 | |
---|
1749 | m_free(m); |
---|
1750 | return (hdrsiz); |
---|
1751 | } |
---|
1752 | #endif /* IPSEC */ |
---|
1753 | |
---|
1754 | /* |
---|
1755 | * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING |
---|
1756 | * |
---|
1757 | * This code attempts to calculate the bandwidth-delay product as a |
---|
1758 | * means of determining the optimal window size to maximize bandwidth, |
---|
1759 | * minimize RTT, and avoid the over-allocation of buffers on interfaces and |
---|
1760 | * routers. This code also does a fairly good job keeping RTTs in check |
---|
1761 | * across slow links like modems. We implement an algorithm which is very |
---|
1762 | * similar to (but not meant to be) TCP/Vegas. The code operates on the |
---|
1763 | * transmitter side of a TCP connection and so only affects the transmit |
---|
1764 | * side of the connection. |
---|
1765 | * |
---|
1766 | * BACKGROUND: TCP makes no provision for the management of buffer space |
---|
1767 | * at the end points or at the intermediate routers and switches. A TCP |
---|
1768 | * stream, whether using NewReno or not, will eventually buffer as |
---|
1769 | * many packets as it is able and the only reason this typically works is |
---|
1770 | * due to the fairly small default buffers made available for a connection |
---|
1771 | * (typically 16K or 32K). As machines use larger windows and/or window |
---|
1772 | * scaling it is now fairly easy for even a single TCP connection to blow-out |
---|
1773 | * all available buffer space not only on the local interface, but on |
---|
1774 | * intermediate routers and switches as well. NewReno makes a misguided |
---|
1775 | * attempt to 'solve' this problem by waiting for an actual failure to occur, |
---|
1776 | * then backing off, then steadily increasing the window again until another |
---|
1777 | * failure occurs, ad-infinitum. This results in terrible oscillation that |
---|
1778 | * is only made worse as network loads increase and the idea of intentionally |
---|
1779 | * blowing out network buffers is, frankly, a terrible way to manage network |
---|
1780 | * resources. |
---|
1781 | * |
---|
1782 | * It is far better to limit the transmit window prior to the failure |
---|
1783 | * condition being achieved. There are two general ways to do this: First |
---|
1784 | * you can 'scan' through different transmit window sizes and locate the |
---|
1785 | * point where the RTT stops increasing, indicating that you have filled the |
---|
1786 | * pipe, then scan backwards until you note that RTT stops decreasing, then |
---|
1787 | * repeat ad-infinitum. This method works in principle but has severe |
---|
1788 | * implementation issues due to RTT variances, timer granularity, and |
---|
1789 | * instability in the algorithm which can lead to many false positives and |
---|
1790 | * create oscillations as well as interact badly with other TCP streams |
---|
1791 | * implementing the same algorithm. |
---|
1792 | * |
---|
1793 | * The second method is to limit the window to the bandwidth delay product |
---|
1794 | * of the link. This is the method we implement. RTT variances and our |
---|
1795 | * own manipulation of the congestion window, bwnd, can potentially |
---|
1796 | * destabilize the algorithm. For this reason we have to stabilize the |
---|
1797 | * elements used to calculate the window. We do this by using the minimum |
---|
1798 | * observed RTT, the long term average of the observed bandwidth, and |
---|
1799 | * by adding two segments worth of slop. It isn't perfect but it is able |
---|
1800 | * to react to changing conditions and gives us a very stable basis on |
---|
1801 | * which to extend the algorithm. |
---|
1802 | */ |
---|
1803 | void |
---|
1804 | tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) |
---|
1805 | { |
---|
1806 | u_long bw; |
---|
1807 | u_long bwnd; |
---|
1808 | int save_ticks; |
---|
1809 | |
---|
1810 | INP_WLOCK_ASSERT(tp->t_inpcb); |
---|
1811 | |
---|
1812 | /* |
---|
1813 | * If inflight_enable is disabled in the middle of a tcp connection, |
---|
1814 | * make sure snd_bwnd is effectively disabled. |
---|
1815 | */ |
---|
1816 | if (V_tcp_inflight_enable == 0 || |
---|
1817 | tp->t_rttlow < V_tcp_inflight_rttthresh) { |
---|
1818 | tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
---|
1819 | tp->snd_bandwidth = 0; |
---|
1820 | return; |
---|
1821 | } |
---|
1822 | |
---|
1823 | /* |
---|
1824 | * Figure out the bandwidth. Due to the tick granularity this |
---|
1825 | * is a very rough number and it MUST be averaged over a fairly |
---|
1826 | * long period of time. XXX we need to take into account a link |
---|
1827 | * that is not using all available bandwidth, but for now our |
---|
1828 | * slop will ramp us up if this case occurs and the bandwidth later |
---|
1829 | * increases. |
---|
1830 | * |
---|
1831 | * Note: if ticks rollover 'bw' may wind up negative. We must |
---|
1832 | * effectively reset t_bw_rtttime for this case. |
---|
1833 | */ |
---|
1834 | save_ticks = ticks; |
---|
1835 | if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) |
---|
1836 | return; |
---|
1837 | |
---|
1838 | bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / |
---|
1839 | (save_ticks - tp->t_bw_rtttime); |
---|
1840 | tp->t_bw_rtttime = save_ticks; |
---|
1841 | tp->t_bw_rtseq = ack_seq; |
---|
1842 | if (tp->t_bw_rtttime == 0 || (int)bw < 0) |
---|
1843 | return; |
---|
1844 | bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; |
---|
1845 | |
---|
1846 | tp->snd_bandwidth = bw; |
---|
1847 | |
---|
1848 | /* |
---|
1849 | * Calculate the semi-static bandwidth delay product, plus two maximal |
---|
1850 | * segments. The additional slop puts us squarely in the sweet |
---|
1851 | * spot and also handles the bandwidth run-up case and stabilization. |
---|
1852 | * Without the slop we could be locking ourselves into a lower |
---|
1853 | * bandwidth. |
---|
1854 | * |
---|
1855 | * Situations Handled: |
---|
1856 | * (1) Prevents over-queueing of packets on LANs, especially on |
---|
1857 | * high speed LANs, allowing larger TCP buffers to be |
---|
1858 | * specified, and also does a good job preventing |
---|
1859 | * over-queueing of packets over choke points like modems |
---|
1860 | * (at least for the transmit side). |
---|
1861 | * |
---|
1862 | * (2) Is able to handle changing network loads (bandwidth |
---|
1863 | * drops so bwnd drops, bandwidth increases so bwnd |
---|
1864 | * increases). |
---|
1865 | * |
---|
1866 | * (3) Theoretically should stabilize in the face of multiple |
---|
1867 | * connections implementing the same algorithm (this may need |
---|
1868 | * a little work). |
---|
1869 | * |
---|
1870 | * (4) Stability value (defaults to 20 = 2 maximal packets) can |
---|
1871 | * be adjusted with a sysctl but typically only needs to be |
---|
1872 | * on very slow connections. A value no smaller then 5 |
---|
1873 | * should be used, but only reduce this default if you have |
---|
1874 | * no other choice. |
---|
1875 | */ |
---|
1876 | #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) |
---|
1877 | bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; |
---|
1878 | #undef USERTT |
---|
1879 | |
---|
1880 | if (tcp_inflight_debug > 0) { |
---|
1881 | static int ltime; |
---|
1882 | if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { |
---|
1883 | ltime = ticks; |
---|
1884 | printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", |
---|
1885 | tp, |
---|
1886 | bw, |
---|
1887 | tp->t_rttbest, |
---|
1888 | tp->t_srtt, |
---|
1889 | bwnd |
---|
1890 | ); |
---|
1891 | } |
---|
1892 | } |
---|
1893 | if ((long)bwnd < V_tcp_inflight_min) |
---|
1894 | bwnd = V_tcp_inflight_min; |
---|
1895 | if (bwnd > V_tcp_inflight_max) |
---|
1896 | bwnd = V_tcp_inflight_max; |
---|
1897 | if ((long)bwnd < tp->t_maxseg * 2) |
---|
1898 | bwnd = tp->t_maxseg * 2; |
---|
1899 | tp->snd_bwnd = bwnd; |
---|
1900 | } |
---|
1901 | |
---|
1902 | #ifdef TCP_SIGNATURE |
---|
1903 | /* |
---|
1904 | * Callback function invoked by m_apply() to digest TCP segment data |
---|
1905 | * contained within an mbuf chain. |
---|
1906 | */ |
---|
1907 | static int |
---|
1908 | tcp_signature_apply(void *fstate, void *data, u_int len) |
---|
1909 | { |
---|
1910 | |
---|
1911 | MD5Update(fstate, (u_char *)data, len); |
---|
1912 | return (0); |
---|
1913 | } |
---|
1914 | |
---|
1915 | /* |
---|
1916 | * Compute TCP-MD5 hash of a TCP segment. (RFC2385) |
---|
1917 | * |
---|
1918 | * Parameters: |
---|
1919 | * m pointer to head of mbuf chain |
---|
1920 | * _unused |
---|
1921 | * len length of TCP segment data, excluding options |
---|
1922 | * optlen length of TCP segment options |
---|
1923 | * buf pointer to storage for computed MD5 digest |
---|
1924 | * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) |
---|
1925 | * |
---|
1926 | * We do this over ip, tcphdr, segment data, and the key in the SADB. |
---|
1927 | * When called from tcp_input(), we can be sure that th_sum has been |
---|
1928 | * zeroed out and verified already. |
---|
1929 | * |
---|
1930 | * Return 0 if successful, otherwise return -1. |
---|
1931 | * |
---|
1932 | * XXX The key is retrieved from the system's PF_KEY SADB, by keying a |
---|
1933 | * search with the destination IP address, and a 'magic SPI' to be |
---|
1934 | * determined by the application. This is hardcoded elsewhere to 1179 |
---|
1935 | * right now. Another branch of this code exists which uses the SPD to |
---|
1936 | * specify per-application flows but it is unstable. |
---|
1937 | */ |
---|
1938 | int |
---|
1939 | tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, |
---|
1940 | u_char *buf, u_int direction) |
---|
1941 | { |
---|
1942 | union sockaddr_union dst; |
---|
1943 | struct ippseudo ippseudo; |
---|
1944 | MD5_CTX ctx; |
---|
1945 | int doff; |
---|
1946 | struct ip *ip; |
---|
1947 | struct ipovly *ipovly; |
---|
1948 | struct secasvar *sav; |
---|
1949 | struct tcphdr *th; |
---|
1950 | #ifdef INET6 |
---|
1951 | struct ip6_hdr *ip6; |
---|
1952 | struct in6_addr in6; |
---|
1953 | char ip6buf[INET6_ADDRSTRLEN]; |
---|
1954 | uint32_t plen; |
---|
1955 | uint16_t nhdr; |
---|
1956 | #endif |
---|
1957 | u_short savecsum; |
---|
1958 | |
---|
1959 | KASSERT(m != NULL, ("NULL mbuf chain")); |
---|
1960 | KASSERT(buf != NULL, ("NULL signature pointer")); |
---|
1961 | |
---|
1962 | /* Extract the destination from the IP header in the mbuf. */ |
---|
1963 | bzero(&dst, sizeof(union sockaddr_union)); |
---|
1964 | ip = mtod(m, struct ip *); |
---|
1965 | #ifdef INET6 |
---|
1966 | ip6 = NULL; /* Make the compiler happy. */ |
---|
1967 | #endif |
---|
1968 | switch (ip->ip_v) { |
---|
1969 | case IPVERSION: |
---|
1970 | dst.sa.sa_len = sizeof(struct sockaddr_in); |
---|
1971 | dst.sa.sa_family = AF_INET; |
---|
1972 | dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? |
---|
1973 | ip->ip_src : ip->ip_dst; |
---|
1974 | break; |
---|
1975 | #ifdef INET6 |
---|
1976 | case (IPV6_VERSION >> 4): |
---|
1977 | ip6 = mtod(m, struct ip6_hdr *); |
---|
1978 | dst.sa.sa_len = sizeof(struct sockaddr_in6); |
---|
1979 | dst.sa.sa_family = AF_INET6; |
---|
1980 | dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? |
---|
1981 | ip6->ip6_src : ip6->ip6_dst; |
---|
1982 | break; |
---|
1983 | #endif |
---|
1984 | default: |
---|
1985 | return (EINVAL); |
---|
1986 | /* NOTREACHED */ |
---|
1987 | break; |
---|
1988 | } |
---|
1989 | |
---|
1990 | /* Look up an SADB entry which matches the address of the peer. */ |
---|
1991 | sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); |
---|
1992 | if (sav == NULL) { |
---|
1993 | ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, |
---|
1994 | (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : |
---|
1995 | #ifdef INET6 |
---|
1996 | (ip->ip_v == (IPV6_VERSION >> 4)) ? |
---|
1997 | ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : |
---|
1998 | #endif |
---|
1999 | "(unsupported)")); |
---|
2000 | return (EINVAL); |
---|
2001 | } |
---|
2002 | |
---|
2003 | MD5Init(&ctx); |
---|
2004 | /* |
---|
2005 | * Step 1: Update MD5 hash with IP(v6) pseudo-header. |
---|
2006 | * |
---|
2007 | * XXX The ippseudo header MUST be digested in network byte order, |
---|
2008 | * or else we'll fail the regression test. Assume all fields we've |
---|
2009 | * been doing arithmetic on have been in host byte order. |
---|
2010 | * XXX One cannot depend on ipovly->ih_len here. When called from |
---|
2011 | * tcp_output(), the underlying ip_len member has not yet been set. |
---|
2012 | */ |
---|
2013 | switch (ip->ip_v) { |
---|
2014 | case IPVERSION: |
---|
2015 | ipovly = (struct ipovly *)ip; |
---|
2016 | ippseudo.ippseudo_src = ipovly->ih_src; |
---|
2017 | ippseudo.ippseudo_dst = ipovly->ih_dst; |
---|
2018 | ippseudo.ippseudo_pad = 0; |
---|
2019 | ippseudo.ippseudo_p = IPPROTO_TCP; |
---|
2020 | ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + |
---|
2021 | optlen); |
---|
2022 | MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); |
---|
2023 | |
---|
2024 | th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); |
---|
2025 | doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; |
---|
2026 | break; |
---|
2027 | #ifdef INET6 |
---|
2028 | /* |
---|
2029 | * RFC 2385, 2.0 Proposal |
---|
2030 | * For IPv6, the pseudo-header is as described in RFC 2460, namely the |
---|
2031 | * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- |
---|
2032 | * extended next header value (to form 32 bits), and 32-bit segment |
---|
2033 | * length. |
---|
2034 | * Note: Upper-Layer Packet Length comes before Next Header. |
---|
2035 | */ |
---|
2036 | case (IPV6_VERSION >> 4): |
---|
2037 | in6 = ip6->ip6_src; |
---|
2038 | in6_clearscope(&in6); |
---|
2039 | MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); |
---|
2040 | in6 = ip6->ip6_dst; |
---|
2041 | in6_clearscope(&in6); |
---|
2042 | MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); |
---|
2043 | plen = htonl(len + sizeof(struct tcphdr) + optlen); |
---|
2044 | MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); |
---|
2045 | nhdr = 0; |
---|
2046 | MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
---|
2047 | MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
---|
2048 | MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
---|
2049 | nhdr = IPPROTO_TCP; |
---|
2050 | MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); |
---|
2051 | |
---|
2052 | th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); |
---|
2053 | doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; |
---|
2054 | break; |
---|
2055 | #endif |
---|
2056 | default: |
---|
2057 | return (EINVAL); |
---|
2058 | /* NOTREACHED */ |
---|
2059 | break; |
---|
2060 | } |
---|
2061 | |
---|
2062 | |
---|
2063 | /* |
---|
2064 | * Step 2: Update MD5 hash with TCP header, excluding options. |
---|
2065 | * The TCP checksum must be set to zero. |
---|
2066 | */ |
---|
2067 | savecsum = th->th_sum; |
---|
2068 | th->th_sum = 0; |
---|
2069 | MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); |
---|
2070 | th->th_sum = savecsum; |
---|
2071 | |
---|
2072 | /* |
---|
2073 | * Step 3: Update MD5 hash with TCP segment data. |
---|
2074 | * Use m_apply() to avoid an early m_pullup(). |
---|
2075 | */ |
---|
2076 | if (len > 0) |
---|
2077 | m_apply(m, doff, len, tcp_signature_apply, &ctx); |
---|
2078 | |
---|
2079 | /* |
---|
2080 | * Step 4: Update MD5 hash with shared secret. |
---|
2081 | */ |
---|
2082 | MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); |
---|
2083 | MD5Final(buf, &ctx); |
---|
2084 | |
---|
2085 | key_sa_recordxfer(sav, m); |
---|
2086 | KEY_FREESAV(&sav); |
---|
2087 | return (0); |
---|
2088 | } |
---|
2089 | #endif /* TCP_SIGNATURE */ |
---|
2090 | |
---|
2091 | static int |
---|
2092 | sysctl_drop(SYSCTL_HANDLER_ARGS) |
---|
2093 | { |
---|
2094 | /* addrs[0] is a foreign socket, addrs[1] is a local one. */ |
---|
2095 | struct sockaddr_storage addrs[2]; |
---|
2096 | struct inpcb *inp; |
---|
2097 | struct tcpcb *tp; |
---|
2098 | struct tcptw *tw; |
---|
2099 | struct sockaddr_in *fin, *lin; |
---|
2100 | #ifdef INET6 |
---|
2101 | struct sockaddr_in6 *fin6, *lin6; |
---|
2102 | #endif |
---|
2103 | int error; |
---|
2104 | |
---|
2105 | inp = NULL; |
---|
2106 | fin = lin = NULL; |
---|
2107 | #ifdef INET6 |
---|
2108 | fin6 = lin6 = NULL; |
---|
2109 | #endif |
---|
2110 | error = 0; |
---|
2111 | |
---|
2112 | if (req->oldptr != NULL || req->oldlen != 0) |
---|
2113 | return (EINVAL); |
---|
2114 | if (req->newptr == NULL) |
---|
2115 | return (EPERM); |
---|
2116 | if (req->newlen < sizeof(addrs)) |
---|
2117 | return (ENOMEM); |
---|
2118 | error = SYSCTL_IN(req, &addrs, sizeof(addrs)); |
---|
2119 | if (error) |
---|
2120 | return (error); |
---|
2121 | |
---|
2122 | switch (addrs[0].ss_family) { |
---|
2123 | #ifdef INET6 |
---|
2124 | case AF_INET6: |
---|
2125 | fin6 = (struct sockaddr_in6 *)&addrs[0]; |
---|
2126 | lin6 = (struct sockaddr_in6 *)&addrs[1]; |
---|
2127 | if (fin6->sin6_len != sizeof(struct sockaddr_in6) || |
---|
2128 | lin6->sin6_len != sizeof(struct sockaddr_in6)) |
---|
2129 | return (EINVAL); |
---|
2130 | if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { |
---|
2131 | if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) |
---|
2132 | return (EINVAL); |
---|
2133 | in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); |
---|
2134 | in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); |
---|
2135 | fin = (struct sockaddr_in *)&addrs[0]; |
---|
2136 | lin = (struct sockaddr_in *)&addrs[1]; |
---|
2137 | break; |
---|
2138 | } |
---|
2139 | error = sa6_embedscope(fin6, V_ip6_use_defzone); |
---|
2140 | if (error) |
---|
2141 | return (error); |
---|
2142 | error = sa6_embedscope(lin6, V_ip6_use_defzone); |
---|
2143 | if (error) |
---|
2144 | return (error); |
---|
2145 | break; |
---|
2146 | #endif |
---|
2147 | case AF_INET: |
---|
2148 | fin = (struct sockaddr_in *)&addrs[0]; |
---|
2149 | lin = (struct sockaddr_in *)&addrs[1]; |
---|
2150 | if (fin->sin_len != sizeof(struct sockaddr_in) || |
---|
2151 | lin->sin_len != sizeof(struct sockaddr_in)) |
---|
2152 | return (EINVAL); |
---|
2153 | break; |
---|
2154 | default: |
---|
2155 | return (EINVAL); |
---|
2156 | } |
---|
2157 | INP_INFO_WLOCK(&V_tcbinfo); |
---|
2158 | switch (addrs[0].ss_family) { |
---|
2159 | #ifdef INET6 |
---|
2160 | case AF_INET6: |
---|
2161 | inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr, |
---|
2162 | fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0, |
---|
2163 | NULL); |
---|
2164 | break; |
---|
2165 | #endif |
---|
2166 | case AF_INET: |
---|
2167 | inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, |
---|
2168 | fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); |
---|
2169 | break; |
---|
2170 | } |
---|
2171 | if (inp != NULL) { |
---|
2172 | INP_WLOCK(inp); |
---|
2173 | if (inp->inp_flags & INP_TIMEWAIT) { |
---|
2174 | /* |
---|
2175 | * XXXRW: There currently exists a state where an |
---|
2176 | * inpcb is present, but its timewait state has been |
---|
2177 | * discarded. For now, don't allow dropping of this |
---|
2178 | * type of inpcb. |
---|
2179 | */ |
---|
2180 | tw = intotw(inp); |
---|
2181 | if (tw != NULL) |
---|
2182 | tcp_twclose(tw, 0); |
---|
2183 | else |
---|
2184 | INP_WUNLOCK(inp); |
---|
2185 | } else if (!(inp->inp_flags & INP_DROPPED) && |
---|
2186 | !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { |
---|
2187 | tp = intotcpcb(inp); |
---|
2188 | tp = tcp_drop(tp, ECONNABORTED); |
---|
2189 | if (tp != NULL) |
---|
2190 | INP_WUNLOCK(inp); |
---|
2191 | } else |
---|
2192 | INP_WUNLOCK(inp); |
---|
2193 | } else |
---|
2194 | error = ESRCH; |
---|
2195 | INP_INFO_WUNLOCK(&V_tcbinfo); |
---|
2196 | return (error); |
---|
2197 | } |
---|
2198 | |
---|
2199 | SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, |
---|
2200 | CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, |
---|
2201 | 0, sysctl_drop, "", "Drop TCP connection"); |
---|
2202 | |
---|
2203 | /* |
---|
2204 | * Generate a standardized TCP log line for use throughout the |
---|
2205 | * tcp subsystem. Memory allocation is done with M_NOWAIT to |
---|
2206 | * allow use in the interrupt context. |
---|
2207 | * |
---|
2208 | * NB: The caller MUST free(s, M_TCPLOG) the returned string. |
---|
2209 | * NB: The function may return NULL if memory allocation failed. |
---|
2210 | * |
---|
2211 | * Due to header inclusion and ordering limitations the struct ip |
---|
2212 | * and ip6_hdr pointers have to be passed as void pointers. |
---|
2213 | */ |
---|
2214 | char * |
---|
2215 | tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, |
---|
2216 | const void *ip6hdr) |
---|
2217 | { |
---|
2218 | |
---|
2219 | /* Is logging enabled? */ |
---|
2220 | if (tcp_log_in_vain == 0) |
---|
2221 | return (NULL); |
---|
2222 | |
---|
2223 | return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); |
---|
2224 | } |
---|
2225 | |
---|
2226 | char * |
---|
2227 | tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, |
---|
2228 | const void *ip6hdr) |
---|
2229 | { |
---|
2230 | |
---|
2231 | /* Is logging enabled? */ |
---|
2232 | if (tcp_log_debug == 0) |
---|
2233 | return (NULL); |
---|
2234 | |
---|
2235 | return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); |
---|
2236 | } |
---|
2237 | |
---|
2238 | static char * |
---|
2239 | tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, |
---|
2240 | const void *ip6hdr) |
---|
2241 | { |
---|
2242 | char *s, *sp; |
---|
2243 | size_t size; |
---|
2244 | struct ip *ip; |
---|
2245 | #ifdef INET6 |
---|
2246 | const struct ip6_hdr *ip6; |
---|
2247 | |
---|
2248 | ip6 = (const struct ip6_hdr *)ip6hdr; |
---|
2249 | #endif /* INET6 */ |
---|
2250 | ip = (struct ip *)ip4hdr; |
---|
2251 | |
---|
2252 | /* |
---|
2253 | * The log line looks like this: |
---|
2254 | * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>" |
---|
2255 | */ |
---|
2256 | size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + |
---|
2257 | sizeof(PRINT_TH_FLAGS) + 1 + |
---|
2258 | #ifdef INET6 |
---|
2259 | 2 * INET6_ADDRSTRLEN; |
---|
2260 | #else |
---|
2261 | 2 * INET_ADDRSTRLEN; |
---|
2262 | #endif /* INET6 */ |
---|
2263 | |
---|
2264 | s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); |
---|
2265 | if (s == NULL) |
---|
2266 | return (NULL); |
---|
2267 | |
---|
2268 | strcat(s, "TCP: ["); |
---|
2269 | sp = s + strlen(s); |
---|
2270 | |
---|
2271 | if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { |
---|
2272 | inet_ntoa_r(inc->inc_faddr, sp); |
---|
2273 | sp = s + strlen(s); |
---|
2274 | sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); |
---|
2275 | sp = s + strlen(s); |
---|
2276 | inet_ntoa_r(inc->inc_laddr, sp); |
---|
2277 | sp = s + strlen(s); |
---|
2278 | sprintf(sp, "]:%i", ntohs(inc->inc_lport)); |
---|
2279 | #ifdef INET6 |
---|
2280 | } else if (inc) { |
---|
2281 | ip6_sprintf(sp, &inc->inc6_faddr); |
---|
2282 | sp = s + strlen(s); |
---|
2283 | sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); |
---|
2284 | sp = s + strlen(s); |
---|
2285 | ip6_sprintf(sp, &inc->inc6_laddr); |
---|
2286 | sp = s + strlen(s); |
---|
2287 | sprintf(sp, "]:%i", ntohs(inc->inc_lport)); |
---|
2288 | } else if (ip6 && th) { |
---|
2289 | ip6_sprintf(sp, &ip6->ip6_src); |
---|
2290 | sp = s + strlen(s); |
---|
2291 | sprintf(sp, "]:%i to [", ntohs(th->th_sport)); |
---|
2292 | sp = s + strlen(s); |
---|
2293 | ip6_sprintf(sp, &ip6->ip6_dst); |
---|
2294 | sp = s + strlen(s); |
---|
2295 | sprintf(sp, "]:%i", ntohs(th->th_dport)); |
---|
2296 | #endif /* INET6 */ |
---|
2297 | } else if (ip && th) { |
---|
2298 | inet_ntoa_r(ip->ip_src, sp); |
---|
2299 | sp = s + strlen(s); |
---|
2300 | sprintf(sp, "]:%i to [", ntohs(th->th_sport)); |
---|
2301 | sp = s + strlen(s); |
---|
2302 | inet_ntoa_r(ip->ip_dst, sp); |
---|
2303 | sp = s + strlen(s); |
---|
2304 | sprintf(sp, "]:%i", ntohs(th->th_dport)); |
---|
2305 | } else { |
---|
2306 | free(s, M_TCPLOG); |
---|
2307 | return (NULL); |
---|
2308 | } |
---|
2309 | sp = s + strlen(s); |
---|
2310 | if (th) |
---|
2311 | sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); |
---|
2312 | if (*(s + size - 1) != '\0') |
---|
2313 | panic("%s: string too long", __func__); |
---|
2314 | return (s); |
---|
2315 | } |
---|