[20180126]内核参数tcp_keepalive.txt
--//linux内核参数tcp_keepalive,用来检查网络链路是否有效.缺省:
# echo /proc/sys/net/ipv4/tcp_keepalive* | xargs -n 1 strings -1 -f
/proc/sys/net/ipv4/tcp_keepalive_intvl: 75
/proc/sys/net/ipv4/tcp_keepalive_probes: 9
/proc/sys/net/ipv4/tcp_keepalive_time: 7200
参数解析:
/proc/sys/net/ipv4/tcp_keepalive_time 当keepalive启用的时候,连接空闲(没有数据包收发)多长时间后TCP开始发送keepalive探测包。默认是7200秒(2小时)。
/proc/sys/net/ipv4/tcp_keepalive_intvl 当探测没有确认时,keepalive探测包的发送间隔。缺省是75秒。
/proc/sys/net/ipv4/tcp_keepalive_probes 如果对方不予应答,keepalive探测包的发送次数。缺省值是9。
--//也就是说,缺省情况下连接要空闲2小时后,TCP才会开始发送keepalive探测包.
--//我以前做过测试,链接:http://blog.itpub.net/267265/viewspace-2138391/
--//当时测试对应oracle的连接,如果设置sqlnet.expire_time参数,内核参数无效.
--//而且我在许多测试中提到,如果使用sqlnet.expire_time参数,监测包的最大间隔是2倍的sqlnet.expire_time,因为在1个sqlnet.expire_time时间段内,
--//如果连接上有数据包收发,这个时间段的监测包就不会发送.
--//参考链接:http://blog.itpub.net/267265/viewspace-2150499/
--//而使用linux内核参数tcp_keepalive参数与使用sqlnet.expire_time还是有点点不同:
--//注释掉sqlnet.expire_time参数,设置内核参数如下:
--//修改/etc/sysctl.conf 加入如下:
net.ipv4.tcp_keepalive_time = 180
net.ipv4.tcp_keepalive_intvl = 10
net.ipv4.tcp_keepalive_probes = 4
# sysctl -p
$ echo /proc/sys/net/ipv4/tcp_keepalive* | xargs -n 1 strings -1 -f
/proc/sys/net/ipv4/tcp_keepalive_intvl: 10
/proc/sys/net/ipv4/tcp_keepalive_probes: 4
/proc/sys/net/ipv4/tcp_keepalive_time: 180
--//打开tcpdump的同时,启动一个新会话(远端).
# tcpdump -i eth0 host 192.168.98.6 and not port 22 -nn -vv
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
09:45:32.852127 IP (tos 0x0, ttl 127, id 1649, offset 0, flags [DF], proto: TCP (6), length: 52) 192.168.98.6.57017 > 192.168.100.78.1521: S, cksum 0x362f (correct), 3423871106:3423871106(0) win 8192 <mss 1460,nop,wscale 2,nop,nop,sackOK>
09:45:32.852176 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto: TCP (6), length: 52) 192.168.100.78.1521 > 192.168.98.6.57017: S, cksum 0x47cc (incorrect (-> 0x464b), 1678537401:1678537401(0) ack 3423871107 win 14600 <mss 1460,nop,nop,sackOK,nop,wscale 7>
...
09:45:33.175785 IP (tos 0x0, ttl 64, id 39960, offset 0, flags [DF], proto: TCP (6), length: 57) 192.168.100.78.1521 > 192.168.98.6.57017: P, cksum 0x47d1 (incorrect (-> 0x76cb), 6608:6625(17) ack 7937 win 330
09:45:33.176049 IP (tos 0x0, ttl 127, id 1696, offset 0, flags [DF], proto: TCP (6), length: 52) 192.168.98.6.57017 > 192.168.100.78.1521: ., cksum 0x303e (correct), 7937:7937(0) ack 6625 win 16307 <nop,nop,sack 1 {6608:6625}>
--//不执行任何sql语句.
09:48:33.400755 IP (tos 0x0, ttl 64, id 39961, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.100.78.1521 > 192.168.98.6.57017: ., cksum 0x47c0 (incorrect (-> 0x85fc), 6624:6624(0) ack 7937 win 330
09:48:33.401006 IP (tos 0x0, ttl 127, id 5703, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.98.6.57017 > 192.168.100.78.1521: ., cksum 0x4792 (correct), 7937:7937(0) ack 6625 win 16307
09:51:33.624757 IP (tos 0x0, ttl 64, id 39962, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.100.78.1521 > 192.168.98.6.57017: ., cksum 0x47c0 (incorrect (-> 0x85fc), 6624:6624(0) ack 7937 win 330
09:51:33.625026 IP (tos 0x0, ttl 127, id 7103, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.98.6.57017 > 192.168.100.78.1521: ., cksum 0x4792 (correct), 7937:7937(0) ack 6625 win 16307
09:54:33.848760 IP (tos 0x0, ttl 64, id 39963, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.100.78.1521 > 192.168.98.6.57017: ., cksum 0x47c0 (incorrect (-> 0x85fc), 6624:6624(0) ack 7937 win 330
09:54:33.848986 IP (tos 0x0, ttl 127, id 8121, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.98.6.57017 > 192.168.100.78.1521: ., cksum 0x4792 (correct), 7937:7937(0) ack 6625 win 16307
09:57:34.072758 IP (tos 0x0, ttl 64, id 39964, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.100.78.1521 > 192.168.98.6.57017: ., cksum 0x47c0 (incorrect (-> 0x85fc), 6624:6624(0) ack 7937 win 330
09:57:34.072989 IP (tos 0x0, ttl 127, id 9256, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.98.6.57017 > 192.168.100.78.1521: ., cksum 0x4792 (correct), 7937:7937(0) ack 6625 win 16307
--//可以发现间隔3分钟.
--//在会话执行如下:
SCOTT@78> select sysdate from Dual ;
SYSDATE
-------------------
2018-01-26 09:57:54
09:57:54.443243 IP (tos 0x0, ttl 127, id 9364, offset 0, flags [DF], proto: TCP (6), length: 303) 192.168.98.6.57017 > 192.168.100.78.1521: P 7937:8200(263) ack 6625 win 16307
09:57:54.445202 IP (tos 0x0, ttl 64, id 39965, offset 0, flags [DF], proto: TCP (6), length: 308) 192.168.100.78.1521 > 192.168.98.6.57017: P 6625:6893(268) ack 8200 win 330
09:57:54.463455 IP (tos 0x0, ttl 127, id 9366, offset 0, flags [DF], proto: TCP (6), length: 61) 192.168.98.6.57017 > 192.168.100.78.1521: P, cksum 0x25c1 (correct), 8200:8221(21) ack 6893 win 16240
09:57:54.463592 IP (tos 0x0, ttl 64, id 39966, offset 0, flags [DF], proto: TCP (6), length: 142) 192.168.100.78.1521 > 192.168.98.6.57017: P 6893:6995(102) ack 8221 win 330
09:57:54.666785 IP (tos 0x0, ttl 64, id 39967, offset 0, flags [DF], proto: TCP (6), length: 142) 192.168.100.78.1521 > 192.168.98.6.57017: P 6893:6995(102) ack 8221 win 330
09:57:54.667035 IP (tos 0x0, ttl 127, id 9368, offset 0, flags [DF], proto: TCP (6), length: 52) 192.168.98.6.57017 > 192.168.100.78.1521: ., cksum 0x2b7e (correct), 8221:8221(0) ack 6995 win 16214 <nop,nop,sack 1 {6893:6995}>
--//以上是执行sql语句时的抓包情况.之后不再执行任何语句,继续等待....
10:00:54.712765 IP (tos 0x0, ttl 64, id 39968, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.100.78.1521 > 192.168.98.6.57017: ., cksum 0x47c0 (incorrect (-> 0x836e), 6994:6994(0) ack 8221 win 330
10:00:54.713009 IP (tos 0x0, ttl 127, id 10523, offset 0, flags [DF], proto: TCP (6), length: 40) 192.168.98.6.57017 > 192.168.100.78.1521: ., cksum 0x4561 (correct), 8221:8221(0) ack 6995 win 16214
--//可以发现间隔仍然是3分钟,但是是从执行sql语句后最后一次数据包收发(09:57:54.667035)算起,而不是从上一次监测包的时间09:57:34.072989算起.
--//如果对比还是可以看出两者的区别:
--//使用内核参数时,是从连接最后一次数据包收发算起,空闲时间达到tcp_keepalive_time后才发出监测包.
--//而使用sqlnet.expire_time参数,它是从连接开始算起,在每1个sqlnet.expire_time时间段内,如果有数据包发送,就不发送监测包.这样就会出现最大间隔
--//2个sqlnet.expire_time的情况,感觉这东西表达有点困难.大家还是看链接的测试吧:http://blog.itpub.net/267265/viewspace-2150499/
--//从某种意义上讲,我感觉使用内核参数更加科学,而且可以更精确地控制链路出问题时的探测次数以及时间间隔.