Merge pull request #36126 from edlane/develop

Add keepalives to salt-master created connections for better presence
This commit is contained in:
Mike Place 2016-09-09 18:50:51 +09:00 committed by GitHub
commit 8f609a5799
2 changed files with 66 additions and 26 deletions

View file

@ -1024,3 +1024,35 @@
#event_match_type: startswith
# Save runner returns to the job cache
#runner_returns: True
###### Keepalive settings ######
############################################
# Warning: Failure to set TCP keepalives on the salt-master can result in
# not detecting the loss of a minion when the connection is lost or when
# it's host has been terminated without first closing the socket.
# Salt's Presence System depends on this connection status to know if a minion
# is "present".
# ZeroMQ now includes support for configuring SO_KEEPALIVE if supported by
# the OS. If connections between the minion and the master pass through
# a state tracking device such as a firewall or VPN gateway, there is
# the risk that it could tear down the connection the master and minion
# without informing either party that their connection has been taken away.
# Enabling TCP Keepalives prevents this from happening.
# Overall state of TCP Keepalives, enable (1 or True), disable (0 or False)
# or leave to the OS defaults (-1), on Linux, typically disabled. Default True, enabled.
#tcp_keepalive: True
# How long before the first keepalive should be sent in seconds. Default 300
# to send the first keepalive after 5 minutes, OS default (-1) is typically 7200 seconds
# on Linux see /proc/sys/net/ipv4/tcp_keepalive_time.
#tcp_keepalive_idle: 300
# How many lost probes are needed to consider the connection lost. Default -1
# to use OS defaults, typically 9 on Linux, see /proc/sys/net/ipv4/tcp_keepalive_probes.
#tcp_keepalive_cnt: -1
# How often, in seconds, to send keepalives after the first one. Default -1 to
# use OS defaults, typically 75 seconds on Linux, see
# /proc/sys/net/ipv4/tcp_keepalive_intvl.
#tcp_keepalive_intvl: -1

View file

@ -649,6 +649,38 @@ class ZeroMQReqServerChannel(salt.transport.mixins.auth.AESReqServerMixin, salt.
sys.exit(salt.defaults.exitcodes.EX_OK)
def _set_tcp_keepalive(zmq_socket, opts):
'''
Ensure that TCP keepalives are set as specified in "opts".
Warning: Failure to set TCP keepalives on the salt-master can result in
not detecting the loss of a minion when the connection is lost or when
it's host has been terminated without first closing the socket.
Salt's Presence System depends on this connection status to know if a minion
is "present".
Warning: Failure to set TCP keepalives on minions can result in frequent or
unexpected disconnects!
'''
if hasattr(zmq, 'TCP_KEEPALIVE') and opts:
if 'tcp_keepalive' in opts:
zmq_socket.setsockopt(
zmq.TCP_KEEPALIVE, opts['tcp_keepalive']
)
if 'tcp_keepalive_idle' in opts:
zmq_socket.setsockopt(
zmq.TCP_KEEPALIVE_IDLE, opts['tcp_keepalive_idle']
)
if 'tcp_keepalive_cnt' in opts:
zmq_socket.setsockopt(
zmq.TCP_KEEPALIVE_CNT, opts['tcp_keepalive_cnt']
)
if 'tcp_keepalive_intvl' in opts:
zmq_socket.setsockopt(
zmq.TCP_KEEPALIVE_INTVL, opts['tcp_keepalive_intvl']
)
class ZeroMQPubServerChannel(salt.transport.server.PubServerChannel):
'''
Encapsulate synchronous operations for a publisher channel
@ -670,6 +702,7 @@ class ZeroMQPubServerChannel(salt.transport.server.PubServerChannel):
context = zmq.Context(1)
# Prepare minion publish socket
pub_sock = context.socket(zmq.PUB)
_set_tcp_keepalive(pub_sock, self.opts)
# if 2.1 >= zmq < 3.0, we only have one HWM setting
try:
pub_sock.setsockopt(zmq.HWM, self.opts.get('pub_hwm', 1000))
@ -882,7 +915,7 @@ class AsyncReqMessageClient(object):
zmq.RECONNECT_IVL_MAX, 5000
)
self._set_tcp_keepalive()
_set_tcp_keepalive(self.socket, self.opts)
if self.addr.startswith('tcp://['):
# Hint PF type if bracket enclosed IPv6 address
if hasattr(zmq, 'IPV6'):
@ -893,31 +926,6 @@ class AsyncReqMessageClient(object):
self.socket.connect(self.addr)
self.stream = zmq.eventloop.zmqstream.ZMQStream(self.socket, io_loop=self.io_loop)
def _set_tcp_keepalive(self):
'''
Ensure that TCP keepalives are set for the ReqServer.
Warning: Failure to set TCP keepalives can result in frequent or unexpected
disconnects!
'''
if hasattr(zmq, 'TCP_KEEPALIVE') and self.opts:
if 'tcp_keepalive' in self.opts:
self.socket.setsockopt(
zmq.TCP_KEEPALIVE, self.opts['tcp_keepalive']
)
if 'tcp_keepalive_idle' in self.opts:
self.socket.setsockopt(
zmq.TCP_KEEPALIVE_IDLE, self.opts['tcp_keepalive_idle']
)
if 'tcp_keepalive_cnt' in self.opts:
self.socket.setsockopt(
zmq.TCP_KEEPALIVE_CNT, self.opts['tcp_keepalive_cnt']
)
if 'tcp_keepalive_intvl' in self.opts:
self.socket.setsockopt(
zmq.TCP_KEEPALIVE_INTVL, self.opts['tcp_keepalive_intvl']
)
@tornado.gen.coroutine
def _internal_send_recv(self):
while len(self.send_queue) > 0: