Mirror of https://github.com/saltstack/salt.git (synced 2025-04-17 10:10:20 +00:00)
Minions check DNS when re-connecting to a master
Check for a changing DNS record any time a minion gets disconnected from its master. See GitHub issues #63654 and #61482.
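In the diff below, the reconnect logic (tearing down the pub/req channels and calling self.eval_master()) now runs whenever the minion loses its master, not only when master_type is "failover", which is what forces a fresh DNS lookup of the configured master on every reconnect. As a rough standalone sketch of that idea only, not the Salt implementation, the following re-resolves the master name before each connection attempt; resolve_master, reconnect, and the master_ip/master_port keys are hypothetical names used for illustration.

# Standalone sketch of "re-resolve DNS before reconnecting"; not Salt's code.
# resolve_master(), reconnect(), and the master_ip/master_port keys are
# illustrative assumptions, not real Salt APIs.
import logging
import socket

log = logging.getLogger(__name__)


def resolve_master(hostname):
    """Ask the resolver for the master's address on every call (no caching)."""
    return socket.gethostbyname(hostname)


def reconnect(opts, connect):
    """Re-resolve the configured master and hand the fresh address to connect()."""
    addr = resolve_master(opts["master"])
    if addr != opts.get("master_ip"):
        log.info("Master %s now resolves to %s", opts["master"], addr)
        opts["master_ip"] = addr
    return connect(addr, opts.get("master_port", 4506))

A caller would invoke reconnect(opts, some_connect_callable) each time the connection drops, so a master whose DNS record has changed is picked up automatically instead of retrying a stale IP.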
This commit is contained in:
parent b1754eeb63
commit d2c59f45b6
1 changed file with 75 additions and 105 deletions
salt/minion.py (180 changed lines)
@@ -2826,9 +2826,60 @@ class Minion(MinionBase):
                 # we are not connected anymore
                 self.connected = False
                 log.info("Connection to master %s lost", self.opts["master"])
+                if self.opts["transport"] != "tcp":
+                    self.schedule.delete_job(name=master_event(type="alive"))

-                if self.opts["master_type"] != "failover":
-                    # modify the scheduled job to fire on reconnect
+                log.info("Trying to tune in to next master from master-list")
+
+                if hasattr(self, "pub_channel"):
+                    self.pub_channel.on_recv(None)
+                    if hasattr(self.pub_channel, "auth"):
+                        self.pub_channel.auth.invalidate()
+                    if hasattr(self.pub_channel, "close"):
+                        self.pub_channel.close()
+                if hasattr(self, "req_channel") and self.req_channel:
+                    self.req_channel.close()
+                    self.req_channel = None
+
+                # if eval_master finds a new master for us, self.connected
+                # will be True again on successful master authentication
+                try:
+                    master, self.pub_channel = yield self.eval_master(
+                        opts=self.opts,
+                        failed=True,
+                        failback=tag.startswith(master_event(type="failback")),
+                    )
+                except SaltClientError:
+                    pass
+
+                if self.connected:
+                    self.opts["master"] = master
+
+                    # re-init the subsystems to work with the new master
+                    log.info(
+                        "Re-initialising subsystems for new master %s",
+                        self.opts["master"],
+                    )
+
+                    self.req_channel = salt.channel.client.AsyncReqChannel.factory(
+                        self.opts, io_loop=self.io_loop
+                    )
+
+                    # put the current schedule into the new loaders
+                    self.opts["schedule"] = self.schedule.option("schedule")
+                    (
+                        self.functions,
+                        self.returners,
+                        self.function_errors,
+                        self.executors,
+                    ) = self._load_modules()
+                    # make the schedule to use the new 'functions' loader
+                    self.schedule.functions = self.functions
+                    self.pub_channel.on_recv(self._handle_payload)
+                    self._fire_master_minion_start()
+                    log.info("Minion is ready to receive requests!")
+
+                    # update scheduled job to run with the new master addr
                     if self.opts["transport"] != "tcp":
                         schedule = {
                             "function": "status.master",
@@ -2838,116 +2889,35 @@ class Minion(MinionBase):
                             "return_job": False,
                             "kwargs": {
                                 "master": self.opts["master"],
-                                "connected": False,
+                                "connected": True,
                             },
                         }
                         self.schedule.modify_job(
                             name=master_event(type="alive", master=self.opts["master"]),
                             schedule=schedule,
                         )
+
+                        if self.opts["master_failback"] and "master_list" in self.opts:
+                            if self.opts["master"] != self.opts["master_list"][0]:
+                                schedule = {
+                                    "function": "status.ping_master",
+                                    "seconds": self.opts["master_failback_interval"],
+                                    "jid_include": True,
+                                    "maxrunning": 1,
+                                    "return_job": False,
+                                    "kwargs": {"master": self.opts["master_list"][0]},
+                                }
+                                self.schedule.modify_job(
+                                    name=master_event(type="failback"),
+                                    schedule=schedule,
+                                )
+                            else:
+                                self.schedule.delete_job(
+                                    name=master_event(type="failback"), persist=True
+                                )
                 else:
-                    # delete the scheduled job to don't interfere with the failover process
-                    if self.opts["transport"] != "tcp":
-                        self.schedule.delete_job(name=master_event(type="alive"))
-
-                    log.info("Trying to tune in to next master from master-list")
-
-                    if hasattr(self, "pub_channel"):
-                        self.pub_channel.on_recv(None)
-                        if hasattr(self.pub_channel, "auth"):
-                            self.pub_channel.auth.invalidate()
-                        if hasattr(self.pub_channel, "close"):
-                            self.pub_channel.close()
-                    if hasattr(self, "req_channel") and self.req_channel:
-                        self.req_channel.close()
-                        self.req_channel = None
-
-                    # if eval_master finds a new master for us, self.connected
-                    # will be True again on successful master authentication
-                    try:
-                        master, self.pub_channel = yield self.eval_master(
-                            opts=self.opts,
-                            failed=True,
-                            failback=tag.startswith(master_event(type="failback")),
-                        )
-                    except SaltClientError:
-                        pass
-
-                    if self.connected:
-                        self.opts["master"] = master
-
-                        # re-init the subsystems to work with the new master
-                        log.info(
-                            "Re-initialising subsystems for new master %s",
-                            self.opts["master"],
-                        )
-
-                        self.req_channel = salt.channel.client.AsyncReqChannel.factory(
-                            self.opts, io_loop=self.io_loop
-                        )
-
-                        # put the current schedule into the new loaders
-                        self.opts["schedule"] = self.schedule.option("schedule")
-                        (
-                            self.functions,
-                            self.returners,
-                            self.function_errors,
-                            self.executors,
-                        ) = self._load_modules()
-                        # make the schedule to use the new 'functions' loader
-                        self.schedule.functions = self.functions
-                        self.pub_channel.on_recv(self._handle_payload)
-                        self._fire_master_minion_start()
-                        log.info("Minion is ready to receive requests!")
-
-                        # update scheduled job to run with the new master addr
-                        if self.opts["transport"] != "tcp":
-                            schedule = {
-                                "function": "status.master",
-                                "seconds": self.opts["master_alive_interval"],
-                                "jid_include": True,
-                                "maxrunning": 1,
-                                "return_job": False,
-                                "kwargs": {
-                                    "master": self.opts["master"],
-                                    "connected": True,
-                                },
-                            }
-                            self.schedule.modify_job(
-                                name=master_event(
-                                    type="alive", master=self.opts["master"]
-                                ),
-                                schedule=schedule,
-                            )
-
-                            if (
-                                self.opts["master_failback"]
-                                and "master_list" in self.opts
-                            ):
-                                if self.opts["master"] != self.opts["master_list"][0]:
-                                    schedule = {
-                                        "function": "status.ping_master",
-                                        "seconds": self.opts[
-                                            "master_failback_interval"
-                                        ],
-                                        "jid_include": True,
-                                        "maxrunning": 1,
-                                        "return_job": False,
-                                        "kwargs": {
-                                            "master": self.opts["master_list"][0]
-                                        },
-                                    }
-                                    self.schedule.modify_job(
-                                        name=master_event(type="failback"),
-                                        schedule=schedule,
-                                    )
-                                else:
-                                    self.schedule.delete_job(
-                                        name=master_event(type="failback"), persist=True
-                                    )
-                    else:
-                        self.restart = True
-                        self.io_loop.stop()
+                    self.restart = True
+                    self.io_loop.stop()

         elif tag.startswith(master_event(type="connected")):
             # handle this event only once. otherwise it will pollute the log