diff --git a/changelog/65816.fixed.md b/changelog/65816.fixed.md new file mode 100644 index 00000000000..23aaa1e5e8e --- /dev/null +++ b/changelog/65816.fixed.md @@ -0,0 +1 @@ +Fix for GitFS failure to unlock lock file, and resource cleanup for process SIGTERM diff --git a/salt/utils/gitfs.py b/salt/utils/gitfs.py index eae050eaf4d..6d67f7c241d 100644 --- a/salt/utils/gitfs.py +++ b/salt/utils/gitfs.py @@ -247,9 +247,11 @@ class GitProvider: def _val_cb(x, y): return str(y) + # DGM try getting machine_identifier + # get machine_identifier self.mach_id = salt.utils.platform.get_machine_identifier() - log.debug(f"DGM getting machine_id for lock file, machine_id '{self.mach_id}'") + log.debug(f"machine_id for lock file, machine_id '{self.mach_id}'") self.global_saltenv = salt.utils.data.repack_dictlist( self.opts.get(f"{self.role}_saltenv", []), @@ -754,11 +756,8 @@ class GitProvider: except OSError as exc: if exc.errno == errno.ENOENT: # No lock file present - msg = "Attempt to remove lock {} for file ({}) which was not found to exist : {} ".format( - self.url, lock_file, exc - ) + msg = f"Attempt to remove lock {self.url} for file ({lock_file}) which was not found to exist, exception : {exc} " log.debug(msg) - # DGM pass elif exc.errno == errno.EISDIR: # Somehow this path is a directory. Should never happen @@ -1101,20 +1100,21 @@ class GitProvider: if poll_interval > timeout: poll_interval = timeout - lock_set = False + lock_set1 = False + lock_set2 = False try: time_start = time.time() while True: try: self._lock(lock_type=lock_type, failhard=True) - lock_set = True + lock_set1 = True # docs state need to yield a single value, lock_set will do - yield lock_set + yield lock_set1 # Break out of his loop once we've yielded the lock, to # avoid continued attempts to iterate and establish lock # just ensuring lock_set is true (belts and braces) - lock_set = True + lock_set2 = True break except (OSError, GitLockError) as exc: @@ -1132,7 +1132,9 @@ class GitProvider: time.sleep(poll_interval) continue finally: - if lock_set: + if lock_set1 or lock_set2: + msg = f"Attempting to remove '{lock_type}' lock for '{self.role}' remote '{self.id}' due to lock_set1 '{lock_set1}' or lock_set2 '{lock_set2}'" + log.debug(msg) self.clear_lock(lock_type=lock_type) def init_remote(self): diff --git a/salt/utils/process.py b/salt/utils/process.py index 964b91e08d2..56e75d55ccf 100644 --- a/salt/utils/process.py +++ b/salt/utils/process.py @@ -14,12 +14,14 @@ import multiprocessing import multiprocessing.util import os import queue +import shutil import signal import socket import subprocess import sys import threading import time +from pathlib import Path import salt._logging import salt.defaults.exitcodes @@ -1071,7 +1073,9 @@ class SignalHandlingProcess(Process): log.debug(msg) mach_id = salt.utils.platform.get_machine_identifier() - log.debug(f"DGM exiting for machine identifer '{mach_id}'") + log.debug( + f"exiting for process id '{os.getpid()}' and machine identifer '{mach_id}'" + ) if HAS_PSUTIL: try: @@ -1090,8 +1094,78 @@ class SignalHandlingProcess(Process): os.getpid(), ) - # DGM need to go through and clean up any resources left around like lock files + # need to go through and clean up any resources left around like lock files if using gitfs # example lockfile /var/cache/salt/master/gitfs/work/NlJQs6Pss_07AugikCrmqfmqEFrfPbCDBqGLBiCd3oU=/_/update.lk + gitfs_active = self.opts.get("gitfs_remotes", None) + if gitfs_active: + # check for gitfs file locks to ensure no resource leaks + # last chance to clean up any missed unlock droppings + cache_dir = Path("/var/cache/salt/master/gitfs/work") + if cache_dir.exists and cache_dir.is_dir(): + file_list = list(cache_dir.glob("**/*.lk")) + file_del_list = [] + + try: + for file_name in file_list: + with salt.utils.files.fopen(file_name, "r") as fd_: + try: + file_pid = int( + salt.utils.stringutils.to_unicode( + fd_.readline() + ).rstrip() + ) + except ValueError: + # Lock file is empty, set pid to 0 so it evaluates as False. + file_pid = 0 + try: + file_mach_id = int( + salt.utils.stringutils.to_unicode( + fd_.readline() + ).rstrip() + ) + except ValueError: + # Lock file is empty, set mach_id to 0 so it evaluates as False. + file_mach_id = 0 + if cur_pid == file_pid: + if mach_id != file_mach_id: + if not file_mach_id: + msg = f"gitfs lock file for pid '{file_pid}' does not contain a machine id, deleting lock file which may affect if using multi-master with shared gitfs cache, the lock may have been obtained by another master recommend updating Salt version on other masters to a version which insert machine identification in lock a file." + log.debug(msg) + file_del_list.append( + (file_name, file_pid, file_mach_id) + ) + else: + file_del_list.append( + (file_name, file_pid, file_mach_id) + ) + + except FileNotFoundError: + log.debug("gitfs lock file: %s not found", file_name) + + for (file_name, file_pid, file_mach_id) in file_del_list: + try: + os.remove(file_name) + except OSError as exc: + if exc.errno == errno.ENOENT: + # No lock file present + msg = f"SIGTERM clean up of resources attempted to remove lock file {file_name}, pid '{file_pid}', machine identifier '{mach_id}' but it did not exist, exception : {exc} " + log.debug(msg) + + elif exc.errno == errno.EISDIR: + # Somehow this path is a directory. Should never happen + # unless some wiseguy manually creates a directory at this + # path, but just in case, handle it. + try: + shutil.rmtree(file_name) + except OSError as exc: + msg = f"SIGTERM clean up of resources, lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}' was a directory, removed directory, exception : '{exc}'" + log.debug(msg) + else: + msg = f"SIGTERM clean up of resources, unable to remove lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}', exception : '{exc}'" + log.debug(msg) + else: + msg = f"SIGTERM clean up of resources, removed lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}'" + log.debug(msg) except psutil.NoSuchProcess: log.warning(