Fix for GitFS failure to unlock lock file, and resource cleanup for process SIGTERM, tests to be done

This commit is contained in:
David Murphy 2024-02-01 13:37:29 -07:00 committed by Daniel Wozniak
parent 1290f93d06
commit d58a6897e6
3 changed files with 89 additions and 12 deletions

1
changelog/65816.fixed.md Normal file
View file

@ -0,0 +1 @@
Fix for GitFS failure to unlock lock file, and resource cleanup for process SIGTERM

View file

@ -247,9 +247,11 @@ class GitProvider:
def _val_cb(x, y): def _val_cb(x, y):
return str(y) return str(y)
# DGM try getting machine_identifier # DGM try getting machine_identifier
# get machine_identifier
self.mach_id = salt.utils.platform.get_machine_identifier() self.mach_id = salt.utils.platform.get_machine_identifier()
log.debug(f"DGM getting machine_id for lock file, machine_id '{self.mach_id}'") log.debug(f"machine_id for lock file, machine_id '{self.mach_id}'")
self.global_saltenv = salt.utils.data.repack_dictlist( self.global_saltenv = salt.utils.data.repack_dictlist(
self.opts.get(f"{self.role}_saltenv", []), self.opts.get(f"{self.role}_saltenv", []),
@ -754,11 +756,8 @@ class GitProvider:
except OSError as exc: except OSError as exc:
if exc.errno == errno.ENOENT: if exc.errno == errno.ENOENT:
# No lock file present # No lock file present
msg = "Attempt to remove lock {} for file ({}) which was not found to exist : {} ".format( msg = f"Attempt to remove lock {self.url} for file ({lock_file}) which was not found to exist, exception : {exc} "
self.url, lock_file, exc
)
log.debug(msg) log.debug(msg)
# DGM pass
elif exc.errno == errno.EISDIR: elif exc.errno == errno.EISDIR:
# Somehow this path is a directory. Should never happen # Somehow this path is a directory. Should never happen
@ -1101,20 +1100,21 @@ class GitProvider:
if poll_interval > timeout: if poll_interval > timeout:
poll_interval = timeout poll_interval = timeout
lock_set = False lock_set1 = False
lock_set2 = False
try: try:
time_start = time.time() time_start = time.time()
while True: while True:
try: try:
self._lock(lock_type=lock_type, failhard=True) self._lock(lock_type=lock_type, failhard=True)
lock_set = True lock_set1 = True
# docs state need to yield a single value, lock_set will do # docs state need to yield a single value, lock_set will do
yield lock_set yield lock_set1
# Break out of his loop once we've yielded the lock, to # Break out of his loop once we've yielded the lock, to
# avoid continued attempts to iterate and establish lock # avoid continued attempts to iterate and establish lock
# just ensuring lock_set is true (belts and braces) # just ensuring lock_set is true (belts and braces)
lock_set = True lock_set2 = True
break break
except (OSError, GitLockError) as exc: except (OSError, GitLockError) as exc:
@ -1132,7 +1132,9 @@ class GitProvider:
time.sleep(poll_interval) time.sleep(poll_interval)
continue continue
finally: finally:
if lock_set: if lock_set1 or lock_set2:
msg = f"Attempting to remove '{lock_type}' lock for '{self.role}' remote '{self.id}' due to lock_set1 '{lock_set1}' or lock_set2 '{lock_set2}'"
log.debug(msg)
self.clear_lock(lock_type=lock_type) self.clear_lock(lock_type=lock_type)
def init_remote(self): def init_remote(self):

View file

@ -14,12 +14,14 @@ import multiprocessing
import multiprocessing.util import multiprocessing.util
import os import os
import queue import queue
import shutil
import signal import signal
import socket import socket
import subprocess import subprocess
import sys import sys
import threading import threading
import time import time
from pathlib import Path
import salt._logging import salt._logging
import salt.defaults.exitcodes import salt.defaults.exitcodes
@ -1071,7 +1073,9 @@ class SignalHandlingProcess(Process):
log.debug(msg) log.debug(msg)
mach_id = salt.utils.platform.get_machine_identifier() mach_id = salt.utils.platform.get_machine_identifier()
log.debug(f"DGM exiting for machine identifer '{mach_id}'") log.debug(
f"exiting for process id '{os.getpid()}' and machine identifer '{mach_id}'"
)
if HAS_PSUTIL: if HAS_PSUTIL:
try: try:
@ -1090,8 +1094,78 @@ class SignalHandlingProcess(Process):
os.getpid(), os.getpid(),
) )
# DGM need to go through and clean up any resources left around like lock files # need to go through and clean up any resources left around like lock files if using gitfs
# example lockfile /var/cache/salt/master/gitfs/work/NlJQs6Pss_07AugikCrmqfmqEFrfPbCDBqGLBiCd3oU=/_/update.lk # example lockfile /var/cache/salt/master/gitfs/work/NlJQs6Pss_07AugikCrmqfmqEFrfPbCDBqGLBiCd3oU=/_/update.lk
gitfs_active = self.opts.get("gitfs_remotes", None)
if gitfs_active:
# check for gitfs file locks to ensure no resource leaks
# last chance to clean up any missed unlock droppings
cache_dir = Path("/var/cache/salt/master/gitfs/work")
if cache_dir.exists and cache_dir.is_dir():
file_list = list(cache_dir.glob("**/*.lk"))
file_del_list = []
try:
for file_name in file_list:
with salt.utils.files.fopen(file_name, "r") as fd_:
try:
file_pid = int(
salt.utils.stringutils.to_unicode(
fd_.readline()
).rstrip()
)
except ValueError:
# Lock file is empty, set pid to 0 so it evaluates as False.
file_pid = 0
try:
file_mach_id = int(
salt.utils.stringutils.to_unicode(
fd_.readline()
).rstrip()
)
except ValueError:
# Lock file is empty, set mach_id to 0 so it evaluates as False.
file_mach_id = 0
if cur_pid == file_pid:
if mach_id != file_mach_id:
if not file_mach_id:
msg = f"gitfs lock file for pid '{file_pid}' does not contain a machine id, deleting lock file which may affect if using multi-master with shared gitfs cache, the lock may have been obtained by another master recommend updating Salt version on other masters to a version which insert machine identification in lock a file."
log.debug(msg)
file_del_list.append(
(file_name, file_pid, file_mach_id)
)
else:
file_del_list.append(
(file_name, file_pid, file_mach_id)
)
except FileNotFoundError:
log.debug("gitfs lock file: %s not found", file_name)
for (file_name, file_pid, file_mach_id) in file_del_list:
try:
os.remove(file_name)
except OSError as exc:
if exc.errno == errno.ENOENT:
# No lock file present
msg = f"SIGTERM clean up of resources attempted to remove lock file {file_name}, pid '{file_pid}', machine identifier '{mach_id}' but it did not exist, exception : {exc} "
log.debug(msg)
elif exc.errno == errno.EISDIR:
# Somehow this path is a directory. Should never happen
# unless some wiseguy manually creates a directory at this
# path, but just in case, handle it.
try:
shutil.rmtree(file_name)
except OSError as exc:
msg = f"SIGTERM clean up of resources, lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}' was a directory, removed directory, exception : '{exc}'"
log.debug(msg)
else:
msg = f"SIGTERM clean up of resources, unable to remove lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}', exception : '{exc}'"
log.debug(msg)
else:
msg = f"SIGTERM clean up of resources, removed lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}'"
log.debug(msg)
except psutil.NoSuchProcess: except psutil.NoSuchProcess:
log.warning( log.warning(