Fix for GitFS failure to unlock lock file, and resource cleanup for process SIGTERM; tests to be done

David Murphy 2024-02-01 13:37:29 -07:00 committed by Daniel Wozniak
parent 1290f93d06
commit d58a6897e6
3 changed files with 89 additions and 12 deletions

changelog/65816.fixed.md Normal file

@@ -0,0 +1 @@
Fix for GitFS failure to unlock lock file, and resource cleanup for process SIGTERM


@@ -247,9 +247,11 @@ class GitProvider:
def _val_cb(x, y):
return str(y)
# DGM try getting machine_identifier
# get machine_identifier
self.mach_id = salt.utils.platform.get_machine_identifier()
log.debug(f"DGM getting machine_id for lock file, machine_id '{self.mach_id}'")
log.debug(f"machine_id for lock file, machine_id '{self.mach_id}'")
self.global_saltenv = salt.utils.data.repack_dictlist(
self.opts.get(f"{self.role}_saltenv", []),
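The machine identifier captured here is what the SIGTERM cleanup further below keys on when deciding whether a leftover lock file belongs to this master. Below is a minimal sketch of how the pid and machine id could be written into a gitfs lock file on separate lines so both can be read back with readline(); the helper name and the write site are assumptions, not part of this commit:

import os

import salt.utils.files
import salt.utils.stringutils


def write_gitfs_lock(lock_file, mach_id):
    # Hedged sketch, not this commit's code. mach_id is assumed to be the
    # value obtained from salt.utils.platform.get_machine_identifier() above.
    with salt.utils.files.fopen(lock_file, "wb") as fh_:
        fh_.write(salt.utils.stringutils.to_bytes(f"{os.getpid()}\n{mach_id}\n"))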
@@ -754,11 +756,8 @@ class GitProvider:
except OSError as exc:
if exc.errno == errno.ENOENT:
# No lock file present
msg = "Attempt to remove lock {} for file ({}) which was not found to exist : {} ".format(
self.url, lock_file, exc
)
msg = f"Attempt to remove lock {self.url} for file ({lock_file}) which was not found to exist, exception : {exc} "
log.debug(msg)
# DGM pass
elif exc.errno == errno.EISDIR:
# Somehow this path is a directory. Should never happen
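As a side note on the unlock path above, the missing-lock-file case can also be expressed with pathlib; a hedged sketch, not part of this commit:

from pathlib import Path


def remove_lock_file(lock_file):
    # missing_ok=True (Python 3.8+) swallows the ENOENT case handled
    # explicitly above; a directory at this path still raises OSError,
    # so the EISDIR branch remains necessary.
    Path(lock_file).unlink(missing_ok=True)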
@@ -1101,20 +1100,21 @@ class GitProvider:
if poll_interval > timeout:
poll_interval = timeout
lock_set = False
lock_set1 = False
lock_set2 = False
try:
time_start = time.time()
while True:
try:
self._lock(lock_type=lock_type, failhard=True)
lock_set = True
lock_set1 = True
# docs state need to yield a single value, lock_set will do
yield lock_set
yield lock_set1
# Break out of this loop once we've yielded the lock, to
# avoid continued attempts to iterate and establish lock
# just ensuring lock_set is true (belts and braces)
lock_set = True
lock_set2 = True
break
except (OSError, GitLockError) as exc:
@@ -1132,7 +1132,9 @@
time.sleep(poll_interval)
continue
finally:
if lock_set:
if lock_set1 or lock_set2:
msg = f"Attempting to remove '{lock_type}' lock for '{self.role}' remote '{self.id}' due to lock_set1 '{lock_set1}' or lock_set2 '{lock_set2}'"
log.debug(msg)
self.clear_lock(lock_type=lock_type)
def init_remote(self):
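The lock_set1/lock_set2 flags above exist so that the finally block only clears the lock when it was actually acquired or yielded. Here is a stripped-down sketch of the same generator-based lock manager pattern, with generic acquire/release callables standing in for self._lock and self.clear_lock (an assumption, not the gitfs API); the commit uses two flags bracketing the yield as belt and braces, while one flag is enough to show why the guard matters:

import time
from contextlib import contextmanager


@contextmanager
def gen_lock(acquire, release, timeout=60, poll_interval=0.5):
    # Minimal sketch of the pattern, assuming generic acquire/release callables.
    acquired = False
    try:
        start = time.time()
        while True:
            try:
                acquire()
                acquired = True
                break
            except OSError:
                if time.time() - start > timeout:
                    raise
                time.sleep(poll_interval)
        # Yield exactly once so the with-block body runs while the lock is held.
        yield True
    finally:
        # Without the flag, a failed acquisition would still clear the lock
        # here, which is the failure mode this hunk guards against.
        if acquired:
            release()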


@@ -14,12 +14,14 @@ import multiprocessing
import multiprocessing.util
import os
import queue
import shutil
import signal
import socket
import subprocess
import sys
import threading
import time
from pathlib import Path
import salt._logging
import salt.defaults.exitcodes
@@ -1071,7 +1073,9 @@ class SignalHandlingProcess(Process):
log.debug(msg)
mach_id = salt.utils.platform.get_machine_identifier()
log.debug(f"DGM exiting for machine identifer '{mach_id}'")
log.debug(
f"exiting for process id '{os.getpid()}' and machine identifer '{mach_id}'"
)
if HAS_PSUTIL:
try:
@@ -1090,8 +1094,78 @@
os.getpid(),
)
# DGM need to go through and clean up any resources left around like lock files
# need to go through and clean up any resources left around like lock files if using gitfs
# example lockfile /var/cache/salt/master/gitfs/work/NlJQs6Pss_07AugikCrmqfmqEFrfPbCDBqGLBiCd3oU=/_/update.lk
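# Expected lock file layout, inferred from the two readline() calls below
# (the writing side lives in gitfs.py and is not shown in this hunk):
#   line 1: pid of the process that created the lock
#   line 2: machine identifier of the master that created the lock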
gitfs_active = self.opts.get("gitfs_remotes", None)
if gitfs_active:
# check for gitfs file locks to ensure no resource leaks
# last chance to clean up any missed unlock droppings
cache_dir = Path(self.opts.get("cachedir", "/var/cache/salt/master")) / "gitfs" / "work"
if cache_dir.exists() and cache_dir.is_dir():
file_list = list(cache_dir.glob("**/*.lk"))
file_del_list = []
try:
for file_name in file_list:
with salt.utils.files.fopen(file_name, "r") as fd_:
try:
file_pid = int(
salt.utils.stringutils.to_unicode(
fd_.readline()
).rstrip()
)
except ValueError:
# Lock file is empty, set pid to 0 so it evaluates as False.
file_pid = 0
try:
file_mach_id = int(
salt.utils.stringutils.to_unicode(
fd_.readline()
).rstrip()
)
except ValueError:
# Lock file is empty, set mach_id to 0 so it evaluates as False.
file_mach_id = 0
if cur_pid == file_pid:
if mach_id != file_mach_id:
if not file_mach_id:
msg = f"gitfs lock file for pid '{file_pid}' does not contain a machine id, deleting lock file which may affect if using multi-master with shared gitfs cache, the lock may have been obtained by another master recommend updating Salt version on other masters to a version which insert machine identification in lock a file."
log.debug(msg)
file_del_list.append(
(file_name, file_pid, file_mach_id)
)
else:
file_del_list.append(
(file_name, file_pid, file_mach_id)
)
except FileNotFoundError:
log.debug("gitfs lock file: %s not found", file_name)
for (file_name, file_pid, file_mach_id) in file_del_list:
try:
os.remove(file_name)
except OSError as exc:
if exc.errno == errno.ENOENT:
# No lock file present
msg = f"SIGTERM clean up of resources attempted to remove lock file {file_name}, pid '{file_pid}', machine identifier '{mach_id}' but it did not exist, exception : {exc} "
log.debug(msg)
elif exc.errno == errno.EISDIR:
# Somehow this path is a directory. Should never happen
# unless some wiseguy manually creates a directory at this
# path, but just in case, handle it.
try:
shutil.rmtree(file_name)
except OSError as exc:
msg = f"SIGTERM clean up of resources, lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}' was a directory, removed directory, exception : '{exc}'"
log.debug(msg)
else:
msg = f"SIGTERM clean up of resources, unable to remove lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}', exception : '{exc}'"
log.debug(msg)
else:
msg = f"SIGTERM clean up of resources, removed lock file '{file_name}', pid '{file_pid}', machine identifier '{file_mach_id}'"
log.debug(msg)
except psutil.NoSuchProcess:
log.warning(