mirror of
https://github.com/saltstack/salt.git
synced 2025-04-16 09:40:20 +00:00
WIP - Handling GitFS locking issue and resource loss due to SIGTERM not clearing up
This commit is contained in:
parent
99e158f429
commit
1290f93d06
3 changed files with 91 additions and 10 deletions
|
@ -247,6 +247,9 @@ class GitProvider:
|
|||
|
||||
def _val_cb(x, y):
|
||||
return str(y)
|
||||
# DGM try getting machine_identifier
|
||||
self.mach_id = salt.utils.platform.get_machine_identifier()
|
||||
log.debug(f"DGM getting machine_id for lock file, machine_id '{self.mach_id}'")
|
||||
|
||||
self.global_saltenv = salt.utils.data.repack_dictlist(
|
||||
self.opts.get(f"{self.role}_saltenv", []),
|
||||
|
@ -751,7 +754,12 @@ class GitProvider:
|
|||
except OSError as exc:
|
||||
if exc.errno == errno.ENOENT:
|
||||
# No lock file present
|
||||
pass
|
||||
msg = "Attempt to remove lock {} for file ({}) which was not found to exist : {} ".format(
|
||||
self.url, lock_file, exc
|
||||
)
|
||||
log.debug(msg)
|
||||
# DGM pass
|
||||
|
||||
elif exc.errno == errno.EISDIR:
|
||||
# Somehow this path is a directory. Should never happen
|
||||
# unless some wiseguy manually creates a directory at this
|
||||
|
@ -903,6 +911,16 @@ class GitProvider:
|
|||
self._get_lock_file(lock_type="update"),
|
||||
self.role,
|
||||
)
|
||||
else:
|
||||
log.warning(
|
||||
"Update lock file generated an unexpected exception for %s remote '%s', "
|
||||
"The lock file %s for %s type=update operation, exception: %s .",
|
||||
self.role,
|
||||
self.id,
|
||||
self._get_lock_file(lock_type="update"),
|
||||
self.role,
|
||||
str(exc),
|
||||
)
|
||||
return False
|
||||
|
||||
def _lock(self, lock_type="update", failhard=False):
|
||||
|
@ -930,6 +948,9 @@ class GitProvider:
|
|||
with os.fdopen(fh_, "wb"):
|
||||
# Write the lock file and close the filehandle
|
||||
os.write(fh_, salt.utils.stringutils.to_bytes(str(os.getpid())))
|
||||
os.write(fh_, salt.utils.stringutils.to_bytes("\n"))
|
||||
os.write(fh_, salt.utils.stringutils.to_bytes(str(self.mach_id)))
|
||||
|
||||
except OSError as exc:
|
||||
if exc.errno == errno.EEXIST:
|
||||
with salt.utils.files.fopen(self._get_lock_file(lock_type), "r") as fd_:
|
||||
|
@ -937,44 +958,61 @@ class GitProvider:
|
|||
pid = int(
|
||||
salt.utils.stringutils.to_unicode(fd_.readline()).rstrip()
|
||||
)
|
||||
mach_id = int(
|
||||
salt.utils.stringutils.to_unicode(fd_.readline()).rstrip()
|
||||
)
|
||||
except ValueError:
|
||||
# Lock file is empty, set pid to 0 so it evaluates as
|
||||
# False.
|
||||
pid = 0
|
||||
mach_id = 0
|
||||
global_lock_key = self.role + "_global_lock"
|
||||
lock_file = self._get_lock_file(lock_type=lock_type)
|
||||
if self.opts[global_lock_key]:
|
||||
msg = (
|
||||
"{} is enabled and {} lockfile {} is present for "
|
||||
"{} remote '{}'.".format(
|
||||
"{} remote '{}' on machine_id {}.".format(
|
||||
global_lock_key,
|
||||
lock_type,
|
||||
lock_file,
|
||||
self.role,
|
||||
self.id,
|
||||
self.mach_id,
|
||||
)
|
||||
)
|
||||
if pid:
|
||||
msg += f" Process {pid} obtained the lock"
|
||||
if self.mach_id or mach_id:
|
||||
msg += " Process {} obtained the lock for machine_id {}, current machine_id {}".format(
|
||||
pid, mach_id, self.mach_id
|
||||
)
|
||||
else:
|
||||
msg += " Process {} obtained the lock".format(pid)
|
||||
|
||||
if not pid_exists(pid):
|
||||
msg += (
|
||||
" but this process is not running. The "
|
||||
"update may have been interrupted. If "
|
||||
"using multi-master with shared gitfs "
|
||||
"cache, the lock may have been obtained "
|
||||
"by another master."
|
||||
"by another master"
|
||||
)
|
||||
if self.mach_id != mach_id:
|
||||
msg += ", with machine_id {}".format(mach_id)
|
||||
else:
|
||||
msg += "."
|
||||
log.warning(msg)
|
||||
if failhard:
|
||||
raise
|
||||
return
|
||||
elif pid and pid_exists(pid):
|
||||
log.warning(
|
||||
"Process %d has a %s %s lock (%s)",
|
||||
"Process %d has a %s %s lock (%s) on machine_id %s",
|
||||
pid,
|
||||
self.role,
|
||||
lock_type,
|
||||
lock_file,
|
||||
self.mach_id,
|
||||
)
|
||||
if failhard:
|
||||
raise
|
||||
|
@ -982,12 +1020,13 @@ class GitProvider:
|
|||
else:
|
||||
if pid:
|
||||
log.warning(
|
||||
"Process %d has a %s %s lock (%s), but this "
|
||||
"Process %d has a %s %s lock (%s) on machine_id %s, but this "
|
||||
"process is not running. Cleaning up lock file.",
|
||||
pid,
|
||||
self.role,
|
||||
lock_type,
|
||||
lock_file,
|
||||
self.mach_id,
|
||||
)
|
||||
success, fail = self._clear_lock()
|
||||
if success:
|
||||
|
@ -996,12 +1035,17 @@ class GitProvider:
|
|||
raise
|
||||
return
|
||||
else:
|
||||
msg = "Unable to set {} lock for {} ({}): {} ".format(
|
||||
lock_type, self.id, self._get_lock_file(lock_type), exc
|
||||
msg = "Unable to set {} lock for {} ({}) on machine_id {}: {} ".format(
|
||||
lock_type,
|
||||
self.id,
|
||||
self._get_lock_file(lock_type),
|
||||
self.mach_id,
|
||||
exc,
|
||||
)
|
||||
log.error(msg, exc_info=True)
|
||||
raise GitLockError(exc.errno, msg)
|
||||
msg = f"Set {lock_type} lock for {self.role} remote '{self.id}'"
|
||||
|
||||
msg = f"Set {lock_type} lock for {self.role} remote {self.id} on machine_id '{self.mach_id}'"
|
||||
log.debug(msg)
|
||||
return msg
|
||||
|
||||
|
@ -1018,6 +1062,15 @@ class GitProvider:
|
|||
try:
|
||||
result = self._lock(lock_type="update")
|
||||
except GitLockError as exc:
|
||||
log.warning(
|
||||
"Update lock file generated an unexpected exception for %s remote '%s', "
|
||||
"The lock file %s for %s type=update operation, exception: %s .",
|
||||
self.role,
|
||||
self.id,
|
||||
self._get_lock_file(lock_type="update"),
|
||||
self.role,
|
||||
str(exc),
|
||||
)
|
||||
failed.append(exc.strerror)
|
||||
else:
|
||||
if result is not None:
|
||||
|
@ -1055,10 +1108,15 @@ class GitProvider:
|
|||
try:
|
||||
self._lock(lock_type=lock_type, failhard=True)
|
||||
lock_set = True
|
||||
yield
|
||||
# docs state need to yield a single value, lock_set will do
|
||||
yield lock_set
|
||||
|
||||
# Break out of his loop once we've yielded the lock, to
|
||||
# avoid continued attempts to iterate and establish lock
|
||||
# just ensuring lock_set is true (belts and braces)
|
||||
lock_set = True
|
||||
break
|
||||
|
||||
except (OSError, GitLockError) as exc:
|
||||
if not timeout or time.time() - time_start > timeout:
|
||||
raise GitLockError(exc.errno, exc.strerror)
|
||||
|
|
|
@ -12,6 +12,7 @@ import sys
|
|||
import distro
|
||||
|
||||
from salt.utils.decorators import memoize as real_memoize
|
||||
from salt.utils.files import fopen as _fopen
|
||||
|
||||
|
||||
def linux_distribution(full_distribution_name=True):
|
||||
|
@ -239,3 +240,16 @@ def spawning_platform():
|
|||
Salt, however, will force macOS to spawning by default on all python versions
|
||||
"""
|
||||
return multiprocessing.get_start_method(allow_none=False) == "spawn"
|
||||
|
||||
|
||||
def get_machine_identifier():
|
||||
"""
|
||||
Provide the machine-identifier for machine/virtualization combination
|
||||
"""
|
||||
locations = ["/etc/machine-id", "/var/lib/dbus/machine-id"]
|
||||
existing_locations = [loc for loc in locations if os.path.exists(loc)]
|
||||
if not existing_locations:
|
||||
return ""
|
||||
else:
|
||||
with _fopen(existing_locations[0]) as machineid:
|
||||
return machineid.read().strip()
|
||||
|
|
|
@ -1069,9 +1069,14 @@ class SignalHandlingProcess(Process):
|
|||
msg += "SIGTERM"
|
||||
msg += ". Exiting"
|
||||
log.debug(msg)
|
||||
|
||||
mach_id = salt.utils.platform.get_machine_identifier()
|
||||
log.debug(f"DGM exiting for machine identifer '{mach_id}'")
|
||||
|
||||
if HAS_PSUTIL:
|
||||
try:
|
||||
process = psutil.Process(os.getpid())
|
||||
cur_pid = os.getpid()
|
||||
process = psutil.Process(cur_pid)
|
||||
if hasattr(process, "children"):
|
||||
for child in process.children(recursive=True):
|
||||
try:
|
||||
|
@ -1084,6 +1089,10 @@ class SignalHandlingProcess(Process):
|
|||
self.pid,
|
||||
os.getpid(),
|
||||
)
|
||||
|
||||
# DGM need to go through and clean up any resources left around like lock files
|
||||
# example lockfile /var/cache/salt/master/gitfs/work/NlJQs6Pss_07AugikCrmqfmqEFrfPbCDBqGLBiCd3oU=/_/update.lk
|
||||
|
||||
except psutil.NoSuchProcess:
|
||||
log.warning(
|
||||
"Unable to kill children of process %d, it does not exist."
|
||||
|
|
Loading…
Add table
Reference in a new issue