Merge pull request #61356 from cmcmarrow/fix_job_hard_cache_corruption

avoids crash on cache corruption
This commit is contained in:
Gareth J. Greenaway 2022-10-10 17:44:08 -07:00 committed by GitHub
commit 810945d5be
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 6 deletions

1
changelog/60170.fixed Normal file
View file

@ -0,0 +1 @@
Fixed a startup error caused by a corrupted job cache directory

View file

@ -392,6 +392,20 @@ def get_jids_filter(count, filter_find_job=True):
return ret
def _remove_job_dir(job_path):
    """
    Try to remove a job directory.

    Removal can fail, for example with ``NotADirectoryError`` when the cache
    entry is corrupted. Such failures are logged and reported through the
    return value instead of being raised.

    :param job_path: Path to the job directory to remove
    :return: ``True`` if the directory was removed, ``False`` if removal failed
    """
    try:
        shutil.rmtree(job_path)
    except OSError as err:
        # NotADirectoryError is a subclass of OSError, so this single clause
        # covers it as well; no need to list it separately.
        log.error("Unable to remove %s: %s", job_path, err)
        return False
    return True
def clean_old_jobs():
"""
Clean out the old jobs from the job cache
@ -423,7 +437,7 @@ def clean_old_jobs():
if not os.path.isfile(jid_file) and os.path.exists(f_path):
# No jid file means corrupted cache entry, scrub it
# by removing the entire f_path directory
shutil.rmtree(f_path)
_remove_job_dir(f_path)
elif os.path.isfile(jid_file):
jid_ctime = os.stat(jid_file).st_ctime
hours_difference = (time.time() - jid_ctime) / 3600.0
@ -431,10 +445,7 @@ def clean_old_jobs():
t_path
):
# Remove the entire f_path from the original JID dir
try:
shutil.rmtree(f_path)
except OSError as err:
log.error("Unable to remove %s: %s", f_path, err)
_remove_job_dir(f_path)
# Remove empty JID dirs from job cache, if they're old enough.
# JID dirs may be empty either from a previous cache-clean with the bug
@ -447,7 +458,7 @@ def clean_old_jobs():
t_path_ctime = os.stat(t_path).st_ctime
hours_difference = (time.time() - t_path_ctime) / 3600.0
if hours_difference > __opts__["keep_jobs"]:
shutil.rmtree(t_path)
_remove_job_dir(t_path)
def update_endtime(jid, time):

View file

@ -0,0 +1,18 @@
import pytest
from salt.returners.local_cache import _remove_job_dir
from tests.support.mock import patch
@pytest.mark.parametrize("e", (NotADirectoryError, OSError))
def test_remove_job_dir(e):
    """
    Check which ``shutil.rmtree`` failures ``_remove_job_dir`` absorbs.

    :param e: Exception class raised by the patched ``shutil.rmtree``
    """
    # An OSError (or subclass) raised by rmtree must be caught and turned
    # into a falsy return value.
    with patch("shutil.rmtree", side_effect=e("Node Corruption!")):
        assert not _remove_job_dir("cache")

    # Errors outside the OSError hierarchy must propagate to the caller.
    # FileExistsError is unsuitable here: it is itself an OSError subclass,
    # so it would be caught and this check would verify nothing.
    with patch("shutil.rmtree", side_effect=RuntimeError("Unexpected failure")):
        with pytest.raises(RuntimeError):
            _remove_job_dir("cache")