s3fs: delete files from cache when deleted from bucket

Skyler Hawthorne 2023-11-25 07:51:52 -05:00 committed by Daniel Wozniak
parent fcb38a54d5
commit e005c26fff
3 changed files with 123 additions and 16 deletions

changelog/65611.fixed.md (new file)

@@ -0,0 +1,7 @@
When using s3fs, files deleted from the bucket were not deleted from the
master or minion local cache, which could lead to unexpected file copies or
even unexpected state applications. This change makes the local cache
consistent with the remote bucket by deleting files locally that have been
deleted from the bucket.
**NOTE** this could lead to **breakage** on affected systems if they were
inadvertently depending on previously deleted files.
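
The gist of the change, as a minimal standalone sketch (the helper name
`prune_cache` and its arguments are illustrative assumptions, not Salt's
actual API; the real implementation is the `_prune_deleted_files` function
added in the diff below):

    import os

    def prune_cache(cache_root, bucket_keys):
        """Remove cached files whose key no longer exists in the bucket listing."""
        bucket_keys = set(bucket_keys)
        # walk bottom-up so directories emptied by the deletions can be removed too
        for base, _dirs, files in os.walk(cache_root, topdown=False):
            for name in files:
                path = os.path.join(base, name)
                if os.path.relpath(path, cache_root) not in bucket_keys:
                    os.unlink(path)  # gone from the bucket, drop the local copy
            if base != cache_root and not os.listdir(base):
                os.rmdir(base)  # prune directories left empty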

salt/fileserver/s3fs.py

@@ -135,6 +135,7 @@ def update():
                        cached_file_path = _get_cached_file_name(
                            bucket, saltenv, file_path
                        )
                        log.debug("%s - %s : %s", bucket, saltenv, file_path)

                        # load the file from S3 if it's not in the cache or it's old
@@ -356,6 +357,7 @@ def _init():
    # check mtime of the buckets files cache
    metadata = None

    try:
        if os.path.getmtime(cache_file) > exp:
            metadata = _read_buckets_cache_file(cache_file)
@@ -366,6 +368,8 @@ def _init():
        # bucket files cache expired or does not exist
        metadata = _refresh_buckets_cache_file(cache_file)

    _prune_deleted_files(metadata)

    return metadata
@@ -374,7 +378,6 @@ def _get_cache_dir():
    Return the path to the s3cache dir
    """
    # Or is that making too many assumptions?
    return os.path.join(__opts__["cachedir"], "s3cache")
@@ -383,26 +386,15 @@ def _get_cached_file_name(bucket_name, saltenv, path):
    Return the cached file name for a bucket path file
    """
-    file_path = os.path.join(_get_cache_dir(), saltenv, bucket_name, path)
-    # make sure bucket and saltenv directories exist
-    if not os.path.exists(os.path.dirname(file_path)):
-        os.makedirs(os.path.dirname(file_path))
-    return file_path
+    return os.path.join(_get_cache_dir(), saltenv, bucket_name, path)


def _get_buckets_cache_filename():
    """
    Return the filename of the cache for bucket contents.
-    Create the path if it does not exist.
    """
-    cache_dir = _get_cache_dir()
-    if not os.path.exists(cache_dir):
-        os.makedirs(cache_dir)
-    return os.path.join(cache_dir, "buckets_files.cache")
+    return os.path.join(_get_cache_dir(), "buckets_files.cache")

def _refresh_buckets_cache_file(cache_file):
@@ -423,6 +415,7 @@ def _refresh_buckets_cache_file(cache_file):
        path_style,
        https_enable,
    ) = _get_s3_key()

    metadata = {}

    # helper s3 query function
@@ -571,10 +564,71 @@ def _refresh_buckets_cache_file(cache_file):
    return metadata


def _prune_deleted_files(metadata):
    cache_dir = _get_cache_dir()
    cached_files = set()
    roots = set()

    if _is_env_per_bucket():
        for env, env_data in metadata.items():
            for bucket_meta in env_data:
                for bucket, bucket_data in bucket_meta.items():
                    root = os.path.join(cache_dir, env, bucket)

                    if os.path.exists(root):
                        roots.add(root)

                    for meta in bucket_data:
                        path = meta["Key"]
                        cached_files.add(path)
    else:
        for env, env_data in metadata.items():
            for bucket in _get_buckets():
                root = os.path.join(cache_dir, bucket)

                if os.path.exists(root):
                    roots.add(root)

                for meta in env_data:
                    cached_files.add(meta["Key"])

    if log.isEnabledFor(logging.DEBUG):
        import pprint

        log.debug(f"cached file list: {pprint.pformat(cached_files)}")

    for root in roots:
        for base, dirs, files in os.walk(root):
            for file_name in files:
                path = os.path.join(base, file_name)
                relpath = os.path.relpath(path, root)

                if relpath not in cached_files:
                    log.debug(f"file '{path}' not found in cached file list")
                    log.info(
                        f"file '{relpath}' was deleted from bucket, deleting local copy"
                    )

                    os.unlink(path)
                    dir = os.path.dirname(path)

                    # delete empty dirs all the way up to the cache dir
                    while dir != cache_dir and len(os.listdir(dir)) == 0:
                        log.debug(f"directory '{dir}' is now empty, removing")
                        os.rmdir(dir)
                        dir = os.path.dirname(dir)

def _write_buckets_cache_file(metadata, cache_file):
    """
    Write the contents of the buckets cache file
    """
    cache_dir = _get_cache_dir()
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    if os.path.isfile(cache_file):
        os.remove(cache_file)
@@ -591,6 +645,10 @@ def _read_buckets_cache_file(cache_file):
    log.debug("Reading buckets cache file")

    if not os.path.exists(cache_file):
        log.debug("Cache file does not exist")
        return None

    with salt.utils.files.fopen(cache_file, "rb") as fp_:
        try:
            data = pickle.load(fp_)
@@ -698,6 +756,13 @@ def _get_file_from_s3(metadata, saltenv, bucket_name, path, cached_file_path):
    Checks the local cache for the file, if it's old or missing go grab the
    file from S3 and update the cache
    """
    # make sure bucket and saltenv directories exist
    target_dir = os.path.dirname(cached_file_path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    (
        key,
        keyid,

tests/pytests/unit/fileserver/test_s3fs.py

@@ -78,6 +78,7 @@ def test_update(bucket, s3):
        "top.sls": {"content": yaml.dump({"base": {"*": ["foo"]}})},
        "foo.sls": {"content": yaml.dump({"nginx": {"pkg.installed": []}})},
        "files/nginx.conf": {"content": "server {}"},
        "files/conf.d/foo.conf": {"content": "server {}"},
    }

    make_keys(bucket, s3, keys)
@@ -90,6 +91,41 @@ def test_update(bucket, s3):
    s3fs.update()
    verify_cache(bucket, keys)

    # verify that when files get deleted from s3, they also get deleted in
    # the local cache
    delete_file = "files/nginx.conf"
    del keys[delete_file]
    s3.delete_object(Bucket=bucket, Key=delete_file)

    s3fs.update()
    verify_cache(bucket, keys)

    cache_file = s3fs._get_cached_file_name(bucket, "base", delete_file)
    assert not os.path.exists(cache_file)

    # we want empty directories to get deleted from the local cache
    # after this one, `files` should still exist
    files_dir = os.path.dirname(cache_file)
    assert os.path.exists(files_dir)

    # but after the last file is deleted, the directory and any parents
    # should be deleted too
    delete_file = "files/conf.d/foo.conf"
    del keys[delete_file]
    s3.delete_object(Bucket=bucket, Key=delete_file)

    s3fs.update()
    verify_cache(bucket, keys)

    cache_file = s3fs._get_cached_file_name(bucket, "base", delete_file)
    assert not os.path.exists(cache_file)

    # after this, `files/conf.d` and `files` should be deleted
    conf_d_dir = os.path.dirname(cache_file)
    assert not os.path.exists(conf_d_dir)
    assert not os.path.exists(files_dir)

@pytest.mark.skip_on_fips_enabled_platform
def test_s3_hash(bucket, s3):
@@ -124,8 +160,7 @@ def test_s3_hash(bucket, s3):

@pytest.mark.skip_on_fips_enabled_platform
def test_cache_round_trip(bucket):
    metadata = {"foo": "bar"}
-    cache_file = s3fs._get_cached_file_name(bucket, "base", "somefile")
+    cache_file = s3fs._get_buckets_cache_filename()

    s3fs._write_buckets_cache_file(metadata, cache_file)
    assert s3fs._read_buckets_cache_file(cache_file) == metadata