Merge pull request #37469 from terminalmage/issue29010

Rewrite file.extract_hash to improve its matching ability
This commit is contained in:
Mike Place 2016-11-06 14:50:01 +13:00 committed by GitHub
commit 129b0387e6
3 changed files with 346 additions and 70 deletions

View file

@ -24,6 +24,7 @@ import os
import re
import shutil
import stat
import string
import sys
import tempfile
import time
@ -61,14 +62,14 @@ __func_alias__ = {
'makedirs_': 'makedirs'
}
HASHES = [
['sha512', 128],
['sha384', 96],
['sha256', 64],
['sha224', 56],
['sha1', 40],
['md5', 32],
]
HASHES = {
'sha512': 128,
'sha384': 96,
'sha256': 64,
'sha224': 56,
'sha1': 40,
'md5': 32,
}
def __virtual__():
@ -3467,13 +3468,14 @@ def get_managed(
template,
source,
source_hash,
source_hash_name,
user,
group,
mode,
saltenv,
context,
defaults,
skip_verify,
skip_verify=False,
**kwargs):
'''
Return the managed file data for file.managed
@ -3490,20 +3492,26 @@ def get_managed(
source_hash
hash of the source file
source_hash_name
When ``source_hash`` refers to a remote file, this specifies the
filename to look for in that file.
.. versionadded:: 2016.3.5
user
user owner
Owner of file
group
group owner
Group owner of file
mode
file mode
Permissions of file
context
variables to add to the environment
Variables to add to the template context
defaults
default values of for context_dict
Default values for context_dict
skip_verify
If ``True``, hash verification of remote file sources (``http://``,
@ -3516,7 +3524,7 @@ def get_managed(
.. code-block:: bash
salt '*' file.get_managed /etc/httpd/conf.d/httpd.conf jinja salt://http/httpd.conf '{hash_type: 'md5', 'hsum': <md5sum>}' root root '755' base None None
salt '*' file.get_managed /etc/httpd/conf.d/httpd.conf jinja salt://http/httpd.conf '{hash_type: 'md5', 'hsum': <md5sum>}' None root root '755' base None None
'''
# Copy the file to the minion and templatize it
sfn = ''
@ -3530,7 +3538,6 @@ def get_managed(
'''
return {'hsum': get_hash(path, form='sha256'), 'hash_type': 'sha256'}
source_hash_name = kwargs.pop('source_hash_name', None)
# If we have a source defined, let's figure out what the hash is
if source:
urlparsed_source = _urlparse(source)
@ -3572,7 +3579,11 @@ def get_managed(
return '', {}, ('Source hash file {0} not found'
.format(source_hash))
source_sum = extract_hash(
hash_fn, '', source_hash_name or name)
hash_fn,
'',
name,
source,
source_hash_name)
if source_sum is None:
return _invalid_source_hash_format()
@ -3654,8 +3665,27 @@ def get_managed(
return sfn, source_sum, ''
def extract_hash(hash_fn, hash_type='sha256', file_name=''):
def extract_hash(hash_fn,
hash_type='sha256',
file_name='',
source='',
source_hash_name=None):
'''
.. versionchanged:: 2016.3.5
Prior to this version, only the ``file_name`` argument was considered
for filename matches in the hash file. This would be problematic for
cases in which the user was relying on a remote checksum file that they
do not control, and they wished to use a different name for that file
on the minion from the filename on the remote server (and in the
checksum file). For example, managing ``/tmp/myfile.tar.gz`` when the
remote file was at ``https://mydomain.tld/different_name.tar.gz``. The
:py:func:`file.managed <salt.states.file.managed>` state now also
passes this function the source URI as well as the ``source_hash_name``
(if specified). In cases where ``source_hash_name`` is specified, it
takes precedence over both the ``file_name`` and ``source``. When it is
not specified, ``file_name`` takes precedence over ``source``. This
allows for better capability for matching hashes.
This routine is called from the :mod:`file.managed
<salt.states.file.managed>` state to pull a hash from a remote file.
Regular expressions are used line by line on the ``source_hash`` file, to
@ -3677,49 +3707,183 @@ def extract_hash(hash_fn, hash_type='sha256', file_name=''):
.. code-block:: bash
salt '*' file.extract_hash /etc/foo sha512 /path/to/hash/file
salt '*' file.extract_hash /path/to/hash/file sha512 /etc/foo
'''
source_sum = None
partial_id = False
name_sought = os.path.basename(file_name)
log.debug('modules.file.py - extract_hash(): Extracting hash for file '
'named: {0}'.format(name_sought))
with salt.utils.fopen(hash_fn, 'r') as hash_fn_fopen:
for hash_variant in HASHES:
if hash_type == '' or hash_type == hash_variant[0]:
log.debug('modules.file.py - extract_hash(): Will use regex to get'
' a purely hexadecimal number of length ({0}), presumably hash'
' type : {1}'.format(hash_variant[1], hash_variant[0]))
hash_fn_fopen.seek(0)
for line in hash_fn_fopen.read().splitlines():
hash_array = re.findall(r'(?i)(?<![a-z0-9])[a-f0-9]{' + str(hash_variant[1]) + '}(?![a-z0-9])', line)
log.debug('modules.file.py - extract_hash(): From "line": {0} '
'got : {1}'.format(line, hash_array))
if hash_array:
if not partial_id:
source_sum = {'hsum': hash_array[0], 'hash_type': hash_variant[0]}
partial_id = True
log.debug('modules.file.py - extract_hash(): Found: {0} '
'-- {1}'.format(source_sum['hash_type'],
source_sum['hsum']))
if re.search(name_sought, line):
source_sum = {'hsum': hash_array[0], 'hash_type': hash_variant[0]}
log.debug('modules.file.py - extract_hash: For {0} -- '
'returning the {1} hash "{2}".'.format(
name_sought,
source_sum['hash_type'],
source_sum['hsum']))
return source_sum
if partial_id:
log.debug('modules.file.py - extract_hash: Returning the partially '
'identified {0} hash "{1}".'.format(
source_sum['hash_type'], source_sum['hsum']))
hash_len = HASHES.get(hash_type)
if hash_len is None:
if hash_type:
log.warning(
'file.extract_hash: Unsupported hash_type \'%s\', falling '
'back to matching any supported hash_type', hash_type
)
hash_type = ''
hash_len_expr = '{0},{1}'.format(min(six.itervalues(HASHES)),
max(six.itervalues(HASHES)))
else:
log.debug('modules.file.py - extract_hash: Returning None.')
return source_sum
hash_len_expr = str(hash_len)
filename_separators = string.whitespace + r'\/'
if source_hash_name is not None:
#if not isinstance(source_hash_name, six.string_types):
# source_hash_name = str(source_hash_name)
if not isinstance(source_hash_name, six.string_types):
source_hash_name = str(source_hash_name)
source_hash_name_idx = (len(source_hash_name) + 1) * -1
log.debug(
'file.extract_hash: Extracting %s hash for file matching '
'source_hash_name \'%s\'',
'any supported' if not hash_type else hash_type,
source_hash_name
)
else:
if not isinstance(file_name, six.string_types):
file_name = str(file_name)
if not isinstance(source, six.string_types):
source = str(source)
urlparsed_source = _urlparse(source)
source_basename = os.path.basename(
urlparsed_source.path or urlparsed_source.netloc
)
source_idx = (len(source_basename) + 1) * -1
file_name_basename = os.path.basename(file_name)
file_name_idx = (len(file_name_basename) + 1) * -1
searches = [x for x in (file_name, source) if x]
if searches:
log.debug(
'file.extract_hash: Extracting %s hash for file matching%s: %s',
'any supported' if not hash_type else hash_type,
'' if len(searches) == 1 else ' either of the following',
', '.join(searches)
)
partial = None
found = {}
hashes_revmap = dict([(y, x) for x, y in six.iteritems(HASHES)])
with salt.utils.fopen(hash_fn, 'r') as fp_:
for line in fp_:
line = line.strip()
hash_re = r'(?i)(?<![a-z0-9])([a-f0-9]{' + hash_len_expr + '})(?![a-z0-9])'
hash_match = re.search(hash_re, line)
matched = None
if hash_match:
matched_hsum = hash_match.group(1)
if matched_hsum is not None:
matched_type = hashes_revmap.get(len(matched_hsum))
if matched_type is None:
# There was a match, but it's not of the correct length
# to match one of the supported hash types.
matched = None
else:
matched = {'hsum': matched_hsum,
'hash_type': matched_type}
if matched is None:
log.debug(
'file.extract_hash: In line \'%s\', no %shash found',
line,
'' if not hash_type else hash_type + ' '
)
continue
if partial is None:
partial = matched
def _add_to_matches(found, line, match_type, value, matched):
log.debug(
'file.extract_hash: Line \'%s\' matches %s \'%s\'',
line, match_type, value
)
found.setdefault(match_type, []).append(matched)
hash_matched = False
if source_hash_name is not None:
if line.endswith(source_hash_name):
# Checking that the character before where source_hash_name
# should start is whitespace. A path separator is not accepted
# here, since source_hash_name must match the full filename
# entry. We can't just rsplit on spaces/whitespace, because
# the filename may contain spaces.
try:
if line[source_hash_name_idx] in string.whitespace:
_add_to_matches(found, line, 'source_hash_name',
source_hash_name, matched)
hash_matched = True
except IndexError:
pass
elif re.match(source_hash_name.replace('.', r'\.') + r'\s+',
line):
_add_to_matches(found, line, 'source_hash_name',
source_hash_name, matched)
hash_matched = True
else:
if file_name:
if line.endswith(file_name_basename):
# Checking the character before where the basename
# should start for either whitespace or a path
# separator. We can't just rsplit on spaces/whitespace,
# because the filename may contain spaces.
try:
if line[file_name_idx] in filename_separators:
_add_to_matches(found, line, 'file_name',
file_name, matched)
hash_matched = True
except IndexError:
pass
elif re.match(file_name.replace('.', r'\.') + r'\s+', line):
_add_to_matches(found, line, 'file_name',
file_name, matched)
hash_matched = True
if source:
if line.endswith(source_basename):
# Same as above, we can't just do an rsplit here.
try:
if line[source_idx] in filename_separators:
_add_to_matches(found, line, 'source',
source, matched)
hash_matched = True
except IndexError:
pass
elif re.match(source.replace('.', r'\.') + r'\s+', line):
_add_to_matches(found, line, 'source', source, matched)
hash_matched = True
if not hash_matched:
log.debug(
'file.extract_hash: Line \'%s\' contains %s hash '
'\'%s\', but line did not meet the search criteria',
line, matched['hash_type'], matched['hsum']
)
for found_type, found_str in (('source_hash_name', source_hash_name),
('file_name', file_name),
('source', source)):
if found_type in found:
if len(found[found_type]) > 1:
log.debug(
'file.extract_hash: Multiple matches for %s: %s',
found_str,
', '.join(
['{0} ({1})'.format(x['hsum'], x['hash_type'])
for x in found[found_type]]
)
)
ret = found[found_type][0]
log.debug(
'file.extract_hash: Returning %s hash \'%s\' as a match of %s',
ret['hash_type'], ret['hsum'], found_str
)
return ret
if partial:
log.debug(
'file.extract_hash: Returning the partially identified %s hash '
'\'%s\'', partial['hash_type'], partial['hsum']
)
return partial
log.debug('file.extract_hash: No matches, returning None')
return None
def check_perms(name, ret, user, group, mode, follow_symlinks=False):

View file

@ -1082,6 +1082,7 @@ def missing(name):
def managed(name,
source=None,
source_hash='',
source_hash_name=None,
user=None,
group=None,
mode=None,
@ -1188,7 +1189,7 @@ def managed(name,
- source: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.tar.gz
- source_hash: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.hash
The following is an example of the supported source_hash format:
The following lines are all supported formats:
.. code-block:: text
@ -1198,7 +1199,7 @@ def managed(name,
Debian file type ``*.dsc`` files are also supported.
**Inserting the Source Hash in the sls Data**
**Inserting the Source Hash in the SLS Data**
Examples:
.. code-block:: yaml
@ -1224,6 +1225,44 @@ def managed(name,
- source: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.tar.gz
- source_hash: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.tar.gz/+md5
source_hash_name
When ``source_hash`` refers to a hash file, Salt will try to find the
correct hash by matching the filename associated with that hash. By
default, Salt will look for the filename being managed. When managing a
file at path ``/tmp/foo.txt``, then the following line in a hash file
would match:
.. code-block:: text
acbd18db4cc2f85cedef654fccc4a4d8 foo.txt
However, sometimes a hash file will include multiple similar paths:
.. code-block:: text
37b51d194a7513e45b56f6524f2d51f2 ./dir1/foo.txt
acbd18db4cc2f85cedef654fccc4a4d8 ./dir2/foo.txt
73feffa4b7f6bb68e44cf984c85f6e88 ./dir3/foo.txt
In cases like this, Salt may match the incorrect hash. This argument
can be used to tell Salt which filename to match, to ensure that the
correct hash is identified. For example:
.. code-block:: yaml
/tmp/foo.txt:
file.managed:
- source: https://mydomain.tld/dir2/foo.txt
- source_hash: https://mydomain.tld/hashes
- source_hash_name: ./dir2/foo.txt
.. note::
This argument must contain the full filename entry from the
checksum file, as this argument is meant to disambiguate matches
for multiple files that have the same basename. So, in the
example above, simply using ``foo.txt`` would not match.
.. versionadded:: 2016.3.5
user
The user to own the file, this defaults to the user salt is running as
@ -1685,6 +1724,7 @@ def managed(name,
template,
source,
source_hash,
source_hash_name,
user,
group,
mode,

View file

@ -483,10 +483,14 @@ class FileModuleTestCase(TestCase):
'''
# With file name
with tempfile.NamedTemporaryFile(mode='w+') as tfile:
tfile.write('rc.conf ef6e82e4006dee563d98ada2a2a80a27\n')
tfile.write(
'ead48423703509d37c4a90e6a0d53e143b6fc268 example.tar.gz\n')
'rc.conf ef6e82e4006dee563d98ada2a2a80a27\n'
'ead48423703509d37c4a90e6a0d53e143b6fc268 example.tar.gz\n'
'fe05bcdcdc4928012781a5f1a2a77cbb5398e106 ./subdir/example.tar.gz\n'
'ad782ecdac770fc6eb9a62e44f90873fb97fb26b foo.tar.bz2\n'
)
tfile.flush()
result = filemod.extract_hash(tfile.name, '', '/rc.conf')
self.assertEqual(result, {
'hsum': 'ef6e82e4006dee563d98ada2a2a80a27',
@ -498,15 +502,83 @@ class FileModuleTestCase(TestCase):
'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
'hash_type': 'sha1'
})
# Solohash - no file name (Maven repo checksum file format)
# All the checksums in this test file are sha1 sums. We run this
# loop three times. The first pass tests auto-detection of hash
# type by length of the hash. The second tests matching a specific
# type. The third tests a failed attempt to match a specific type,
# since sha256 was requested but sha1 is what is in the file.
for hash_type in ('', 'sha1', 'sha256'):
# Test the source_hash_name argument. Even though there are
# matches in the source_hash file for both the file_name and
# source params, they should be ignored in favor of the
# source_hash_name.
file_name = '/example.tar.gz'
source = 'https://mydomain.tld/foo.tar.bz2?key1=val1&key2=val2'
source_hash_name = './subdir/example.tar.gz'
result = filemod.extract_hash(
tfile.name,
hash_type,
file_name,
source,
source_hash_name)
expected = {
'hsum': 'fe05bcdcdc4928012781a5f1a2a77cbb5398e106',
'hash_type': 'sha1'
} if hash_type != 'sha256' else None
self.assertEqual(result, expected)
# Test both a file_name and source but no source_hash_name.
# Even though there are matches for both file_name and
# source, file_name should be preferred.
file_name = '/example.tar.gz'
source = 'https://mydomain.tld/foo.tar.bz2?key1=val1&key2=val2'
source_hash_name = None
result = filemod.extract_hash(
tfile.name,
hash_type,
file_name,
source,
source_hash_name)
expected = {
'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
'hash_type': 'sha1'
} if hash_type != 'sha256' else None
self.assertEqual(result, expected)
# Test both a file_name and source but no source_hash_name.
# Since there is no match for the file_name, the source is
# matched.
file_name = '/somefile.tar.gz'
source = 'https://mydomain.tld/foo.tar.bz2?key1=val1&key2=val2'
source_hash_name = None
result = filemod.extract_hash(
tfile.name,
hash_type,
file_name,
source,
source_hash_name)
expected = {
'hsum': 'ad782ecdac770fc6eb9a62e44f90873fb97fb26b',
'hash_type': 'sha1'
} if hash_type != 'sha256' else None
self.assertEqual(result, expected)
# Hash only, no file name (Maven repo checksum format)
# Since there is no name match, the first checksum in the file will
# always be returned, never the second.
with tempfile.NamedTemporaryFile(mode='w+') as tfile:
tfile.write('ead48423703509d37c4a90e6a0d53e143b6fc268\n')
tfile.write('ead48423703509d37c4a90e6a0d53e143b6fc268\n'
'ad782ecdac770fc6eb9a62e44f90873fb97fb26b\n')
tfile.flush()
result = filemod.extract_hash(tfile.name, '', '/testfile')
self.assertEqual(result, {
'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
'hash_type': 'sha1'
})
for hash_type in ('', 'sha1', 'sha256'):
result = filemod.extract_hash(tfile.name, hash_type, '/testfile')
expected = {
'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
'hash_type': 'sha1'
} if hash_type != 'sha256' else None
self.assertEqual(result, expected)
def test_user_to_uid_int(self):
'''