Merge pull request #37469 from terminalmage/issue29010

Rewrite file.extract_hash to improve its matching ability
2025-04-17 10:10:20 +00:00 · 2016-11-06 14:50:01 +13:00 · 2016-11-06 14:50:01 +13:00 · 129b0387e6
commit 129b0387e6
parent 9426b9d5c4 a3f38e5a9f
3 changed files with 346 additions and 70 deletions
--- a/salt/modules/file.py
+++ b/salt/modules/file.py
@ -24,6 +24,7 @@ import os
 import re
 import shutil
 import stat
+import string
 import sys
 import tempfile
 import time
@ -61,14 +62,14 @@ __func_alias__ = {
    'makedirs_': 'makedirs'
 }

-HASHES = [
-            ['sha512', 128],
-            ['sha384', 96],
-            ['sha256', 64],
-            ['sha224', 56],
-            ['sha1', 40],
-            ['md5', 32],
-         ]
+HASHES = {
+    'sha512': 128,
+    'sha384': 96,
+    'sha256': 64,
+    'sha224': 56,
+    'sha1': 40,
+    'md5': 32,
+}


 def __virtual__():
@ -3467,13 +3468,14 @@ def get_managed(
        template,
        source,
        source_hash,
+        source_hash_name,
        user,
        group,
        mode,
        saltenv,
        context,
        defaults,
-        skip_verify,
+        skip_verify=False,
        **kwargs):
    '''
    Return the managed file data for file.managed
@ -3490,20 +3492,26 @@ def get_managed(
    source_hash
        hash of the source file

+    source_hash_name
+        When ``source_hash`` refers to a remote file, this specifies the
+        filename to look for in that file.
+
+        .. versionadded:: 2016.3.5
+
    user
-        user owner
+        Owner of file

    group
-        group owner
+        Group owner of file

    mode
-        file mode
+        Permissions of file

    context
-        variables to add to the environment
+        Variables to add to the template context

    defaults
-        default values of for context_dict
+        Default values for context_dict

    skip_verify
        If ``True``, hash verification of remote file sources (``http://``,
@ -3516,7 +3524,7 @@ def get_managed(

    .. code-block:: bash

-        salt '*' file.get_managed /etc/httpd/conf.d/httpd.conf jinja salt://http/httpd.conf '{hash_type: 'md5', 'hsum': <md5sum>}' root root '755' base None None
+        salt '*' file.get_managed /etc/httpd/conf.d/httpd.conf jinja salt://http/httpd.conf '{hash_type: 'md5', 'hsum': <md5sum>}' None root root '755' base None None
    '''
    # Copy the file to the minion and templatize it
    sfn = ''
@ -3530,7 +3538,6 @@ def get_managed(
        '''
        return {'hsum': get_hash(path, form='sha256'), 'hash_type': 'sha256'}

-    source_hash_name = kwargs.pop('source_hash_name', None)
    # If we have a source defined, let's figure out what the hash is
    if source:
        urlparsed_source = _urlparse(source)
@ -3572,7 +3579,11 @@ def get_managed(
                            return '', {}, ('Source hash file {0} not found'
                                            .format(source_hash))
                        source_sum = extract_hash(
-                            hash_fn, '', source_hash_name or name)
+                            hash_fn,
+                            '',
+                            name,
+                            source,
+                            source_hash_name)
                        if source_sum is None:
                            return _invalid_source_hash_format()

@ -3654,8 +3665,27 @@ def get_managed(
    return sfn, source_sum, ''


-def extract_hash(hash_fn, hash_type='sha256', file_name=''):
+def extract_hash(hash_fn,
+                 hash_type='sha256',
+                 file_name='',
+                 source='',
+                 source_hash_name=None):
    '''
+    .. versionchanged:: 2016.3.5
+        Prior to this version, only the ``file_name`` argument was considered
+        for filename matches in the hash file. This would be problematic for
+        cases in which the user was relying on a remote checksum file that they
+        do not control, and they wished to use a different name for that file
+        on the minion from the filename on the remote server (and in the
+        checksum file). For example, managing ``/tmp/myfile.tar.gz`` when the
+        remote file was at ``https://mydomain.tld/different_name.tar.gz``. The
+        :py:func:`file.managed <salt.states.file.managed>` state now also
+        passes this function the source URI as well as the ``source_hash_name``
+        (if specified). In cases where ``source_hash_name`` is specified, it
+        takes precedence over both the ``file_name`` and ``source``. When it is
+        not specified, ``file_name`` takes precedence over ``source``. This
+        allows for better capability for matching hashes.
+
    This routine is called from the :mod:`file.managed
    <salt.states.file.managed>` state to pull a hash from a remote file.
    Regular expressions are used line by line on the ``source_hash`` file, to
@ -3677,49 +3707,183 @@ def extract_hash(hash_fn, hash_type='sha256', file_name=''):

    .. code-block:: bash

-        salt '*' file.extract_hash /etc/foo sha512 /path/to/hash/file
+        salt '*' file.extract_hash /path/to/hash/file sha512 /etc/foo
    '''
-    source_sum = None
-    partial_id = False
-    name_sought = os.path.basename(file_name)
-    log.debug('modules.file.py - extract_hash(): Extracting hash for file '
-              'named: {0}'.format(name_sought))
-    with salt.utils.fopen(hash_fn, 'r') as hash_fn_fopen:
-        for hash_variant in HASHES:
-            if hash_type == '' or hash_type == hash_variant[0]:
-                log.debug('modules.file.py - extract_hash(): Will use regex to get'
-                    ' a purely hexadecimal number of length ({0}), presumably hash'
-                    ' type : {1}'.format(hash_variant[1], hash_variant[0]))
-                hash_fn_fopen.seek(0)
-                for line in hash_fn_fopen.read().splitlines():
-                    hash_array = re.findall(r'(?i)(?<![a-z0-9])[a-f0-9]{' + str(hash_variant[1]) + '}(?![a-z0-9])', line)
-                    log.debug('modules.file.py - extract_hash(): From "line": {0} '
-                              'got : {1}'.format(line, hash_array))
-                    if hash_array:
-                        if not partial_id:
-                            source_sum = {'hsum': hash_array[0], 'hash_type': hash_variant[0]}
-                            partial_id = True
-
-                        log.debug('modules.file.py - extract_hash(): Found: {0} '
-                                  '-- {1}'.format(source_sum['hash_type'],
-                                                  source_sum['hsum']))
-
-                        if re.search(name_sought, line):
-                            source_sum = {'hsum': hash_array[0], 'hash_type': hash_variant[0]}
-                            log.debug('modules.file.py - extract_hash: For {0} -- '
-                                      'returning the {1} hash "{2}".'.format(
-                                          name_sought,
-                                          source_sum['hash_type'],
-                                          source_sum['hsum']))
-                            return source_sum
-
-    if partial_id:
-        log.debug('modules.file.py - extract_hash: Returning the partially '
-                  'identified {0} hash "{1}".'.format(
-                       source_sum['hash_type'], source_sum['hsum']))
+    hash_len = HASHES.get(hash_type)
+    if hash_len is None:
+        if hash_type:
+            log.warning(
+                'file.extract_hash: Unsupported hash_type \'%s\', falling '
+                'back to matching any supported hash_type', hash_type
+            )
+            hash_type = ''
+        hash_len_expr = '{0},{1}'.format(min(six.itervalues(HASHES)),
+                                         max(six.itervalues(HASHES)))
    else:
-        log.debug('modules.file.py - extract_hash: Returning None.')
-    return source_sum
+        hash_len_expr = str(hash_len)
+
+    filename_separators = string.whitespace + r'\/'
+
+    if source_hash_name is not None:
+        # Coerce non-string values to str; the len()/endswith()/replace()
+        # calls below require a string.
+        if not isinstance(source_hash_name, six.string_types):
+            source_hash_name = str(source_hash_name)
+        source_hash_name_idx = (len(source_hash_name) + 1) * -1
+        log.debug(
+            'file.extract_hash: Extracting %s hash for file matching '
+            'source_hash_name \'%s\'',
+            'any supported' if not hash_type else hash_type,
+            source_hash_name
+        )
+    else:
+        if not isinstance(file_name, six.string_types):
+            file_name = str(file_name)
+        if not isinstance(source, six.string_types):
+            source = str(source)
+        urlparsed_source = _urlparse(source)
+        source_basename = os.path.basename(
+            urlparsed_source.path or urlparsed_source.netloc
+        )
+        source_idx = (len(source_basename) + 1) * -1
+        file_name_basename = os.path.basename(file_name)
+        file_name_idx = (len(file_name_basename) + 1) * -1
+        searches = [x for x in (file_name, source) if x]
+        if searches:
+            log.debug(
+                'file.extract_hash: Extracting %s hash for file matching%s: %s',
+                'any supported' if not hash_type else hash_type,
+                '' if len(searches) == 1 else ' either of the following',
+                ', '.join(searches)
+            )
+
+    partial = None
+    found = {}
+    hashes_revmap = dict([(y, x) for x, y in six.iteritems(HASHES)])
+
+    with salt.utils.fopen(hash_fn, 'r') as fp_:
+        for line in fp_:
+            line = line.strip()
+            hash_re = r'(?i)(?<![a-z0-9])([a-f0-9]{' + hash_len_expr + '})(?![a-z0-9])'
+            hash_match = re.search(hash_re, line)
+            matched = None
+            if hash_match:
+                matched_hsum = hash_match.group(1)
+                if matched_hsum is not None:
+                    matched_type = hashes_revmap.get(len(matched_hsum))
+                    if matched_type is None:
+                        # There was a match, but it's not of the correct length
+                        # to match one of the supported hash types.
+                        matched = None
+                    else:
+                        matched = {'hsum': matched_hsum,
+                                   'hash_type': matched_type}
+
+            if matched is None:
+                log.debug(
+                    'file.extract_hash: In line \'%s\', no %shash found',
+                    line,
+                    '' if not hash_type else hash_type + ' '
+                )
+                continue
+
+            if partial is None:
+                partial = matched
+
+            def _add_to_matches(found, line, match_type, value, matched):
+                log.debug(
+                    'file.extract_hash: Line \'%s\' matches %s \'%s\'',
+                    line, match_type, value
+                )
+                found.setdefault(match_type, []).append(matched)
+
+            hash_matched = False
+            if source_hash_name is not None:
+                if line.endswith(source_hash_name):
+                    # Checking the character before where the name should
+                    # start for whitespace only (path separators are not
+                    # accepted for source_hash_name). We can't just rsplit
+                    # on whitespace, as the filename may contain spaces.
+                    try:
+                        if line[source_hash_name_idx] in string.whitespace:
+                            _add_to_matches(found, line, 'source_hash_name',
+                                            source_hash_name, matched)
+                            hash_matched = True
+                    except IndexError:
+                        pass
+                elif re.match(source_hash_name.replace('.', r'\.') + r'\s+',
+                              line):
+                    _add_to_matches(found, line, 'source_hash_name',
+                                    source_hash_name, matched)
+                    hash_matched = True
+            else:
+                if file_name:
+                    if line.endswith(file_name_basename):
+                        # Checking the character before where the basename
+                        # should start for either whitespace or a path
+                        # separator. We can't just rsplit on spaces/whitespace,
+                        # because the filename may contain spaces.
+                        try:
+                            if line[file_name_idx] in filename_separators:
+                                _add_to_matches(found, line, 'file_name',
+                                                file_name, matched)
+                                hash_matched = True
+                        except IndexError:
+                            pass
+                    elif re.match(file_name.replace('.', r'\.') + r'\s+', line):
+                        _add_to_matches(found, line, 'file_name',
+                                        file_name, matched)
+                        hash_matched = True
+                if source:
+                    if line.endswith(source_basename):
+                        # Same as above, we can't just do an rsplit here.
+                        try:
+                            if line[source_idx] in filename_separators:
+                                _add_to_matches(found, line, 'source',
+                                                source, matched)
+                                hash_matched = True
+                        except IndexError:
+                            pass
+                    elif re.match(source.replace('.', r'\.') + r'\s+', line):
+                        _add_to_matches(found, line, 'source', source, matched)
+                        hash_matched = True
+
+            if not hash_matched:
+                log.debug(
+                    'file.extract_hash: Line \'%s\' contains %s hash '
+                    '\'%s\', but line did not meet the search criteria',
+                    line, matched['hash_type'], matched['hsum']
+                )
+
+    for found_type, found_str in (('source_hash_name', source_hash_name),
+                                  ('file_name', file_name),
+                                  ('source', source)):
+        if found_type in found:
+            if len(found[found_type]) > 1:
+                log.debug(
+                    'file.extract_hash: Multiple matches for %s: %s',
+                    found_str,
+                    ', '.join(
+                        ['{0} ({1})'.format(x['hsum'], x['hash_type'])
+                         for x in found[found_type]]
+                    )
+                )
+            ret = found[found_type][0]
+            log.debug(
+                'file.extract_hash: Returning %s hash \'%s\' as a match of %s',
+                ret['hash_type'], ret['hsum'], found_str
+            )
+            return ret
+
+    if partial:
+        log.debug(
+            'file.extract_hash: Returning the partially identified %s hash '
+            '\'%s\'', partial['hash_type'], partial['hsum']
+        )
+        return partial
+
+    log.debug('file.extract_hash: No matches, returning None')
+    return None


 def check_perms(name, ret, user, group, mode, follow_symlinks=False):
--- a/salt/states/file.py
+++ b/salt/states/file.py
@ -1082,6 +1082,7 @@ def missing(name):
 def managed(name,
            source=None,
            source_hash='',
+            source_hash_name=None,
            user=None,
            group=None,
            mode=None,
@ -1188,7 +1189,7 @@ def managed(name,
                    - source: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.tar.gz
                    - source_hash: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.hash

-            The following is an example of the supported source_hash format:
+            The following lines are all supported formats:

            .. code-block:: text

@ -1198,7 +1199,7 @@ def managed(name,

            Debian file type ``*.dsc`` files are also supported.

-        **Inserting the Source Hash in the sls Data**
+        **Inserting the Source Hash in the SLS Data**
            Examples:

            .. code-block:: yaml
@ -1224,6 +1225,44 @@ def managed(name,
                    - source: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.tar.gz
                    - source_hash: https://launchpad.net/tomdroid/beta/0.7.3/+download/tomdroid-src-0.7.3.tar.gz/+md5

+    source_hash_name
+        When ``source_hash`` refers to a hash file, Salt will try to find the
+        correct hash by matching the filename associated with that hash. By
+        default, Salt will look for the filename being managed. When managing a
+        file at path ``/tmp/foo.txt``, then the following line in a hash file
+        would match:
+
+        .. code-block:: text
+
+            acbd18db4cc2f85cedef654fccc4a4d8    foo.txt
+
+        However, sometimes a hash file will include multiple similar paths:
+
+        .. code-block:: text
+
+            37b51d194a7513e45b56f6524f2d51f2    ./dir1/foo.txt
+            acbd18db4cc2f85cedef654fccc4a4d8    ./dir2/foo.txt
+            73feffa4b7f6bb68e44cf984c85f6e88    ./dir3/foo.txt
+
+        In cases like this, Salt may match the incorrect hash. This argument
+        can be used to tell Salt which filename to match, to ensure that the
+        correct hash is identified. For example:
+
+        .. code-block:: yaml
+
+            /tmp/foo.txt:
+              file.managed:
+                - source: https://mydomain.tld/dir2/foo.txt
+                - source_hash: https://mydomain.tld/hashes
+                - source_hash_name: ./dir2/foo.txt
+
+        .. note::
+            This argument must contain the full filename entry from the
+            checksum file, as this argument is meant to disambiguate matches
+            for multiple files that have the same basename. So, in the
+            example above, simply using ``foo.txt`` would not match.
+
+        .. versionadded:: 2016.3.5

    user
        The user to own the file, this defaults to the user salt is running as
@ -1685,6 +1724,7 @@ def managed(name,
            template,
            source,
            source_hash,
+            source_hash_name,
            user,
            group,
            mode,
--- a/tests/unit/modules/file_test.py
+++ b/tests/unit/modules/file_test.py
@ -483,10 +483,14 @@ class FileModuleTestCase(TestCase):
        '''
        # With file name
        with tempfile.NamedTemporaryFile(mode='w+') as tfile:
-            tfile.write('rc.conf ef6e82e4006dee563d98ada2a2a80a27\n')
            tfile.write(
-                'ead48423703509d37c4a90e6a0d53e143b6fc268 example.tar.gz\n')
+                'rc.conf ef6e82e4006dee563d98ada2a2a80a27\n'
+                'ead48423703509d37c4a90e6a0d53e143b6fc268 example.tar.gz\n'
+                'fe05bcdcdc4928012781a5f1a2a77cbb5398e106 ./subdir/example.tar.gz\n'
+                'ad782ecdac770fc6eb9a62e44f90873fb97fb26b foo.tar.bz2\n'
+            )
            tfile.flush()
+
            result = filemod.extract_hash(tfile.name, '', '/rc.conf')
            self.assertEqual(result, {
                'hsum': 'ef6e82e4006dee563d98ada2a2a80a27',
@ -498,15 +502,83 @@ class FileModuleTestCase(TestCase):
                'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
                'hash_type': 'sha1'
            })
-        # Solohash - no file name (Maven repo checksum file format)
+
+            # All the checksums in this test file are sha1 sums. We run this
+            # loop three times. The first pass tests auto-detection of hash
+            # type by length of the hash. The second tests matching a specific
+            # type. The third tests a failed attempt to match a specific type,
+            # since sha256 was requested but sha1 is what is in the file.
+            for hash_type in ('', 'sha1', 'sha256'):
+                # Test the source_hash_name argument. Even though there are
+                # matches in the source_hash file for both the file_name and
+                # source params, they should be ignored in favor of the
+                # source_hash_name.
+                file_name = '/example.tar.gz'
+                source = 'https://mydomain.tld/foo.tar.bz2?key1=val1&key2=val2'
+                source_hash_name = './subdir/example.tar.gz'
+                result = filemod.extract_hash(
+                    tfile.name,
+                    hash_type,
+                    file_name,
+                    source,
+                    source_hash_name)
+                expected = {
+                    'hsum': 'fe05bcdcdc4928012781a5f1a2a77cbb5398e106',
+                    'hash_type': 'sha1'
+                } if hash_type != 'sha256' else None
+                self.assertEqual(result, expected)
+
+                # Test both a file_name and source but no source_hash_name.
+                # Even though there are matches for both file_name and
+                # source, file_name should be preferred.
+                file_name = '/example.tar.gz'
+                source = 'https://mydomain.tld/foo.tar.bz2?key1=val1&key2=val2'
+                source_hash_name = None
+                result = filemod.extract_hash(
+                    tfile.name,
+                    hash_type,
+                    file_name,
+                    source,
+                    source_hash_name)
+                expected = {
+                    'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
+                    'hash_type': 'sha1'
+                } if hash_type != 'sha256' else None
+                self.assertEqual(result, expected)
+
+                # Test both a file_name and source but no source_hash_name.
+                # Since there is no match for the file_name, the source is
+                # matched.
+                file_name = '/somefile.tar.gz'
+                source = 'https://mydomain.tld/foo.tar.bz2?key1=val1&key2=val2'
+                source_hash_name = None
+                result = filemod.extract_hash(
+                    tfile.name,
+                    hash_type,
+                    file_name,
+                    source,
+                    source_hash_name)
+                expected = {
+                    'hsum': 'ad782ecdac770fc6eb9a62e44f90873fb97fb26b',
+                    'hash_type': 'sha1'
+                } if hash_type != 'sha256' else None
+                self.assertEqual(result, expected)
+
+        # Hash only, no file name (Maven repo checksum format)
+        # Since there is no name match, the first checksum in the file will
+        # always be returned, never the second.
        with tempfile.NamedTemporaryFile(mode='w+') as tfile:
-            tfile.write('ead48423703509d37c4a90e6a0d53e143b6fc268\n')
+            tfile.write('ead48423703509d37c4a90e6a0d53e143b6fc268\n'
+                        'ad782ecdac770fc6eb9a62e44f90873fb97fb26b\n')
            tfile.flush()
-            result = filemod.extract_hash(tfile.name, '', '/testfile')
-            self.assertEqual(result, {
-                'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
-                'hash_type': 'sha1'
-            })
+
+            for hash_type in ('', 'sha1', 'sha256'):
+                result = filemod.extract_hash(tfile.name, hash_type, '/testfile')
+                expected = {
+                    'hsum': 'ead48423703509d37c4a90e6a0d53e143b6fc268',
+                    'hash_type': 'sha1'
+                } if hash_type != 'sha256' else None
+                self.assertEqual(result, expected)

    def test_user_to_uid_int(self):
        '''