""" These commands are used to create/destroy VMs, sync the local checkout to the VM and to run commands on the VM. """ # pylint: disable=resource-leakage,broad-except,3rd-party-module-not-gated from __future__ import annotations import hashlib import json import logging import os import pathlib import platform import pprint import shutil import subprocess import sys import textwrap import time from datetime import datetime from functools import lru_cache from typing import TYPE_CHECKING, cast import attr import boto3 from botocore.exceptions import ClientError from ptscripts import Context, command_group from requests.exceptions import ConnectTimeout from rich.progress import ( BarColumn, Column, Progress, TaskProgressColumn, TextColumn, TimeRemainingColumn, ) import tools.utils if TYPE_CHECKING: # pylint: disable=no-name-in-module from boto3.resources.factory.ec2 import Instance # pylint: enable=no-name-in-module log = logging.getLogger(__name__) STATE_DIR = tools.utils.REPO_ROOT / ".vms-state" REPO_CHECKOUT_ID = hashlib.sha256( "|".join(list(platform.uname()) + [str(tools.utils.REPO_ROOT)]).encode() ).hexdigest() AWS_REGION = ( os.environ.get("AWS_DEFAULT_REGION") or os.environ.get("AWS_REGION") or "us-west-2" ) # Define the command group vm = command_group(name="vm", help="VM Related Commands", description=__doc__) vm.add_argument("--region", help="The AWS region.", default=AWS_REGION) @vm.command( arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", "choices": sorted(tools.utils.get_golden_images()), }, "key_name": { "help": "The SSH key name. Will default to TOOLS_KEY_NAME in environment", }, "instance_type": { "help": "The instance type to use.", }, "no_delete": { "help": ( "By default, every VM started will get terminated after a specific " "ammount of hours. When true, the started VM get's excluded from that " "forced termination." ), }, "no_destroy_on_failure": { "help": "Do not destroy the instance on failing to create and connect.", }, "retries": { "help": "How many times to retry creating and connecting to a vm", }, "environment": { "help": ( "The AWS environment to use. When the value is auto, an " "attempt will be made to get the right environment from the " "AWS instance metadata endpoint. This only works for bastion " "VMs." ), "choices": ("prod", "test", "auto"), }, } ) def create( ctx: Context, name: str, key_name: str = os.environ.get("RUNNER_NAME") or os.environ.get("TOOLS_KEY_NAME"), # type: ignore[assignment] instance_type: str = None, no_delete: bool = False, no_destroy_on_failure: bool = False, retries: int = 0, environment: str = None, ): """ Create VM. """ if key_name is None: ctx.exit(1, "We need a key name to spin a VM") if not retries: retries = 1 if environment == "auto": # Lets get the environment from the instance profile if we're on a bastion VM with ctx.web as web: try: ret = web.put( "http://169.254.169.254/latest/api/token", headers={"X-aws-ec2-metadata-token-ttl-seconds": "10"}, timeout=1, ) token = ret.text.strip() ret = web.get( "http://169.254.169.254/latest/meta-data/tags/instance/spb:environment", headers={"X-aws-ec2-metadata-token": token}, ) spb_environment = ret.text.strip() if spb_environment: ctx.info(f"Discovered VM environment: {spb_environment}") environment = spb_environment except ConnectTimeout: # We're apparently not in bastion VM environment = None attempts = 0 while True: attempts += 1 vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) created = vm.create( key_name=key_name, instance_type=instance_type, no_delete=no_delete, environment=environment, ) if created is True: break ctx.error(created) if no_destroy_on_failure is False: vm.destroy() if attempts >= retries: ctx.exit(1) ctx.info("Retrying in 5 seconds...") time.sleep(5) @vm.command( arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, "no_wait": { "help": ( "Don't wait for the destroy process to complete. " "Just confirm it started and exit." ) }, } ) def destroy(ctx: Context, name: str, no_wait: bool = False): """ Destroy VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) vm.destroy(no_wait=no_wait) @vm.command( arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, "command": { "help": "Command to run in VM", "nargs": "*", }, "sudo": { "help": "Run command as sudo", "action": "store_true", }, } ) def ssh(ctx: Context, name: str, command: list[str], sudo: bool = False): """ SSH into the VM, or run 'command' in VM """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) pseudo_terminal = command == [] vm.run(command, sudo=sudo, capture=False, pseudo_terminal=pseudo_terminal) @vm.command( arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, "download": { "help": "Rsync from the remote target to local salt checkout", "action": "store_true", }, } ) def rsync(ctx: Context, name: str, download: bool = False): """ Sync local checkout to VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) vm.upload_checkout(download=download) @vm.command( arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, "nox_session": { "flags": [ "-e", "--nox-session", ], "help": "The nox session name to run in the VM", }, "nox_session_args": { "help": "Extra CLI arguments to pass to pytest", "nargs": "*", "metavar": "NOX_SESSION_ARGS", }, "rerun_failures": { "help": "Re-run test failures", "action": "store_true", }, "skip_requirements_install": { "help": "Skip requirements installation", "action": "store_true", "flags": [ "--sri", "--skip-requirements-install", ], }, "print_tests_selection": { "help": "Print the tests selection", "action": "store_true", "flags": [ "--pts", "--print-tests-selection", ], }, "print_system_info": { "help": "Print the system information", "action": "store_true", "flags": [ "--psi", "--print-system-information", ], }, "print_system_info_only": { "help": "Print the system information and exit", "action": "store_true", "flags": [ "--psio", "--print-system-information-only", ], }, "skip_code_coverage": { "help": "Skip tracking code coverage", "action": "store_true", "flags": [ "--scc", "--skip-code-coverage", ], }, "envvars": { "action": "append", "flags": [ "-E", "--env", ], "help": ( "Environment variable name to forward when running tests. Example: " "'-E VAR1 -E VAR2'." ), }, } ) def test( ctx: Context, name: str, nox_session_args: list[str] = None, nox_session: str = "ci-test-3", rerun_failures: bool = False, skip_requirements_install: bool = False, print_tests_selection: bool = False, print_system_info: bool = False, print_system_info_only: bool = False, skip_code_coverage: bool = False, envvars: list[str] = None, fips: bool = False, ): """ Run test in the VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) env = { "PRINT_TEST_PLAN_ONLY": "0", "SKIP_INITIAL_ONEDIR_FAILURES": "1", "SKIP_INITIAL_GH_ACTIONS_FAILURES": "1", "COVERAGE_CONTEXT": name, } if "LANG" in os.environ: env["LANG"] = os.environ["LANG"] if rerun_failures: env["RERUN_FAILURES"] = "1" if print_tests_selection: env["PRINT_TEST_SELECTION"] = "1" else: env["PRINT_TEST_SELECTION"] = "0" # skip running code coverage for now ## if skip_code_coverage: ## env["SKIP_CODE_COVERAGE"] = "1" ## else: ## env["SKIP_CODE_COVERAGE"] = "0" env["SKIP_CODE_COVERAGE"] = "1" if print_system_info: env["PRINT_SYSTEM_INFO"] = "1" else: env["PRINT_SYSTEM_INFO"] = "0" if print_system_info_only: env["PRINT_SYSTEM_INFO_ONLY"] = "1" else: env["PRINT_SYSTEM_INFO_ONLY"] = "0" if ( skip_requirements_install or os.environ.get("SKIP_REQUIREMENTS_INSTALL", "0") == "1" ): env["SKIP_REQUIREMENTS_INSTALL"] = "1" if "photonos" in name: skip_known_failures = os.environ.get("SKIP_INITIAL_PHOTONOS_FAILURES", "1") env["SKIP_INITIAL_PHOTONOS_FAILURES"] = skip_known_failures if fips: env["FIPS_TESTRUN"] = "1" vm.run(["tdnf", "install", "-y", "openssl-fips-provider"], sudo=True) if envvars: for key in envvars: if key not in os.environ: ctx.warn(f"Environment variable {key!r} not set. Not forwarding") continue env[key] = os.environ[key] returncode = vm.run_nox( nox_session=nox_session, session_args=nox_session_args, env=env, ) ctx.exit(returncode) @vm.command( arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, "nox_session": { "flags": [ "-e", "--nox-session", ], "help": "The nox session name to run in the VM", }, "nox_session_args": { "help": "Extra CLI arguments to pass to pytest", "nargs": "*", "metavar": "NOX_SESSION_ARGS", }, "skip_requirements_install": { "help": "Skip requirements installation", "action": "store_true", "flags": [ "--sri", "--skip-requirements-install", ], }, "envvars": { "action": "append", "flags": [ "-E", "--env", ], "help": ( "Environment variable name to forward when running tests. Example: " "'-E VAR1 -E VAR2'." ), }, } ) def testplan( ctx: Context, name: str, nox_session_args: list[str] = None, nox_session: str = "ci-test-3", skip_requirements_install: bool = False, envvars: list[str] = None, ): """ Run test in the VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) env = { "PRINT_TEST_SELECTION": "1", "PRINT_TEST_PLAN_ONLY": "1", "SKIP_CODE_COVERAGE": "1", "SKIP_INITIAL_GH_ACTIONS_FAILURES": "1", } if ( skip_requirements_install or os.environ.get("SKIP_REQUIREMENTS_INSTALL", "0") == "1" ): env["SKIP_REQUIREMENTS_INSTALL"] = "1" if "photonos" in name: skip_known_failures = os.environ.get("SKIP_INITIAL_PHOTONOS_FAILURES", "1") env["SKIP_INITIAL_PHOTONOS_FAILURES"] = skip_known_failures if envvars: for key in envvars: if key not in os.environ: ctx.warn(f"Environment variable {key!r} not set. Not forwarding") continue env[key] = os.environ[key] returncode = vm.run_nox( nox_session=nox_session, session_args=nox_session_args, env=env, ) ctx.exit(returncode) @vm.command( name="install-dependencies", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, "nox_session": { "flags": [ "-e", "--nox-session", ], "help": "The nox environ to run in the VM", }, }, ) def install_dependencies(ctx: Context, name: str, nox_session: str = "ci-test-3"): """ Install test dependencies on VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) returncode = vm.install_dependencies(nox_session) ctx.exit(returncode) @vm.command( name="pre-archive-cleanup", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, "pkg": { "help": "Perform extended, pre-packaging cleanup routines", }, }, ) def pre_archive_cleanup(ctx: Context, name: str, pkg: bool = False): """ Pre `.nox` directory compress cleanup. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) returncode = vm.run_nox(f"pre-archive-cleanup(pkg={pkg})") ctx.exit(returncode) @vm.command( name="compress-dependencies", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, }, ) def compress_dependencies(ctx: Context, name: str): """ Compress the .nox/ directory in the VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) returncode = vm.compress_dependencies() ctx.exit(returncode) @vm.command( name="decompress-dependencies", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, }, ) def decompress_dependencies(ctx: Context, name: str): """ Decompress a dependencies archive into the .nox/ directory in the VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) returncode = vm.decompress_dependencies() ctx.exit(returncode) @vm.command( name="download-dependencies", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, }, ) def download_dependencies(ctx: Context, name: str): """ Download a compressed .nox/ directory from VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) vm.download_dependencies() @vm.command( name="combine-coverage", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, }, ) def combine_coverage(ctx: Context, name: str): """ Combine the several code coverage files into a single one in the VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) returncode = vm.combine_coverage() ctx.exit(returncode) @vm.command( name="create-xml-coverage-reports", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, }, ) def create_xml_coverage_reports(ctx: Context, name: str): """ Create XML code coverage reports in the VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) returncode = vm.create_xml_coverage_reports() ctx.exit(returncode) @vm.command( name="download-artifacts", arguments={ "name": { "help": "The VM Name", "metavar": "VM_NAME", }, }, ) def download_artifacts(ctx: Context, name: str): """ Download test artifacts from VM. """ vm = VM(ctx=ctx, name=name, region_name=ctx.parser.options.region) vm.download_artifacts() @vm.command( name="sync-cache", arguments={ "key_name": { "help": "The SSH key name. Will default to TOOLS_KEY_NAME in environment" }, "delete": { "help": "Delete the entries in the cache that don't align with ec2", "action": "store_true", }, }, ) def sync_cache( ctx: Context, key_name: str = os.environ.get("RUNNER_NAME") or os.environ.get("TOOLS_KEY_NAME"), # type: ignore[assignment] delete: bool = False, ): """ Sync the cache """ ec2_instances = _filter_instances_by_state( _get_instances_by_key(ctx, key_name), {"running"}, ) cached_instances = {} if STATE_DIR.exists(): for state_path in STATE_DIR.iterdir(): try: instance_id = (state_path / "instance-id").read_text() except FileNotFoundError: if not delete: log.info( "Would remove %s (No valid ID) from cache at %s", state_path.name, state_path, ) else: shutil.rmtree(state_path) log.info( "REMOVED %s (No valid ID) from cache at %s", state_path.name, state_path, ) else: cached_instances[instance_id] = state_path.name # Find what instances we are missing in our cached states to_write = {} to_remove = cached_instances.copy() for instance in ec2_instances: if instance.id not in cached_instances: for tag in instance.tags: if tag.get("Key") == "vm-name": to_write[tag.get("Value")] = instance break else: del to_remove[instance.id] for cached_id, vm_name in to_remove.items(): if delete: shutil.rmtree(STATE_DIR / vm_name) log.info( "REMOVED %s (%s) from cache at %s", vm_name, cached_id.strip(), STATE_DIR / vm_name, ) else: log.info( "Would remove %s (%s) from cache at %s", vm_name, cached_id.strip(), STATE_DIR / vm_name, ) if not delete and to_remove: log.info("To force the removal of the above cache entries, pass --delete") for name_tag, vm_instance in to_write.items(): vm_write = VM(ctx=ctx, name=name_tag, region_name=ctx.parser.options.region) vm_write.instance = vm_instance vm_write.write_state() @vm.command( name="list", arguments={ "key_name": { "help": "The SSH key name. Will default to TOOLS_KEY_NAME in environment" }, "states": { "help": "The instance state to filter by.", "flags": ["-s", "-state"], "action": "append", }, }, ) def list_vms( ctx: Context, key_name: str = os.environ.get("RUNNER_NAME") or os.environ.get("TOOLS_KEY_NAME"), # type: ignore[assignment] states: set[str] = None, ): """ List the vms associated with the given key. """ instances = _filter_instances_by_state( _get_instances_by_key(ctx, key_name), states, ) for instance in instances: vm_state = instance.state["Name"] ip_addr = instance.private_ip_address ami = instance.image_id vm_name = None for tag in instance.tags: if tag.get("Key") == "vm-name": vm_name = tag.get("Value") break if vm_name is not None: sep = "\n " extra_info = { "IP": ip_addr, "AMI": ami, } extras = sep + sep.join( [f"{key}: {value}" for key, value in extra_info.items()] ) log.info("%s (%s)%s", vm_name, vm_state, extras) def _get_instances_by_key(ctx: Context, key_name: str): if key_name is None: ctx.exit(1, "We need a key name to filter the instances by.") ec2 = boto3.resource("ec2", region_name=ctx.parser.options.region) # First let's get the instances on AWS associated with the key given filters = [ {"Name": "key-name", "Values": [key_name]}, ] try: instances = list( ec2.instances.filter( Filters=filters, ) ) except ClientError as exc: if "RequestExpired" not in str(exc): raise ctx.error(str(exc)) ctx.exit(1) return instances def _filter_instances_by_state(instances: list[Instance], states: set[str] | None): if states is None: return instances return [instance for instance in instances if instance.state["Name"] in states] @attr.s(frozen=True, kw_only=True) class AMIConfig: ami: str = attr.ib() ssh_username: str = attr.ib() create_timeout: int = attr.ib(default=5 * 60) connect_timeout: int = attr.ib(default=10 * 60) terminate_timeout: int = attr.ib(default=5 * 60) upload_path: str = attr.ib(default=None) @attr.s(slots=True, kw_only=True, hash=True, repr=False) class VM: ctx: Context = attr.ib() name: str = attr.ib() region_name: str = attr.ib(default=None) # Internal config: AMIConfig = attr.ib(init=False) instance: Instance = attr.ib(init=False, hash=False, default=None) state_dir: pathlib.Path = attr.ib(init=False) ssh_config_file: pathlib.Path = attr.ib(init=False) def __attrs_post_init__(self): self.read_state() if self.is_running: self.write_ssh_config() @config.default def _config_default(self): golden_images = tools.utils.get_golden_images() config = AMIConfig( **{ key: value for (key, value) in golden_images[self.name].items() if key in AMIConfig.__annotations__ } ) log.info("Loaded VM Configuration:\n%s", config) return config @state_dir.default def _state_dir_default(self): state_dir = STATE_DIR / self.name state_dir.mkdir(parents=True, exist_ok=True) return state_dir @ssh_config_file.default def _ssh_config_file_default(self): return self.state_dir / "ssh-config" def read_state(self): self.get_ec2_resource.cache_clear() instance = None ec2_region_path = self.state_dir / "ec2-region" if ec2_region_path.exists(): self.region_name = ec2_region_path.read_text().strip() instance_id_path = self.state_dir / "instance-id" if instance_id_path.exists(): instance_id = instance_id_path.read_text().strip() _instance = self.ec2.Instance(instance_id) try: if _instance.state["Name"] == "running": instance = _instance except ClientError as exc: if "InvalidInstanceID.NotFound" not in str(exc): # This machine no longer exists?! self.ctx.error(str(exc)) self.ctx.exit(1) instance_id_path.unlink() except AttributeError: # This machine no longer exists?! instance_id_path.unlink() self.ctx.info("It appears the cached image no longer exists...") self.ctx.exit(1) if not instance_id_path.exists(): filters = [ {"Name": "tag:vm-name", "Values": [self.name]}, {"Name": "tag:instance-client-id", "Values": [REPO_CHECKOUT_ID]}, ] log.info( "Checking existing instance of %s(%s)...", self.name, self.config.ami ) try: instances = list( self.ec2.instances.filter( Filters=filters, ) ) for _instance in instances: if _instance.state["Name"] == "running": instance = _instance break except ClientError as exc: if "RequestExpired" not in str(exc): raise self.ctx.error(str(exc)) self.ctx.exit(1) if instance: self.instance = instance def write_state(self): ec2_region_path = self.state_dir / "ec2-region" if self.region_name: ec2_region_path.write_text(self.region_name) instance_id_path = self.state_dir / "instance-id" if self.id: instance_id_path.write_text(self.id) self.write_ssh_config() def write_ssh_config(self): if self.ssh_config_file.exists(): if ( f"Hostname {self.instance.private_ip_address}" in self.ssh_config_file.read_text() ): # If what's on config matches, then we're good return if os.environ.get("CI") is not None: forward_agent = "no" else: forward_agent = "yes" ciphers = "" if "photonos" in self.name: ciphers = "Ciphers=aes256-gcm@openssh.com,aes256-cbc,aes256-ctr,chacha20-poly1305@openssh.com,aes128-ctr,aes192-ctr,aes128-gcm@openssh.com" ssh_config = textwrap.dedent( f"""\ Host {self.name} Hostname {self.instance.private_ip_address} User {self.config.ssh_username} ControlMaster=no Compression=yes LogLevel=FATAL StrictHostKeyChecking=no UserKnownHostsFile=/dev/null ForwardAgent={forward_agent} PasswordAuthentication=no {ciphers} """ ) self.ssh_config_file.write_text(ssh_config) def create( self, key_name=None, instance_type=None, no_delete=False, environment=None, ): if self.is_running: log.info("%r is already running...", self) return True self.get_ec2_resource.cache_clear() if environment is None: environment = tools.utils.SPB_ENVIRONMENT create_timeout = self.config.create_timeout create_timeout_progress = 0 ssh_connection_timeout = self.config.connect_timeout ssh_connection_timeout_progress = 0 started_in_ci = os.environ.get("RUNNER_NAME") is not None tags = [ {"Key": "vm-name", "Value": self.name}, {"Key": "instance-client-id", "Value": REPO_CHECKOUT_ID}, {"Key": "started-in-ci", "Value": str(started_in_ci).lower()}, {"Key": "no-delete", "Value": str(no_delete).lower()}, ] client = boto3.client("ec2", region_name=self.region_name) # Let's search for the launch template corresponding to this AMI launch_template_name = None next_token = "" try: while True: response = response = client.describe_launch_templates( Filters=[ { "Name": "tag:spb:is-golden-image-template", "Values": ["true"], }, { "Name": "tag:spb:project", "Values": ["salt-project"], }, { "Name": "tag:spb:environment", "Values": [environment], }, { "Name": "tag:spb:image-id", "Values": [self.config.ami], }, ], NextToken=next_token, ) log.debug( "Search for launch template response:\n%s", pprint.pformat(response), ) for details in response.get("LaunchTemplates"): if launch_template_name is not None: log.warning( "Multiple launch templates for the same AMI. This is not " "supposed to happen. Picked the first one listed: %s", response, ) break launch_template_name = details["LaunchTemplateName"] if launch_template_name is not None: break next_token = response.get("NextToken") if next_token: continue self.ctx.error(f"Could not find a launch template for {self.name!r}") self.ctx.exit(1) except ClientError as exc: self.ctx.error(f"Could not find a launch template for {self.name!r}: {exc}") self.ctx.exit(1) try: data = client.describe_launch_template_versions( LaunchTemplateName=launch_template_name ) except ClientError as exc: if "InvalidLaunchTemplateName." not in str(exc): raise self.ctx.error(f"Could not find a launch template for {self.name!r}") self.ctx.exit(1) # The newest template comes first template_data = data["LaunchTemplateVersions"][0]["LaunchTemplateData"] security_group_ids = template_data["SecurityGroupIds"] vpc = None subnets = {} for sg_id in security_group_ids: sg = self.ec2.SecurityGroup(sg_id) vpc = self.ec2.Vpc(sg.vpc_id) for subnet in vpc.subnets.all(): for tag in subnet.tags: if tag["Key"] != "Name": continue private_value = f"-{environment}-vpc-private-" if private_value in tag["Value"]: subnets[subnet.id] = subnet.available_ip_address_count break if subnets: # Let's not process the other security group(s), if any break chosen_subnet, _ = sorted(subnets.items(), reverse=True)[0] network_interfaces = None if started_in_ci: log.info("Starting CI configured VM") else: # This is a developer running log.info( "Starting Developer configured VM In Environment '%s'", environment ) security_group_filters = [ { "Name": "vpc-id", "Values": [vpc.id], }, { "Name": "tag:spb:project", "Values": ["salt-project"], }, ] response = client.describe_security_groups(Filters=security_group_filters) if not response.get("SecurityGroups"): self.ctx.error( "Could not find the right security group for developers. " f"Filters:\n{pprint.pformat(security_group_filters)}" ) self.ctx.exit(1) # Override the launch template network interfaces config security_group_ids = [sg["GroupId"] for sg in response["SecurityGroups"]] security_group_filters = [ { "Name": "vpc-id", "Values": [vpc.id], }, { "Name": "tag:Name", "Values": [f"saltproject-{environment}-client-vpn-remote-access"], }, ] response = client.describe_security_groups(Filters=security_group_filters) if not response.get("SecurityGroups"): self.ctx.error( "Could not find the right VPN access security group. " f"Filters:\n{pprint.pformat(security_group_filters)}" ) self.ctx.exit(1) security_group_ids.extend( [sg["GroupId"] for sg in response["SecurityGroups"]] ) progress = create_progress_bar() create_task = progress.add_task( f"Starting {self!r} in {self.region_name!r} with ssh key named {key_name!r}...", total=create_timeout, ) if os.environ.get("CI") is not None: job = os.environ["GITHUB_JOB"] ref = os.environ["GITHUB_REF"] repo = os.environ["GITHUB_REPOSITORY"] actor = ( os.environ.get("GITHUB_TRIGGERING_ACTOR") or os.environ["GITHUB_ACTOR"] ) if "pull" in ref: ref = f"pr-{ref.split('/')[2]}" elif "tags" in ref: ref = f"tag-{ref.split('/')[-1]}" else: ref = ref.split("/")[-1] tests_chunk = os.environ.get("TESTS_CHUNK") if tests_chunk is None: tests_chunk = "" else: tags.append( { "Key": "TESTS_CHUNK", "Value": tests_chunk, } ) tests_chunk = f" - {tests_chunk}" name = f"{self.name} - {repo} - {ref} - {job}{tests_chunk} - {actor}" for key in os.environ: if not key.startswith("GITHUB_"): continue if key in ( "GITHUB_ACTIONS", "GITHUB_API_URL", "GITHUB_ENV", "GITHUB_EVENT_PATH", "GITHUB_GRAPHQL_URL", "GITHUB_OUTPUT", "GITHUB_PATH", "GITHUB_REPOSITORY_OWNER", "GITHUB_RETENTION_DAYS", "GITHUB_STATE", "GITHUB_STEP_SUMMARY", ): continue value = os.environ.get(key) if not value: continue tags.append( { "Key": f"gh:{key}", "Value": value, } ) else: name = f"{self.name} started on {datetime.utcnow()}" tags.append( { "Key": "Name", "Value": name, } ) with progress: start = time.time() create_kwargs = dict( MinCount=1, MaxCount=1, KeyName=key_name, TagSpecifications=[ { "ResourceType": "instance", "Tags": tags, } ], LaunchTemplate={ "LaunchTemplateName": launch_template_name, }, SecurityGroupIds=security_group_ids, SubnetId=chosen_subnet, ) if instance_type: # The user provided a custom instance type create_kwargs["InstanceType"] = instance_type if network_interfaces is not None: # This is a developer configured VM create_kwargs["NetworkInterfaces"] = network_interfaces # Create the VM try: response = self.ec2.create_instances(**create_kwargs) except ClientError as exc: progress.stop() self.ctx.exit(1, str(exc)) for _instance in response: self.instance = _instance stop = time.time() create_timeout_progress += stop - start progress.update( create_task, description=f"{self!r} created...", completed=create_timeout_progress, ) # Wait until the VM is running while create_timeout_progress <= create_timeout: time.sleep(1) create_timeout_progress += 1 if self.is_running: progress.update( create_task, description=f"{self!r} is running.", completed=create_timeout, ) self.write_state() break progress.update( create_task, description=f"Waiting until {self!r} is running...", completed=create_timeout_progress, ) else: error = f"Failed to create {self!r}" progress.update( create_task, description=error, completed=create_timeout, ) return error # Wait until we can SSH into the VM host = self.instance.private_ip_address progress = create_progress_bar() connect_task = progress.add_task( f"Waiting for SSH to become available at {host} ...", total=ssh_connection_timeout, ) with progress: proc = None checks = 0 last_error = None connection_refused_or_reset = False while ssh_connection_timeout_progress <= ssh_connection_timeout: start = time.time() if proc is None: checks = 0 stderr = None proc = subprocess.Popen( self.ssh_command_args( "exit", "0", log_command_level=logging.DEBUG, ssh_options=[ "-oLogLevel=INFO", "-oConnectTimeout=5", "-oConnectionAttempts=1", ], ), stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, shell=False, ) checks += 1 try: wait_start = time.time() proc.wait(timeout=3) progress.update( connect_task, completed=ssh_connection_timeout_progress, description=f"Waiting for SSH to become available at {host} ...", ) if proc.returncode == 0: progress.update( connect_task, description=f"SSH connection to {host} available!", completed=ssh_connection_timeout, ) break proc.wait(timeout=3) stderr = proc.stderr.read().strip() if connection_refused_or_reset is False and ( "connection refused" in stderr.lower() or "connection reset" in stderr.lower() ): connection_refused_or_reset = True if stderr: stderr = f" Last Error: {stderr}" last_error = stderr proc = None if time.time() - wait_start < 1: # Process exited too fast, sleep a little longer time.sleep(5) except KeyboardInterrupt: return except subprocess.TimeoutExpired: pass ssh_connection_timeout_progress += time.time() - start progress.update( connect_task, completed=ssh_connection_timeout_progress, description=f"Waiting for SSH to become available at {host} ...{stderr or ''}", ) if connection_refused_or_reset: # Since ssh is now running, and we're actually getting a connection # refused error message, let's try to ssh a little slower in order not # to get blocked time.sleep(10) if checks >= 10 and proc is not None: proc.kill() proc = None else: error = f"Failed to establish an ssh connection to {host}" if last_error: error += f". {last_error}" return error return True def destroy(self, no_wait: bool = False): try: if not self.is_running: log.info("%r is not running...", self) return timeout = self.config.terminate_timeout timeout_progress = 0.0 progress = create_progress_bar() task = progress.add_task(f"Terminating {self!r}...", total=timeout) self.instance.terminate() try: with progress: while timeout_progress <= timeout: start = time.time() time.sleep(1) if no_wait and not self.is_running: log.info( "%r started the destroy process. Not waiting " "for completion of that process.", self, ) break if self.state == "terminated": progress.update( task, description=f"{self!r} terminated.", completed=timeout, ) break timeout_progress += time.time() - start progress.update( task, description=f"Terminating {self!r}...", completed=timeout_progress, ) else: progress.update( task, description=f"Failed to terminate {self!r}.", completed=timeout, ) except KeyboardInterrupt: pass finally: shutil.rmtree(self.state_dir, ignore_errors=True) self.instance = None def upload_checkout(self, verbose=True, download=False): rsync_flags = [ "--delete", "--no-group", "--no-owner", "--exclude", ".nox/", "--exclude", ".tools-venvs/", "--exclude", ".pytest_cache/", "--exclude", f"{STATE_DIR.relative_to(tools.utils.REPO_ROOT)}{os.path.sep}", "--exclude", "*.py~", # We need to include artifacts/ to be able to include artifacts/salt "--include", "artifacts/", "--include", "artifacts/salt", "--include", "artifacts/pkg", # But we also want to exclude all other entries under artifacts/ "--exclude", "artifacts/*", ] if self.is_windows: # Symlinks aren't handled properly on windows, just replace the # symlink with a copy of what's getting symlinked. rsync_flags.append("--copy-links") # Local repo path source = f"{tools.utils.REPO_ROOT}{os.path.sep}" # Remote repo path remote_path = self.upload_path.as_posix() rsync_remote_path = remote_path if sys.platform == "win32": for drive in ("c:", "C:"): source = source.replace(drive, "/cygdrive/c") source = source.replace("\\", "/") if self.is_windows: for drive in ("c:", "C:"): rsync_remote_path = rsync_remote_path.replace(drive, "/cygdrive/c") destination = f"{self.name}:{rsync_remote_path}" if download: description = "Rsync VM to local checkout..." self.rsync(f"{destination}/*", source, description, rsync_flags) else: description = "Rsync local checkout to VM..." self.rsync(source, destination, description, rsync_flags) if self.is_windows: # rsync sets very strict file permissions and disables inheritance # we only need to reset permissions so they inherit from the parent cmd = ["icacls", remote_path, "/T", "/reset"] ret = self.run(cmd, capture=True, check=False, utf8=False) if ret.returncode != 0: self.ctx.exit(ret.returncode, ret.stderr.strip()) def write_and_upload_dot_env(self, env: dict[str, str]): if not env: return write_env = {k: str(v) for (k, v) in env.items()} write_env["TOOLS_DISTRO_SLUG"] = self.name write_env_filename = ".ci-env" write_env_filepath = tools.utils.REPO_ROOT / ".ci-env" write_env_filepath.write_text(json.dumps(write_env)) # Local path source = str(write_env_filepath) # Remote repo path remote_path = self.upload_path.joinpath(write_env_filename).as_posix() if self.is_windows: for drive in ("c:", "C:"): remote_path = remote_path.replace(drive, "/cygdrive/c") destination = f"{self.name}:{remote_path}" description = f"Uploading {write_env_filename} ..." self.rsync(source, destination, description) write_env_filepath.unlink() def run( self, command: list[str], check: bool = True, sudo: bool = False, capture: bool = False, pseudo_terminal: bool = False, env: list[str] = None, log_command_level: int = logging.INFO, utf8: bool = True, ): if not self.is_running: self.ctx.exit(1, message=f"{self!r} is not running") if env is None: env = [] if utf8: env.append("PYTHONUTF8=1") self.write_ssh_config() try: ssh_command = self.ssh_command_args( *command, sudo=sudo, pseudo_terminal=pseudo_terminal, env=env, log_command_level=log_command_level, ) log.debug("Running %r ...", ssh_command) return self.ctx.run( *ssh_command, check=check, capture=capture, interactive=pseudo_terminal, no_output_timeout_secs=self.ctx.parser.options.no_output_timeout_secs, ) except subprocess.CalledProcessError as exc: log.error(str(exc)) self.ctx.exit(exc.returncode) except (KeyboardInterrupt, SystemExit): pass def run_nox( self, nox_session: str, session_args: list[str] = None, nox_args: list[str] = None, env: dict[str, str] = None, ): cmd = [ "nox", "--force-color", "-f", f"{self.upload_path.joinpath('noxfile.py').as_posix()}", "-e", f'"{nox_session}"', ] if nox_args: cmd += nox_args if session_args: cmd += ["--"] + session_args if env is None: env = {} for key in ("CI", "PIP_INDEX_URL", "PIP_TRUSTED_HOST", "PIP_EXTRA_INDEX_URL"): if key in os.environ: env[key] = os.environ[key] env["PYTHONUTF8"] = "1" env["OUTPUT_COLUMNS"] = str(self.ctx.console.width) env["GITHUB_ACTIONS_PIPELINE"] = "1" env["RAISE_DEPRECATIONS_RUNTIME_ERRORS"] = "1" self.write_and_upload_dot_env(env) if self.is_windows is False and self.config.ssh_username != "root": sudo = True else: sudo = False ret = self.run( cmd, sudo=sudo, check=False, capture=False, pseudo_terminal=True, ) return ret.returncode def combine_coverage(self): """ Combine the code coverage databases """ return self.run_nox("combine-coverage-onedir") def create_xml_coverage_reports(self): """ Create XML coverage reports """ return self.run_nox("create-xml-coverage-reports-onedir") def compress_dependencies(self): """ Compress .nox/ into nox..tar.* in the VM """ platform, arch = tools.utils.get_platform_and_arch_from_slug(self.name) return self.run_nox("compress-dependencies", session_args=[platform, arch]) def decompress_dependencies(self): """ Decompress nox..tar.* if it exists in the VM """ env = {"DELETE_NOX_ARCHIVE": "1"} platform, arch = tools.utils.get_platform_and_arch_from_slug(self.name) return self.run_nox( "decompress-dependencies", session_args=[platform, arch], env=env ) def download_dependencies(self): """ Download nox..tar.* from VM """ if self.is_windows: extension = "tar.gz" else: extension = "tar.xz" platform, arch = tools.utils.get_platform_and_arch_from_slug(self.name) dependencies_filename = f"nox.{platform}.{arch}.{extension}" remote_path = self.upload_path.joinpath(dependencies_filename).as_posix() if self.is_windows: for drive in ("c:", "C:"): remote_path = remote_path.replace(drive, "/cygdrive/c") source = f"{self.name}:{remote_path}" destination = "." description = f"Downloading {dependencies_filename} ..." self.rsync(source, destination, description) def download_artifacts(self): """ Download /artifacts from VM """ remote_path = self.upload_path.joinpath("artifacts").as_posix() if self.is_windows: for drive in ("c:", "C:"): remote_path = remote_path.replace(drive, "/cygdrive/c") source = f"{self.name}:{remote_path}/" destination = "artifacts/" description = f"Downloading {source} ..." self.rsync( source, destination, description, [ "--exclude", f"{remote_path}/artifacts/salt", "--exclude", f"{remote_path}/artifacts/salt-*.*", ], ) def rsync(self, source, destination, description, rsync_flags: list[str] = None): """ Rsync source into destination while showing progress. """ rsync = shutil.which("rsync") if not rsync: self.ctx.exit(1, "Could find the 'rsync' binary") if TYPE_CHECKING: assert rsync ssh_cmd = " ".join( self.ssh_command_args( include_vm_target=False, log_command_level=logging.NOTSET ) ) cmd: list[str] = [ f'"{rsync}"' if sys.platform == "win32" else rsync, "-az", "--info=none,progress2", "-e", f'"{ssh_cmd}"' if sys.platform == "win32" else ssh_cmd, ] if rsync_flags: cmd.extend(rsync_flags) cmd.extend( [ source, destination, ] ) log.info("Running '%s'", " ".join(cmd)) # type: ignore[arg-type] progress = create_progress_bar(transient=True) task = progress.add_task(description, total=100) if sys.platform == "win32": cmd = [" ".join(cmd)] with progress: proc = subprocess.Popen(cmd, bufsize=1, stdout=subprocess.PIPE, text=True) completed = 0 while proc.poll() is None: if TYPE_CHECKING: assert proc.stdout parts = proc.stdout.readline().strip().split() if parts: completed = int(parts[1][:-1]) progress.update(task, completed=completed) progress.update(task, completed=100) def install_dependencies(self, nox_session: str): """ Install test dependencies in VM. """ return self.run_nox( nox_session, nox_args=["--install-only"], env={"PRINT_TEST_SELECTION": "0", "PRINT_SYSTEM_INFO": "0"}, ) def __repr__(self): return ( f"VM(name={self.name!r}, ami={self.config.ami!r}, id={self.id!r}, " f"region={self.region_name!r} state={self.state!r})" ) def ssh_command_args( self, *command: str, sudo: bool = False, include_vm_target: bool = True, pseudo_terminal: bool = False, env: list[str] = None, log_command_level: int = logging.INFO, ssh_options: list[str] | None = None, # pylint: disable=bad-whitespace ) -> list[str]: ssh = shutil.which("ssh") if TYPE_CHECKING: assert ssh _ssh_command_args = [ ssh, "-F", str(self.ssh_config_file.relative_to(tools.utils.REPO_ROOT)), ] if ssh_options: _ssh_command_args.extend(ssh_options) if pseudo_terminal is True: _ssh_command_args.append("-t") if include_vm_target: _ssh_command_args.append(self.name) remote_command = [] if command: remote_command.append("--") if sudo: remote_command.append("sudo") if env: remote_command.append("env") remote_command.extend(env) remote_command.extend(list(command)) log.log( log_command_level, "Running '%s' in %s", " ".join(remote_command[1:]), self.name, ) _ssh_command_args.extend(remote_command) return _ssh_command_args @property def is_windows(self): return "windows" in self.name @lru_cache(maxsize=1) def get_ec2_resource(self): return boto3.resource("ec2", region_name=self.region_name) @property def ec2(self): return self.get_ec2_resource() @property def id(self) -> str | None: if self.is_running: return cast(str, self.instance.id) return None @property def is_running(self) -> bool: if self.instance is None: return False running: bool = self.state == "running" return running @property def state(self) -> str | None: _state: str | None = None if self.instance: try: self.instance.reload() _state = self.instance.state["Name"] except ClientError: pass return _state @property def tempdir(self): return self.get_remote_tempdir() @lru_cache(maxsize=1) def get_remote_tempdir(self): cmd = [ "-c", "import sys,tempfile; sys.stdout.write(tempfile.gettempdir()); sys.stdout.flush();", ] if self.is_windows is False: cmd.insert(0, "python3") else: cmd.insert(0, "python") ret = self.run(cmd, capture=True, check=False) if ret.returncode != 0: self.ctx.exit(ret.returncode, ret.stderr.strip()) return ret.stdout.strip() @property def upload_path(self): return self.get_remote_upload_path() @lru_cache(maxsize=1) def get_remote_upload_path(self): if self.config.upload_path: return pathlib.Path(self.config.upload_path) if self.is_windows: return pathlib.PureWindowsPath(r"c:\Windows\Temp\testing") return pathlib.Path("/tmp/testing") def create_progress_bar(**kwargs): return Progress( TextColumn( "[progress.description]{task.description}", table_column=Column(ratio=3) ), BarColumn(), TaskProgressColumn(), TimeRemainingColumn(), expand=True, **kwargs, )