From 171b8a1313c62f5162f2f9dd784ef8512eb1b795 Mon Sep 17 00:00:00 2001 From: Nathan Park Date: Thu, 10 Jul 2025 15:25:05 -0700 Subject: [PATCH 1/4] fix: sanitize git clone repo input url --- src/sagemaker/git_utils.py | 61 +++++++++- tests/unit/test_git_utils.py | 216 ++++++++++++++++++++++++++++++++++- 2 files changed, 273 insertions(+), 4 deletions(-) diff --git a/src/sagemaker/git_utils.py b/src/sagemaker/git_utils.py index 49d151a00b..51223d7354 100644 --- a/src/sagemaker/git_utils.py +++ b/src/sagemaker/git_utils.py @@ -14,14 +14,69 @@ from __future__ import absolute_import import os -from pathlib import Path +import re import subprocess import tempfile import warnings +from pathlib import Path +from urllib.parse import urlparse + import six from six.moves import urllib +def _sanitize_git_url(repo_url): + """Sanitize Git repository URL to prevent URL injection attacks. + + Args: + repo_url (str): The Git repository URL to sanitize + + Returns: + str: The sanitized URL + + Raises: + ValueError: If the URL contains suspicious patterns that could indicate injection + """ + at_count = repo_url.count('@') + + if repo_url.startswith('git@'): + # git@ format requires exactly one @ + if at_count != 1: + raise ValueError("Invalid SSH URL format: git@ URLs must have exactly one @ symbol") + elif repo_url.startswith('ssh://'): + # ssh:// format can have 0 or 1 @ symbols + if at_count > 1: + raise ValueError("Invalid SSH URL format: multiple @ symbols detected") + elif repo_url.startswith('https://') or repo_url.startswith('http://'): + # HTTPS format allows 0 or 1 @ symbols + if at_count > 1: + raise ValueError("Invalid HTTPS URL format: multiple @ symbols detected") + + try: + parsed = urlparse(repo_url) + + # Check for suspicious characters in hostname that could indicate injection + if parsed.hostname: + # Check for URL-encoded characters that might be used for obfuscation + suspicious_patterns = ['%25', '%40', '%2F', '%3A'] # encoded %, @, /, : + for pattern in suspicious_patterns: + if pattern in parsed.hostname.lower(): + raise ValueError(f"Suspicious URL encoding detected in hostname: {pattern}") + + # Validate that the hostname looks legitimate + if not re.match(r'^[a-zA-Z0-9.-]+$', parsed.hostname): + raise ValueError("Invalid characters in hostname") + + except Exception as e: + if isinstance(e, ValueError): + raise + raise ValueError(f"Failed to parse URL: {str(e)}") + else: + raise ValueError("Unsupported URL scheme: only https://, http://, git@, and ssh:// are allowed") + + return repo_url + + def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): """Git clone repo containing the training code and serving code. @@ -87,6 +142,10 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): if entry_point is None: raise ValueError("Please provide an entry point.") _validate_git_config(git_config) + + # SECURITY: Sanitize the repository URL to prevent injection attacks + git_config["repo"] = _sanitize_git_url(git_config["repo"]) + dest_dir = tempfile.mkdtemp() _generate_and_run_clone_command(git_config, dest_dir) diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index 03bbc1ebcd..6cbd2cef88 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -12,11 +12,12 @@ # language governing permissions and limitations under the License. from __future__ import absolute_import -import pytest import os -from pathlib import Path import subprocess -from mock import patch, ANY +from pathlib import Path + +import pytest +from mock import ANY, patch from sagemaker import git_utils @@ -494,3 +495,212 @@ def test_git_clone_repo_codecommit_https_creds_not_stored_locally(tempdir, mkdte with pytest.raises(subprocess.CalledProcessError) as error: git_utils.git_clone_repo(git_config, entry_point) assert "returned non-zero exit status" in str(error.value) + + +# ============================================================================ +# URL Sanitization Tests - Security vulnerability prevention +# ============================================================================ + +class TestGitUrlSanitization: + """Test cases for Git URL sanitization to prevent injection attacks.""" + + def test_sanitize_git_url_valid_https_urls(self): + """Test that valid HTTPS URLs pass sanitization.""" + valid_urls = [ + "https://github.com/user/repo.git", + "https://gitlab.com/user/repo.git", + "https://token@github.com/user/repo.git", + "https://user:pass@github.com/user/repo.git", + "http://internal-git.company.com/repo.git", + ] + + for url in valid_urls: + # Should not raise any exception + result = git_utils._sanitize_git_url(url) + assert result == url + + def test_sanitize_git_url_valid_ssh_urls(self): + """Test that valid SSH URLs pass sanitization.""" + valid_urls = [ + "git@github.com:user/repo.git", + "git@gitlab.com:user/repo.git", + "ssh://git@github.com/user/repo.git", + "ssh://git-codecommit.us-west-2.amazonaws.com/v1/repos/test-repo/", # 0 @ symbols - valid for ssh:// + "git@internal-git.company.com:repo.git", + ] + + for url in valid_urls: + # Should not raise any exception + result = git_utils._sanitize_git_url(url) + assert result == url + + def test_sanitize_git_url_blocks_multiple_at_https(self): + """Test that HTTPS URLs with multiple @ symbols are blocked.""" + malicious_urls = [ + "https://user@attacker.com@github.com/repo.git", + "https://token@evil.com@gitlab.com/user/repo.git", + "https://a@b@c@github.com/repo.git", + "https://user@malicious-host@github.com/legit/repo.git", + ] + + for url in malicious_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "multiple @ symbols detected" in str(error.value) + + def test_sanitize_git_url_blocks_multiple_at_ssh(self): + """Test that SSH URLs with multiple @ symbols are blocked.""" + malicious_urls = [ + "git@attacker.com@github.com:repo.git", + "git@evil@gitlab.com:user/repo.git", + "ssh://git@malicious@github.com/repo.git", + "git@a@b@c:repo.git", + ] + + for url in malicious_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + # git@ URLs should give "exactly one @ symbol" error + # ssh:// URLs should give "multiple @ symbols detected" error + assert any(phrase in str(error.value) for phrase in [ + "multiple @ symbols detected", + "exactly one @ symbol" + ]) + + def test_sanitize_git_url_blocks_invalid_schemes_and_git_at_format(self): + """Test that invalid schemes and git@ format violations are blocked.""" + # Test unsupported schemes + unsupported_scheme_urls = [ + "git-github.com:user/repo.git", # Doesn't start with git@, ssh://, http://, https:// + ] + + for url in unsupported_scheme_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "Unsupported URL scheme" in str(error.value) + + # Test git@ URLs with wrong @ count + invalid_git_at_urls = [ + "git@github.com@evil.com:repo.git", # 2 @ symbols + ] + + for url in invalid_git_at_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "exactly one @ symbol" in str(error.value) + + def test_sanitize_git_url_blocks_url_encoding_obfuscation(self): + """Test that URL-encoded obfuscation attempts are blocked.""" + obfuscated_urls = [ + "https://github.com%25evil.com/repo.git", + "https://user@github.com%40attacker.com/repo.git", + "https://github.com%2Fevil.com/repo.git", + "https://github.com%3Aevil.com/repo.git", + ] + + for url in obfuscated_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + # The error could be either suspicious encoding or invalid characters + assert any(phrase in str(error.value) for phrase in [ + "Suspicious URL encoding detected", + "Invalid characters in hostname" + ]) + + def test_sanitize_git_url_blocks_invalid_hostname_chars(self): + """Test that hostnames with invalid characters are blocked.""" + invalid_urls = [ + "https://github", + ] + + for url in unsupported_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "Unsupported URL scheme" in str(error.value) + + def test_git_clone_repo_blocks_malicious_https_url(self): + """Test that git_clone_repo blocks malicious HTTPS URLs.""" + malicious_git_config = { + "repo": "https://user@attacker.com@github.com/legit/repo.git", + "branch": "main" + } + entry_point = "train.py" + + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(malicious_git_config, entry_point) + assert "multiple @ symbols detected" in str(error.value) + + def test_git_clone_repo_blocks_malicious_ssh_url(self): + """Test that git_clone_repo blocks malicious SSH URLs.""" + malicious_git_config = { + "repo": "git@OBVIOUS@github.com:sage-maker/temp-sev2.git", + "branch": "main" + } + entry_point = "train.py" + + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(malicious_git_config, entry_point) + assert "exactly one @ symbol" in str(error.value) + + def test_git_clone_repo_blocks_url_encoded_attack(self): + """Test that git_clone_repo blocks URL-encoded attacks.""" + malicious_git_config = { + "repo": "https://github.com%40attacker.com/repo.git", + "branch": "main" + } + entry_point = "train.py" + + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(malicious_git_config, entry_point) + assert "Suspicious URL encoding detected" in str(error.value) + + def test_sanitize_git_url_comprehensive_attack_scenarios(self): + """Test comprehensive attack scenarios from the vulnerability report.""" + # These are the exact attack patterns from the security report + attack_scenarios = [ + # Original PoC attack + "https://USER@YOUR_NGROK_OR_LOCALHOST/malicious.git@github.com%25legit%25repo.git", + # Variations of the attack + "https://user@malicious-host@github.com/legit/repo.git", + "git@attacker.com@github.com:user/repo.git", + "ssh://git@evil.com@github.com/repo.git", + # URL encoding variations + "https://github.com%40evil.com/repo.git", + "https://user@github.com%2Fevil.com/repo.git", + ] + + entry_point = "train.py" + + for malicious_url in attack_scenarios: + git_config = {"repo": malicious_url} + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(git_config, entry_point) + # Should be blocked by sanitization + assert any(phrase in str(error.value) for phrase in [ + "multiple @ symbols detected", + "exactly one @ symbol", + "Suspicious URL encoding detected", + "Invalid characters in hostname" + ]) From d2d9e193a26e913adf66fa6f5900c330f756bfdd Mon Sep 17 00:00:00 2001 From: Nathan Park Date: Thu, 10 Jul 2025 15:41:05 -0700 Subject: [PATCH 2/4] fix formatting --- src/sagemaker/git_utils.py | 20 +++++---- tests/unit/test_git_utils.py | 81 ++++++++++++++++++++---------------- 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/src/sagemaker/git_utils.py b/src/sagemaker/git_utils.py index 51223d7354..24b2c94bd0 100644 --- a/src/sagemaker/git_utils.py +++ b/src/sagemaker/git_utils.py @@ -37,17 +37,17 @@ def _sanitize_git_url(repo_url): Raises: ValueError: If the URL contains suspicious patterns that could indicate injection """ - at_count = repo_url.count('@') + at_count = repo_url.count("@") - if repo_url.startswith('git@'): + if repo_url.startswith("git@"): # git@ format requires exactly one @ if at_count != 1: raise ValueError("Invalid SSH URL format: git@ URLs must have exactly one @ symbol") - elif repo_url.startswith('ssh://'): + elif repo_url.startswith("ssh://"): # ssh:// format can have 0 or 1 @ symbols if at_count > 1: raise ValueError("Invalid SSH URL format: multiple @ symbols detected") - elif repo_url.startswith('https://') or repo_url.startswith('http://'): + elif repo_url.startswith("https://") or repo_url.startswith("http://"): # HTTPS format allows 0 or 1 @ symbols if at_count > 1: raise ValueError("Invalid HTTPS URL format: multiple @ symbols detected") @@ -58,13 +58,13 @@ def _sanitize_git_url(repo_url): # Check for suspicious characters in hostname that could indicate injection if parsed.hostname: # Check for URL-encoded characters that might be used for obfuscation - suspicious_patterns = ['%25', '%40', '%2F', '%3A'] # encoded %, @, /, : + suspicious_patterns = ["%25", "%40", "%2F", "%3A"] # encoded %, @, /, : for pattern in suspicious_patterns: if pattern in parsed.hostname.lower(): raise ValueError(f"Suspicious URL encoding detected in hostname: {pattern}") # Validate that the hostname looks legitimate - if not re.match(r'^[a-zA-Z0-9.-]+$', parsed.hostname): + if not re.match(r"^[a-zA-Z0-9.-]+$", parsed.hostname): raise ValueError("Invalid characters in hostname") except Exception as e: @@ -72,7 +72,9 @@ def _sanitize_git_url(repo_url): raise raise ValueError(f"Failed to parse URL: {str(e)}") else: - raise ValueError("Unsupported URL scheme: only https://, http://, git@, and ssh:// are allowed") + raise ValueError( + "Unsupported URL scheme: only https://, http://, git@, and ssh:// are allowed" + ) return repo_url @@ -142,10 +144,10 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): if entry_point is None: raise ValueError("Please provide an entry point.") _validate_git_config(git_config) - + # SECURITY: Sanitize the repository URL to prevent injection attacks git_config["repo"] = _sanitize_git_url(git_config["repo"]) - + dest_dir = tempfile.mkdtemp() _generate_and_run_clone_command(git_config, dest_dir) diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index 6cbd2cef88..d48a505067 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -501,6 +501,7 @@ def test_git_clone_repo_codecommit_https_creds_not_stored_locally(tempdir, mkdte # URL Sanitization Tests - Security vulnerability prevention # ============================================================================ + class TestGitUrlSanitization: """Test cases for Git URL sanitization to prevent injection attacks.""" @@ -513,7 +514,7 @@ def test_sanitize_git_url_valid_https_urls(self): "https://user:pass@github.com/user/repo.git", "http://internal-git.company.com/repo.git", ] - + for url in valid_urls: # Should not raise any exception result = git_utils._sanitize_git_url(url) @@ -528,7 +529,7 @@ def test_sanitize_git_url_valid_ssh_urls(self): "ssh://git-codecommit.us-west-2.amazonaws.com/v1/repos/test-repo/", # 0 @ symbols - valid for ssh:// "git@internal-git.company.com:repo.git", ] - + for url in valid_urls: # Should not raise any exception result = git_utils._sanitize_git_url(url) @@ -542,7 +543,7 @@ def test_sanitize_git_url_blocks_multiple_at_https(self): "https://a@b@c@github.com/repo.git", "https://user@malicious-host@github.com/legit/repo.git", ] - + for url in malicious_urls: with pytest.raises(ValueError) as error: git_utils._sanitize_git_url(url) @@ -556,16 +557,16 @@ def test_sanitize_git_url_blocks_multiple_at_ssh(self): "ssh://git@malicious@github.com/repo.git", "git@a@b@c:repo.git", ] - + for url in malicious_urls: with pytest.raises(ValueError) as error: git_utils._sanitize_git_url(url) # git@ URLs should give "exactly one @ symbol" error # ssh:// URLs should give "multiple @ symbols detected" error - assert any(phrase in str(error.value) for phrase in [ - "multiple @ symbols detected", - "exactly one @ symbol" - ]) + assert any( + phrase in str(error.value) + for phrase in ["multiple @ symbols detected", "exactly one @ symbol"] + ) def test_sanitize_git_url_blocks_invalid_schemes_and_git_at_format(self): """Test that invalid schemes and git@ format violations are blocked.""" @@ -573,17 +574,17 @@ def test_sanitize_git_url_blocks_invalid_schemes_and_git_at_format(self): unsupported_scheme_urls = [ "git-github.com:user/repo.git", # Doesn't start with git@, ssh://, http://, https:// ] - + for url in unsupported_scheme_urls: with pytest.raises(ValueError) as error: git_utils._sanitize_git_url(url) assert "Unsupported URL scheme" in str(error.value) - + # Test git@ URLs with wrong @ count invalid_git_at_urls = [ "git@github.com@evil.com:repo.git", # 2 @ symbols ] - + for url in invalid_git_at_urls: with pytest.raises(ValueError) as error: git_utils._sanitize_git_url(url) @@ -597,15 +598,15 @@ def test_sanitize_git_url_blocks_url_encoding_obfuscation(self): "https://github.com%2Fevil.com/repo.git", "https://github.com%3Aevil.com/repo.git", ] - + for url in obfuscated_urls: with pytest.raises(ValueError) as error: git_utils._sanitize_git_url(url) # The error could be either suspicious encoding or invalid characters - assert any(phrase in str(error.value) for phrase in [ - "Suspicious URL encoding detected", - "Invalid characters in hostname" - ]) + assert any( + phrase in str(error.value) + for phrase in ["Suspicious URL encoding detected", "Invalid characters in hostname"] + ) def test_sanitize_git_url_blocks_invalid_hostname_chars(self): """Test that hostnames with invalid characters are blocked.""" @@ -615,16 +616,19 @@ def test_sanitize_git_url_blocks_invalid_hostname_chars(self): "https://github[].com/repo.git", "https://github{}.com/repo.git", ] - + for url in invalid_urls: with pytest.raises(ValueError) as error: git_utils._sanitize_git_url(url) # The error could be various types due to URL parsing edge cases - assert any(phrase in str(error.value) for phrase in [ - "Invalid characters in hostname", - "Failed to parse URL", - "does not appear to be an IPv4 or IPv6 address" - ]) + assert any( + phrase in str(error.value) + for phrase in [ + "Invalid characters in hostname", + "Failed to parse URL", + "does not appear to be an IPv4 or IPv6 address", + ] + ) def test_sanitize_git_url_blocks_unsupported_schemes(self): """Test that unsupported URL schemes are blocked.""" @@ -634,7 +638,7 @@ def test_sanitize_git_url_blocks_unsupported_schemes(self): "javascript:alert('xss')", "data:text/html,", ] - + for url in unsupported_urls: with pytest.raises(ValueError) as error: git_utils._sanitize_git_url(url) @@ -644,10 +648,10 @@ def test_git_clone_repo_blocks_malicious_https_url(self): """Test that git_clone_repo blocks malicious HTTPS URLs.""" malicious_git_config = { "repo": "https://user@attacker.com@github.com/legit/repo.git", - "branch": "main" + "branch": "main", } entry_point = "train.py" - + with pytest.raises(ValueError) as error: git_utils.git_clone_repo(malicious_git_config, entry_point) assert "multiple @ symbols detected" in str(error.value) @@ -656,10 +660,10 @@ def test_git_clone_repo_blocks_malicious_ssh_url(self): """Test that git_clone_repo blocks malicious SSH URLs.""" malicious_git_config = { "repo": "git@OBVIOUS@github.com:sage-maker/temp-sev2.git", - "branch": "main" + "branch": "main", } entry_point = "train.py" - + with pytest.raises(ValueError) as error: git_utils.git_clone_repo(malicious_git_config, entry_point) assert "exactly one @ symbol" in str(error.value) @@ -668,10 +672,10 @@ def test_git_clone_repo_blocks_url_encoded_attack(self): """Test that git_clone_repo blocks URL-encoded attacks.""" malicious_git_config = { "repo": "https://github.com%40attacker.com/repo.git", - "branch": "main" + "branch": "main", } entry_point = "train.py" - + with pytest.raises(ValueError) as error: git_utils.git_clone_repo(malicious_git_config, entry_point) assert "Suspicious URL encoding detected" in str(error.value) @@ -690,17 +694,20 @@ def test_sanitize_git_url_comprehensive_attack_scenarios(self): "https://github.com%40evil.com/repo.git", "https://user@github.com%2Fevil.com/repo.git", ] - + entry_point = "train.py" - + for malicious_url in attack_scenarios: git_config = {"repo": malicious_url} with pytest.raises(ValueError) as error: git_utils.git_clone_repo(git_config, entry_point) # Should be blocked by sanitization - assert any(phrase in str(error.value) for phrase in [ - "multiple @ symbols detected", - "exactly one @ symbol", - "Suspicious URL encoding detected", - "Invalid characters in hostname" - ]) + assert any( + phrase in str(error.value) + for phrase in [ + "multiple @ symbols detected", + "exactly one @ symbol", + "Suspicious URL encoding detected", + "Invalid characters in hostname", + ] + ) From 38936f7c84ef5e7481b3198d37a5a827a25fbcb0 Mon Sep 17 00:00:00 2001 From: Nathan Park Date: Thu, 10 Jul 2025 17:37:29 -0700 Subject: [PATCH 3/4] Remove references --- tests/unit/test_git_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index d48a505067..2d10ac7619 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -497,11 +497,6 @@ def test_git_clone_repo_codecommit_https_creds_not_stored_locally(tempdir, mkdte assert "returned non-zero exit status" in str(error.value) -# ============================================================================ -# URL Sanitization Tests - Security vulnerability prevention -# ============================================================================ - - class TestGitUrlSanitization: """Test cases for Git URL sanitization to prevent injection attacks.""" @@ -681,8 +676,6 @@ def test_git_clone_repo_blocks_url_encoded_attack(self): assert "Suspicious URL encoding detected" in str(error.value) def test_sanitize_git_url_comprehensive_attack_scenarios(self): - """Test comprehensive attack scenarios from the vulnerability report.""" - # These are the exact attack patterns from the security report attack_scenarios = [ # Original PoC attack "https://USER@YOUR_NGROK_OR_LOCALHOST/malicious.git@github.com%25legit%25repo.git", From ad08cb421c2b2a2c4071a603fbd83c36f398c9be Mon Sep 17 00:00:00 2001 From: Nathan Park Date: Thu, 10 Jul 2025 17:48:52 -0700 Subject: [PATCH 4/4] Fix tests --- src/sagemaker/git_utils.py | 7 +++++++ tests/unit/test_estimator.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/sagemaker/git_utils.py b/src/sagemaker/git_utils.py index 24b2c94bd0..25e745446a 100644 --- a/src/sagemaker/git_utils.py +++ b/src/sagemaker/git_utils.py @@ -52,6 +52,13 @@ def _sanitize_git_url(repo_url): if at_count > 1: raise ValueError("Invalid HTTPS URL format: multiple @ symbols detected") + # Check for invalid characters in the URL before parsing + # These characters should not appear in legitimate URLs + invalid_chars = ["<", ">", "[", "]", "{", "}", "\\", "^", "`", "|"] + for char in invalid_chars: + if char in repo_url: + raise ValueError("Invalid characters in hostname") + try: parsed = urlparse(repo_url) diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 11cc83a463..cfb243b563 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -2794,7 +2794,7 @@ def test_git_support_bad_repo_url_format(sagemaker_session): ) with pytest.raises(ValueError) as error: fw.fit() - assert "Invalid Git url provided." in str(error) + assert "Unsupported URL scheme" in str(error) @patch(