Skip to content

Commit 5878547

Browse files
committed
fix: sanitize git clone repo input url
For commit: aws/sagemaker-python-sdk-staging@ed143b7
1 parent 0e84f33 commit 5878547

File tree

2 files changed

+264
-0
lines changed

2 files changed

+264
-0
lines changed

sagemaker-core/src/sagemaker/core/git_utils.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,69 @@
2020
import warnings
2121
import six
2222
from six.moves import urllib
23+
import re
24+
from pathlib import Path
25+
from urllib.parse import urlparse
26+
27+
def _sanitize_git_url(repo_url):
28+
"""Sanitize Git repository URL to prevent URL injection attacks.
29+
30+
Args:
31+
repo_url (str): The Git repository URL to sanitize
2332
33+
Returns:
34+
str: The sanitized URL
35+
36+
Raises:
37+
ValueError: If the URL contains suspicious patterns that could indicate injection
38+
"""
39+
at_count = repo_url.count("@")
40+
41+
if repo_url.startswith("git@"):
42+
# git@ format requires exactly one @
43+
if at_count != 1:
44+
raise ValueError("Invalid SSH URL format: git@ URLs must have exactly one @ symbol")
45+
elif repo_url.startswith("ssh://"):
46+
# ssh:// format can have 0 or 1 @ symbols
47+
if at_count > 1:
48+
raise ValueError("Invalid SSH URL format: multiple @ symbols detected")
49+
elif repo_url.startswith("https://") or repo_url.startswith("http://"):
50+
# HTTPS format allows 0 or 1 @ symbols
51+
if at_count > 1:
52+
raise ValueError("Invalid HTTPS URL format: multiple @ symbols detected")
53+
54+
# Check for invalid characters in the URL before parsing
55+
# These characters should not appear in legitimate URLs
56+
invalid_chars = ["<", ">", "[", "]", "{", "}", "\\", "^", "`", "|"]
57+
for char in invalid_chars:
58+
if char in repo_url:
59+
raise ValueError("Invalid characters in hostname")
60+
61+
try:
62+
parsed = urlparse(repo_url)
63+
64+
# Check for suspicious characters in hostname that could indicate injection
65+
if parsed.hostname:
66+
# Check for URL-encoded characters that might be used for obfuscation
67+
suspicious_patterns = ["%25", "%40", "%2F", "%3A"] # encoded %, @, /, :
68+
for pattern in suspicious_patterns:
69+
if pattern in parsed.hostname.lower():
70+
raise ValueError(f"Suspicious URL encoding detected in hostname: {pattern}")
71+
72+
# Validate that the hostname looks legitimate
73+
if not re.match(r"^[a-zA-Z0-9.-]+$", parsed.hostname):
74+
raise ValueError("Invalid characters in hostname")
75+
76+
except Exception as e:
77+
if isinstance(e, ValueError):
78+
raise
79+
raise ValueError(f"Failed to parse URL: {str(e)}")
80+
else:
81+
raise ValueError(
82+
"Unsupported URL scheme: only https://, http://, git@, and ssh:// are allowed"
83+
)
84+
85+
return repo_url
2486

2587
def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None):
2688
"""Git clone repo containing the training code and serving code.
@@ -87,6 +149,10 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None):
87149
if entry_point is None:
88150
raise ValueError("Please provide an entry point.")
89151
_validate_git_config(git_config)
152+
153+
# SECURITY: Sanitize the repository URL to prevent injection attacks
154+
git_config["repo"] = _sanitize_git_url(git_config["repo"])
155+
90156
dest_dir = tempfile.mkdtemp()
91157
_generate_and_run_clone_command(git_config, dest_dir)
92158

sagemaker-core/tests/unit/test_git_utils.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import pytest
1616

17+
from sagemaker.core import git_utils
1718
from sagemaker.core.git_utils import _validate_git_config
1819

1920

@@ -135,3 +136,200 @@ def test_validate_git_config_repo_none():
135136

136137
with pytest.raises(ValueError, match="'repo' must be a string"):
137138
_validate_git_config(git_config)
139+
140+
141+
class TestGitUrlSanitization:
142+
"""Test cases for Git URL sanitization to prevent injection attacks."""
143+
144+
def test_sanitize_git_url_valid_https_urls(self):
145+
"""Test that valid HTTPS URLs pass sanitization."""
146+
valid_urls = [
147+
"https://github.com/user/repo.git",
148+
"https://gitlab.com/user/repo.git",
149+
"https://token@github.com/user/repo.git",
150+
"https://user:pass@github.com/user/repo.git",
151+
"http://internal-git.company.com/repo.git",
152+
]
153+
154+
for url in valid_urls:
155+
result = git_utils._sanitize_git_url(url)
156+
assert result == url
157+
158+
def test_sanitize_git_url_valid_ssh_urls(self):
159+
"""Test that valid SSH URLs pass sanitization."""
160+
valid_urls = [
161+
"git@github.com:user/repo.git",
162+
"git@gitlab.com:user/repo.git",
163+
"ssh://git@github.com/user/repo.git",
164+
"ssh://git-codecommit.us-west-2.amazonaws.com/v1/repos/test-repo/",
165+
"git@internal-git.company.com:repo.git",
166+
]
167+
168+
for url in valid_urls:
169+
result = git_utils._sanitize_git_url(url)
170+
assert result == url
171+
172+
def test_sanitize_git_url_blocks_multiple_at_https(self):
173+
"""Test that HTTPS URLs with multiple @ symbols are blocked."""
174+
malicious_urls = [
175+
"https://user@attacker.com@github.com/repo.git",
176+
"https://token@evil.com@gitlab.com/user/repo.git",
177+
"https://a@b@c@github.com/repo.git",
178+
"https://user@malicious-host@github.com/legit/repo.git",
179+
]
180+
181+
for url in malicious_urls:
182+
with pytest.raises(ValueError) as error:
183+
git_utils._sanitize_git_url(url)
184+
assert "multiple @ symbols detected" in str(error.value)
185+
186+
def test_sanitize_git_url_blocks_multiple_at_ssh(self):
187+
"""Test that SSH URLs with multiple @ symbols are blocked."""
188+
malicious_urls = [
189+
"git@attacker.com@github.com:repo.git",
190+
"git@evil@gitlab.com:user/repo.git",
191+
"ssh://git@malicious@github.com/repo.git",
192+
"git@a@b@c:repo.git",
193+
]
194+
195+
for url in malicious_urls:
196+
with pytest.raises(ValueError) as error:
197+
git_utils._sanitize_git_url(url)
198+
assert any(
199+
phrase in str(error.value)
200+
for phrase in ["multiple @ symbols detected", "exactly one @ symbol"]
201+
)
202+
203+
def test_sanitize_git_url_blocks_invalid_schemes_and_git_at_format(self):
204+
"""Test that invalid schemes and git@ format violations are blocked."""
205+
unsupported_scheme_urls = [
206+
"git-github.com:user/repo.git",
207+
]
208+
209+
for url in unsupported_scheme_urls:
210+
with pytest.raises(ValueError) as error:
211+
git_utils._sanitize_git_url(url)
212+
assert "Unsupported URL scheme" in str(error.value)
213+
214+
invalid_git_at_urls = [
215+
"git@github.com@evil.com:repo.git",
216+
]
217+
218+
for url in invalid_git_at_urls:
219+
with pytest.raises(ValueError) as error:
220+
git_utils._sanitize_git_url(url)
221+
assert "exactly one @ symbol" in str(error.value)
222+
223+
def test_sanitize_git_url_blocks_url_encoding_obfuscation(self):
224+
"""Test that URL-encoded obfuscation attempts are blocked."""
225+
obfuscated_urls = [
226+
"https://github.com%25evil.com/repo.git",
227+
"https://user@github.com%40attacker.com/repo.git",
228+
"https://github.com%2Fevil.com/repo.git",
229+
"https://github.com%3Aevil.com/repo.git",
230+
]
231+
232+
for url in obfuscated_urls:
233+
with pytest.raises(ValueError) as error:
234+
git_utils._sanitize_git_url(url)
235+
assert any(
236+
phrase in str(error.value)
237+
for phrase in ["Suspicious URL encoding detected", "Invalid characters in hostname"]
238+
)
239+
240+
def test_sanitize_git_url_blocks_invalid_hostname_chars(self):
241+
"""Test that hostnames with invalid characters are blocked."""
242+
invalid_urls = [
243+
"https://github<script>.com/repo.git",
244+
"https://github>.com/repo.git",
245+
"https://github[].com/repo.git",
246+
"https://github{}.com/repo.git",
247+
]
248+
249+
for url in invalid_urls:
250+
with pytest.raises(ValueError) as error:
251+
git_utils._sanitize_git_url(url)
252+
assert any(
253+
phrase in str(error.value)
254+
for phrase in [
255+
"Invalid characters in hostname",
256+
"Failed to parse URL",
257+
"does not appear to be an IPv4 or IPv6 address",
258+
]
259+
)
260+
261+
def test_sanitize_git_url_blocks_unsupported_schemes(self):
262+
"""Test that unsupported URL schemes are blocked."""
263+
unsupported_urls = [
264+
"ftp://github.com/repo.git",
265+
"file:///local/repo.git",
266+
"javascript:alert('xss')",
267+
"data:text/html,<script>alert('xss')</script>",
268+
]
269+
270+
for url in unsupported_urls:
271+
with pytest.raises(ValueError) as error:
272+
git_utils._sanitize_git_url(url)
273+
assert "Unsupported URL scheme" in str(error.value)
274+
275+
def test_git_clone_repo_blocks_malicious_https_url(self):
276+
"""Test that git_clone_repo blocks malicious HTTPS URLs."""
277+
malicious_git_config = {
278+
"repo": "https://user@attacker.com@github.com/legit/repo.git",
279+
"branch": "main",
280+
}
281+
entry_point = "train.py"
282+
283+
with pytest.raises(ValueError) as error:
284+
git_utils.git_clone_repo(malicious_git_config, entry_point)
285+
assert "multiple @ symbols detected" in str(error.value)
286+
287+
def test_git_clone_repo_blocks_malicious_ssh_url(self):
288+
"""Test that git_clone_repo blocks malicious SSH URLs."""
289+
malicious_git_config = {
290+
"repo": "git@OBVIOUS@github.com:sage-maker/temp-sev2.git",
291+
"branch": "main",
292+
}
293+
entry_point = "train.py"
294+
295+
with pytest.raises(ValueError) as error:
296+
git_utils.git_clone_repo(malicious_git_config, entry_point)
297+
assert "exactly one @ symbol" in str(error.value)
298+
299+
def test_git_clone_repo_blocks_url_encoded_attack(self):
300+
"""Test that git_clone_repo blocks URL-encoded attacks."""
301+
malicious_git_config = {
302+
"repo": "https://github.com%40attacker.com/repo.git",
303+
"branch": "main",
304+
}
305+
entry_point = "train.py"
306+
307+
with pytest.raises(ValueError) as error:
308+
git_utils.git_clone_repo(malicious_git_config, entry_point)
309+
assert "Suspicious URL encoding detected" in str(error.value)
310+
311+
def test_sanitize_git_url_comprehensive_attack_scenarios(self):
312+
attack_scenarios = [
313+
"https://USER@YOUR_NGROK_OR_LOCALHOST/malicious.git@github.com%25legit%25repo.git",
314+
"https://user@malicious-host@github.com/legit/repo.git",
315+
"git@attacker.com@github.com:user/repo.git",
316+
"ssh://git@evil.com@github.com/repo.git",
317+
"https://github.com%40evil.com/repo.git",
318+
"https://user@github.com%2Fevil.com/repo.git",
319+
]
320+
321+
entry_point = "train.py"
322+
323+
for malicious_url in attack_scenarios:
324+
git_config = {"repo": malicious_url}
325+
with pytest.raises(ValueError) as error:
326+
git_utils.git_clone_repo(git_config, entry_point)
327+
assert any(
328+
phrase in str(error.value)
329+
for phrase in [
330+
"multiple @ symbols detected",
331+
"exactly one @ symbol",
332+
"Suspicious URL encoding detected",
333+
"Invalid characters in hostname",
334+
]
335+
)

0 commit comments

Comments
 (0)