Skip to content

Commit b34d1d2

Browse files
CopilotByron
andcommitted
feat: Replace url dependency with minimal custom implementation
Co-authored-by: Byron <63622+Byron@users.noreply.github.com>
1 parent 422c5b5 commit b34d1d2

File tree

7 files changed

+317
-26
lines changed

7 files changed

+317
-26
lines changed

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gix-url/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ gix-path = { version = "^0.10.21", path = "../gix-path" }
2424

2525
serde = { version = "1.0.114", optional = true, default-features = false, features = ["std", "derive"] }
2626
thiserror = "2.0.17"
27-
url = "2.5.2"
2827
bstr = { version = "1.12.0", default-features = false, features = ["std"] }
2928
percent-encoding = "2.3.1"
3029

gix-url/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ mod impls;
2222
///
2323
pub mod parse;
2424

25+
/// Minimal URL parser to replace the `url` crate dependency
26+
mod simple_url;
27+
2528
/// Parse the given `bytes` as a [git url](Url).
2629
///
2730
/// # Note

gix-url/src/parse.rs

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pub enum Error {
1919
Url {
2020
url: String,
2121
kind: UrlKind,
22-
source: url::ParseError,
22+
source: crate::simple_url::ParseError,
2323
},
2424

2525
#[error("The host portion of the following URL is too long ({} bytes, {len} bytes total): {truncated_url:?}", truncated_url.len())]
@@ -112,6 +112,13 @@ pub(crate) fn url(input: &BStr, protocol_end: usize) -> Result<crate::Url, Error
112112
return Err(Error::RelativeUrl { url: input.to_owned() });
113113
}
114114

115+
// Normalize empty path to "/" for http/https URLs only
116+
let path = if url.path().is_empty() && matches!(scheme, Scheme::Http | Scheme::Https) {
117+
"/".into()
118+
} else {
119+
url.path().into()
120+
};
121+
115122
Ok(crate::Url {
116123
serialize_alternative_form: false,
117124
scheme,
@@ -122,7 +129,7 @@ pub(crate) fn url(input: &BStr, protocol_end: usize) -> Result<crate::Url, Error
122129
.transpose()?,
123130
host: url.host_str().map(Into::into),
124131
port: url.port(),
125-
path: url.path().into(),
132+
path,
126133
})
127134
}
128135

@@ -156,7 +163,8 @@ pub(crate) fn scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
156163
// should never differ in any other way (ssh URLs should not contain a query or fragment part).
157164
// To avoid the various off-by-one errors caused by the `/` characters, we keep using the path
158165
// determined above and can therefore skip parsing it here as well.
159-
let url = url::Url::parse(&format!("ssh://{host}")).map_err(|source| Error::Url {
166+
let url_string = format!("ssh://{host}");
167+
let url = crate::simple_url::ParsedUrl::parse(&url_string).map_err(|source| Error::Url {
160168
url: input.to_owned(),
161169
kind: UrlKind::Scp,
162170
source,
@@ -176,7 +184,7 @@ pub(crate) fn scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
176184
})
177185
}
178186

179-
fn url_user(url: &url::Url, kind: UrlKind) -> Result<Option<String>, Error> {
187+
fn url_user(url: &crate::simple_url::ParsedUrl<'_>, kind: UrlKind) -> Result<Option<String>, Error> {
180188
if url.username().is_empty() && url.password().is_none() {
181189
Ok(None)
182190
} else {
@@ -269,13 +277,20 @@ fn input_to_utf8(input: &BStr, kind: UrlKind) -> Result<&str, Error> {
269277
})
270278
}
271279

272-
fn input_to_utf8_and_url(input: &BStr, kind: UrlKind) -> Result<(&str, url::Url), Error> {
280+
fn input_to_utf8_and_url(input: &BStr, kind: UrlKind) -> Result<(&str, crate::simple_url::ParsedUrl<'_>), Error> {
273281
let input = input_to_utf8(input, kind)?;
274-
url::Url::parse(input)
282+
crate::simple_url::ParsedUrl::parse(input)
275283
.map(|url| (input, url))
276-
.map_err(|source| Error::Url {
277-
url: input.to_owned(),
278-
kind,
279-
source,
284+
.map_err(|source| {
285+
// If the parser rejected it as RelativeUrlWithoutBase, map to Error::RelativeUrl
286+
// to match the expected error type for malformed URLs like "invalid:://"
287+
match source {
288+
crate::simple_url::ParseError::RelativeUrlWithoutBase => Error::RelativeUrl { url: input.to_owned() },
289+
_ => Error::Url {
290+
url: input.to_owned(),
291+
kind,
292+
source,
293+
},
294+
}
280295
})
281296
}

gix-url/src/simple_url.rs

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
/// A minimal URL parser that extracts only what we need for git URLs.
2+
/// This is a replacement for the `url` crate dependency.
3+
#[derive(Debug)]
4+
pub(crate) struct ParsedUrl<'a> {
5+
scheme: String, // Owned to allow normalization to lowercase
6+
username: &'a str,
7+
password: Option<&'a str>,
8+
host: Option<String>, // Owned to allow normalization to lowercase
9+
port: Option<u16>,
10+
path: &'a str,
11+
}
12+
13+
/// Minimal parse error type to replace url::ParseError
14+
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
15+
pub enum ParseError {
16+
#[error("relative URL without a base")]
17+
RelativeUrlWithoutBase,
18+
#[error("invalid port number")]
19+
InvalidPort,
20+
#[error("invalid domain character")]
21+
InvalidDomainCharacter,
22+
}
23+
24+
/// Check if a character is valid in a URL scheme.
25+
/// Valid scheme characters: alphanumeric, +, -, or .
26+
fn is_valid_scheme_char(c: char) -> bool {
27+
c.is_ascii_alphanumeric() || c == '+' || c == '-' || c == '.'
28+
}
29+
30+
impl<'a> ParsedUrl<'a> {
31+
/// Parse a URL string into its components.
32+
/// Expected format: scheme://[user[:password]@]host[:port]/path
33+
pub(crate) fn parse(input: &'a str) -> Result<Self, ParseError> {
34+
// Find scheme by looking for first ':'
35+
let first_colon = input.find(':').ok_or(ParseError::RelativeUrlWithoutBase)?;
36+
37+
let scheme_str = &input[..first_colon];
38+
// Normalize scheme to lowercase for case-insensitive matching (matches url crate behavior)
39+
let scheme = scheme_str.to_ascii_lowercase();
40+
41+
// Verify it's followed by "//"
42+
if !input[first_colon..].starts_with("://") {
43+
return Err(ParseError::RelativeUrlWithoutBase);
44+
}
45+
46+
// Start after "://"
47+
let after_scheme = &input[first_colon + 3..];
48+
49+
// Check for relative URL (scheme without proper authority)
50+
if scheme_str.is_empty() {
51+
return Err(ParseError::RelativeUrlWithoutBase);
52+
}
53+
54+
// Validate scheme characters (check original before lowercase conversion)
55+
if !scheme_str.chars().all(is_valid_scheme_char) {
56+
return Err(ParseError::RelativeUrlWithoutBase);
57+
}
58+
59+
// Find path start (first '/' after scheme)
60+
let path_start = after_scheme.find('/').unwrap_or(after_scheme.len());
61+
let authority = &after_scheme[..path_start];
62+
let path = if path_start < after_scheme.len() {
63+
&after_scheme[path_start..]
64+
} else {
65+
// No path specified - leave empty (caller can default to / if needed)
66+
""
67+
};
68+
69+
// Parse authority: [user[:password]@]host[:port]
70+
let (username, password, host, port) = if let Some(at_pos) = authority.rfind('@') {
71+
// Has user info
72+
let user_info = &authority[..at_pos];
73+
let host_port = &authority[at_pos + 1..];
74+
75+
let (user, pass) = if let Some(colon_pos) = user_info.find(':') {
76+
let pass_str = &user_info[colon_pos + 1..];
77+
// Treat empty password as None
78+
let pass = if pass_str.is_empty() { None } else { Some(pass_str) };
79+
(&user_info[..colon_pos], pass)
80+
} else {
81+
(user_info, None)
82+
};
83+
84+
let (h, p) = Self::parse_host_port(host_port)?;
85+
// If we have user info, we must have a host
86+
if h.is_none() {
87+
return Err(ParseError::InvalidDomainCharacter);
88+
}
89+
(user, pass, h, p)
90+
} else {
91+
// No user info
92+
let (h, p) = Self::parse_host_port(authority)?;
93+
("", None, h, p)
94+
};
95+
96+
// Standard schemes (http, https, git, ssh) require a host
97+
// Scheme is already lowercase at this point
98+
let requires_host = matches!(scheme.as_str(), "http" | "https" | "git" | "ssh" | "ftp" | "ftps");
99+
if requires_host && host.is_none() {
100+
return Err(ParseError::InvalidDomainCharacter);
101+
}
102+
103+
Ok(ParsedUrl {
104+
scheme,
105+
username,
106+
password,
107+
host,
108+
port,
109+
path,
110+
})
111+
}
112+
113+
fn parse_host_port(host_port: &str) -> Result<(Option<String>, Option<u16>), ParseError> {
114+
if host_port.is_empty() {
115+
return Ok((None, None));
116+
}
117+
118+
// Handle IPv6 addresses: [::1] or [::1]:port
119+
if host_port.starts_with('[') {
120+
if let Some(bracket_end) = host_port.find(']') {
121+
// IPv6 addresses are case-insensitive, normalize to lowercase
122+
let host = Some(host_port[..bracket_end + 1].to_ascii_lowercase());
123+
let remaining = &host_port[bracket_end + 1..];
124+
125+
if remaining.is_empty() {
126+
return Ok((host, None));
127+
} else if let Some(port_str) = remaining.strip_prefix(':') {
128+
let port = port_str.parse::<u16>().map_err(|_| ParseError::InvalidPort)?;
129+
// Validate port is in valid range (1-65535, port 0 is invalid)
130+
if port == 0 {
131+
return Err(ParseError::InvalidPort);
132+
}
133+
return Ok((host, Some(port)));
134+
} else {
135+
return Err(ParseError::InvalidDomainCharacter);
136+
}
137+
} else {
138+
return Err(ParseError::InvalidDomainCharacter);
139+
}
140+
}
141+
142+
// Handle regular host:port
143+
// Use rfind to handle IPv6 addresses without brackets (edge case)
144+
if let Some(colon_pos) = host_port.rfind(':') {
145+
// Check if this looks like a port (all digits after colon)
146+
let potential_port = &host_port[colon_pos + 1..];
147+
if potential_port.is_empty() {
148+
// Empty port like "host:" - strip the trailing colon
149+
let host_str = &host_port[..colon_pos];
150+
return Ok((Some(Self::normalize_hostname(host_str)?), None));
151+
} else if potential_port.chars().all(|c| c.is_ascii_digit()) {
152+
let host_str = &host_port[..colon_pos];
153+
let host = Self::normalize_hostname(host_str)?;
154+
let port = potential_port.parse::<u16>().map_err(|_| ParseError::InvalidPort)?;
155+
// Validate port is in valid range (1-65535, port 0 is invalid)
156+
if port == 0 {
157+
return Err(ParseError::InvalidPort);
158+
}
159+
return Ok((Some(host), Some(port)));
160+
}
161+
}
162+
163+
// No port, just host
164+
Ok((Some(Self::normalize_hostname(host_port)?), None))
165+
}
166+
167+
/// Check if a string looks like a valid DNS hostname (for normalization purposes)
168+
/// Valid DNS names contain only alphanumeric, hyphens, dots, underscores, and wildcards
169+
fn is_normalizable_hostname(host: &str) -> bool {
170+
// Allow alphanumeric, -, ., _, and * (for patterns)
171+
host.chars()
172+
.all(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '.' | '_' | '*'))
173+
}
174+
175+
/// Validate and possibly normalize a hostname
176+
/// Valid DNS hostnames are normalized to lowercase
177+
/// Invalid strings (like injection attempts) are preserved as-is but ? is rejected
178+
fn normalize_hostname(host: &str) -> Result<String, ParseError> {
179+
// Reject ? character which git's url parser rejects
180+
if host.contains('?') {
181+
return Err(ParseError::InvalidDomainCharacter);
182+
}
183+
184+
// Only normalize if it looks like a valid DNS hostname
185+
// Preserve case for security checks if it contains special characters
186+
if Self::is_normalizable_hostname(host) {
187+
Ok(host.to_ascii_lowercase())
188+
} else {
189+
Ok(host.to_owned())
190+
}
191+
}
192+
193+
pub(crate) fn scheme(&self) -> &str {
194+
&self.scheme
195+
}
196+
197+
pub(crate) fn username(&self) -> &str {
198+
self.username
199+
}
200+
201+
pub(crate) fn password(&self) -> Option<&str> {
202+
self.password
203+
}
204+
205+
pub(crate) fn host_str(&self) -> Option<&str> {
206+
self.host.as_deref()
207+
}
208+
209+
pub(crate) fn port(&self) -> Option<u16> {
210+
self.port
211+
}
212+
213+
pub(crate) fn path(&self) -> &str {
214+
self.path
215+
}
216+
217+
/// Check if this URL cannot be a base (is relative).
218+
/// For our purposes, since we only parse URLs with "://", they can all be a base.
219+
pub(crate) fn cannot_be_a_base(&self) -> bool {
220+
false
221+
}
222+
}
223+
224+
#[cfg(test)]
225+
mod tests {
226+
use super::*;
227+
228+
#[test]
229+
fn test_simple_url() {
230+
let url = ParsedUrl::parse("http://example.com/path").unwrap();
231+
assert_eq!(url.scheme(), "http");
232+
assert_eq!(url.host_str(), Some("example.com"));
233+
assert_eq!(url.path(), "/path");
234+
assert_eq!(url.username(), "");
235+
assert_eq!(url.password(), None);
236+
assert_eq!(url.port(), None);
237+
}
238+
239+
#[test]
240+
fn test_url_with_port() {
241+
let url = ParsedUrl::parse("http://example.com:8080/path").unwrap();
242+
assert_eq!(url.scheme(), "http");
243+
assert_eq!(url.host_str(), Some("example.com"));
244+
assert_eq!(url.port(), Some(8080));
245+
assert_eq!(url.path(), "/path");
246+
}
247+
248+
#[test]
249+
fn test_url_with_user() {
250+
let url = ParsedUrl::parse("http://user@example.com/path").unwrap();
251+
assert_eq!(url.scheme(), "http");
252+
assert_eq!(url.username(), "user");
253+
assert_eq!(url.host_str(), Some("example.com"));
254+
assert_eq!(url.path(), "/path");
255+
}
256+
257+
#[test]
258+
fn test_url_with_user_and_password() {
259+
let url = ParsedUrl::parse("http://user:pass@example.com/path").unwrap();
260+
assert_eq!(url.scheme(), "http");
261+
assert_eq!(url.username(), "user");
262+
assert_eq!(url.password(), Some("pass"));
263+
assert_eq!(url.host_str(), Some("example.com"));
264+
assert_eq!(url.path(), "/path");
265+
}
266+
267+
#[test]
268+
fn test_url_with_ipv6() {
269+
let url = ParsedUrl::parse("http://[::1]/path").unwrap();
270+
assert_eq!(url.scheme(), "http");
271+
assert_eq!(url.host_str(), Some("[::1]"));
272+
assert_eq!(url.path(), "/path");
273+
}
274+
275+
#[test]
276+
fn test_url_with_ipv6_and_port() {
277+
let url = ParsedUrl::parse("http://[::1]:8080/path").unwrap();
278+
assert_eq!(url.scheme(), "http");
279+
assert_eq!(url.host_str(), Some("[::1]"));
280+
assert_eq!(url.port(), Some(8080));
281+
assert_eq!(url.path(), "/path");
282+
}
283+
}

gix-url/tests/url/baseline.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,8 @@ mod baseline {
185185

186186
pub fn max_num_failures(&self) -> usize {
187187
match self {
188-
Kind::Unix => 198,
189-
Kind::Windows => 198 + 6,
188+
Kind::Unix => 195,
189+
Kind::Windows => 195 + 6,
190190
}
191191
}
192192

0 commit comments

Comments
 (0)