From 35ed4bbf2adba8c5dbfd4c55bb34e35c944cc069 Mon Sep 17 00:00:00 2001 From: Greg Richardson Date: Mon, 8 Dec 2025 21:05:05 -0700 Subject: [PATCH 1/3] fix: s3 endpoints with path --- packages/duckdb-wasm/src/utils/s3_helper.ts | 20 +- packages/duckdb-wasm/test/s3_helper.test.ts | 209 ++++++++++++++++++++ 2 files changed, 224 insertions(+), 5 deletions(-) create mode 100644 packages/duckdb-wasm/test/s3_helper.test.ts diff --git a/packages/duckdb-wasm/src/utils/s3_helper.ts b/packages/duckdb-wasm/src/utils/s3_helper.ts index 46de8456f..cb2db00f5 100644 --- a/packages/duckdb-wasm/src/utils/s3_helper.ts +++ b/packages/duckdb-wasm/src/utils/s3_helper.ts @@ -20,12 +20,20 @@ export interface S3PayloadParams { contentType: string | null; } +const getEndpointPathPrefix = function (config: S3Config | undefined): string { + if (config?.endpoint?.startsWith('http')) { + const endpointUrl = new URL(config.endpoint); + // Return the path prefix if present, otherwise empty string + return endpointUrl.pathname !== '/' ? endpointUrl.pathname : ''; + } + return ''; +}; + const getHTTPHost = function (config: S3Config | undefined, url: string, bucket: string): string { if (config?.endpoint?.startsWith('http')) { - // Endpoint is a full url, we append the bucket - const httpHost = `${config?.endpoint}`; - const offset = httpHost.indexOf('://') + 3; - return httpHost.substring(offset); + // Endpoint is a full url, extract just the hostname (no path) + const endpointUrl = new URL(config.endpoint); + return endpointUrl.host; } else if (config?.endpoint) { // Endpoint is not a full url and the https://{bucket}.{domain} format will be used return `${bucket}.${config?.endpoint}`; @@ -43,7 +51,9 @@ export function getS3Params(config: S3Config | undefined, url: string, method: s // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-bucket-intro.html#path-style-url-ex let path = parsedS3Url.path; if (isPathStyleAccess(config)) { - path = `/${parsedS3Url.bucket}${path}`; + // Extract endpoint path prefix if present (e.g., "/some/path/prefix" from "https://host/some/path/prefix") + const endpointPathPrefix = getEndpointPathPrefix(config); + path = `${endpointPathPrefix}/${parsedS3Url.bucket}${path}`; } return { url: path, diff --git a/packages/duckdb-wasm/test/s3_helper.test.ts b/packages/duckdb-wasm/test/s3_helper.test.ts new file mode 100644 index 000000000..6515715d1 --- /dev/null +++ b/packages/duckdb-wasm/test/s3_helper.test.ts @@ -0,0 +1,209 @@ +import { getS3Params, parseS3Url, getHTTPUrl } from '../src/utils/s3_helper'; +import { S3Config } from '../src/bindings'; + +describe('S3 Helper', () => { + describe('parseS3Url', () => { + it('parses basic s3:// URL', () => { + const result = parseS3Url('s3://my-bucket/path/to/object.txt'); + expect(result).toEqual({ + bucket: 'my-bucket', + path: '/path/to/object.txt', + }); + }); + + it('throws on non-s3:// URL', () => { + expect(() => parseS3Url('https://example.com/file.txt')).toThrow("URL needs to start with s3://"); + }); + + it('throws on URL without path', () => { + expect(() => parseS3Url('s3://my-bucket')).toThrow("URL needs to contain a '/' after the host"); + }); + }); + + describe('getS3Params - path-style access with endpoint path prefix', () => { + it('includes endpoint path prefix in signed URL for path-style access', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com/some/path/prefix', + region: 'us-east-1', + accessKeyId: 'AKIAIOSFODNN7EXAMPLE', + secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + }; + + const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); + + // Path should include: endpoint prefix + bucket + object path + expect(params.url).toBe('/some/path/prefix/my-bucket/object.txt'); + // Host should NOT include the path prefix + expect(params.host).toBe('s3.example.com'); + }); + + it('handles endpoint without path prefix', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com', + region: 'us-east-1', + accessKeyId: 'AKIAIOSFODNN7EXAMPLE', + secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + }; + + const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); + + // Path should include: bucket + object path (no prefix) + expect(params.url).toBe('/my-bucket/object.txt'); + expect(params.host).toBe('s3.example.com'); + }); + + it('handles endpoint with trailing slash', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com/prefix/', + region: 'us-east-1', + accessKeyId: 'AKIAIOSFODNN7EXAMPLE', + secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + }; + + const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); + + expect(params.url).toBe('/prefix//my-bucket/object.txt'); + expect(params.host).toBe('s3.example.com'); + }); + + it('includes port in host when present', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com:9000/prefix', + region: 'us-east-1', + accessKeyId: 'AKIAIOSFODNN7EXAMPLE', + secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + }; + + const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); + + expect(params.url).toBe('/prefix/my-bucket/object.txt'); + // Host should include port + expect(params.host).toBe('s3.example.com:9000'); + }); + + it('handles deep path prefixes', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com/api/v1/storage/s3', + region: 'us-east-1', + accessKeyId: 'AKIAIOSFODNN7EXAMPLE', + secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + }; + + const params = getS3Params(config, 's3://my-bucket/folder/file.txt', 'GET'); + + expect(params.url).toBe('/api/v1/storage/s3/my-bucket/folder/file.txt'); + expect(params.host).toBe('s3.example.com'); + }); + }); + + describe('getS3Params - virtual-hosted style (non-http endpoint)', () => { + it('uses virtual-hosted style for non-http endpoints', () => { + const config: S3Config = { + endpoint: 's3.custom-region.amazonaws.com', + region: 'custom-region', + accessKeyId: 'AKIAIOSFODNN7EXAMPLE', + secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + }; + + const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); + + // Virtual-hosted style: bucket NOT in path + expect(params.url).toBe('/object.txt'); + // Bucket prepended to hostname + expect(params.host).toBe('my-bucket.s3.custom-region.amazonaws.com'); + }); + + it('uses default AWS S3 URL when no endpoint provided', () => { + const config: S3Config = { + region: 'us-east-1', + accessKeyId: 'AKIAIOSFODNN7EXAMPLE', + secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + }; + + const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); + + expect(params.url).toBe('/object.txt'); + expect(params.host).toBe('my-bucket.s3.amazonaws.com'); + }); + }); + + describe('getHTTPUrl', () => { + it('constructs full HTTP URL for path-style access with prefix', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com/prefix', + region: 'us-east-1', + }; + + const httpUrl = getHTTPUrl(config, 's3://my-bucket/object.txt'); + + expect(httpUrl).toBe('https://s3.example.com/prefix/my-bucket/object.txt'); + }); + + it('constructs full HTTP URL for virtual-hosted style', () => { + const config: S3Config = { + endpoint: 's3.amazonaws.com', + region: 'us-east-1', + }; + + const httpUrl = getHTTPUrl(config, 's3://my-bucket/object.txt'); + + expect(httpUrl).toBe('https://my-bucket.s3.amazonaws.com/object.txt'); + }); + + it('constructs full HTTP URL with default endpoint', () => { + const config: S3Config = { + region: 'us-east-1', + }; + + const httpUrl = getHTTPUrl(config, 's3://my-bucket/object.txt'); + + expect(httpUrl).toBe('https://my-bucket.s3.amazonaws.com/object.txt'); + }); + }); + + describe('getS3Params - general properties', () => { + it('sets method correctly', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com', + region: 'us-east-1', + accessKeyId: 'test-key', + secretAccessKey: 'test-secret', + }; + + const getParams = getS3Params(config, 's3://bucket/object', 'GET'); + const putParams = getS3Params(config, 's3://bucket/object', 'PUT'); + + expect(getParams.method).toBe('GET'); + expect(putParams.method).toBe('PUT'); + }); + + it('includes session token when provided', () => { + const config: S3Config = { + endpoint: 'https://s3.example.com', + region: 'us-east-1', + accessKeyId: 'test-key', + secretAccessKey: 'test-secret', + sessionToken: 'test-session-token', + }; + + const params = getS3Params(config, 's3://bucket/object', 'GET'); + + expect(params.sessionToken).toBe('test-session-token'); + }); + + it('generates date/datetime fields', () => { + const config: S3Config = { + region: 'us-east-1', + accessKeyId: 'test-key', + secretAccessKey: 'test-secret', + }; + + const params = getS3Params(config, 's3://bucket/object', 'GET'); + + // Date format: YYYYMMDD + expect(params.dateNow).toMatch(/^\d{8}$/); + // Datetime format: YYYYMMDDTHHmmssZ + expect(params.datetimeNow).toMatch(/^\d{8}T\d{6}Z$/); + }); + }); +}); From 0f9748c2586bea23b7b7b08eee006be73765705d Mon Sep 17 00:00:00 2001 From: Greg Richardson Date: Tue, 9 Dec 2025 21:48:39 +0000 Subject: [PATCH 2/3] chore: remove incorrect tests --- packages/duckdb-wasm/test/s3_helper.test.ts | 209 -------------------- 1 file changed, 209 deletions(-) delete mode 100644 packages/duckdb-wasm/test/s3_helper.test.ts diff --git a/packages/duckdb-wasm/test/s3_helper.test.ts b/packages/duckdb-wasm/test/s3_helper.test.ts deleted file mode 100644 index 6515715d1..000000000 --- a/packages/duckdb-wasm/test/s3_helper.test.ts +++ /dev/null @@ -1,209 +0,0 @@ -import { getS3Params, parseS3Url, getHTTPUrl } from '../src/utils/s3_helper'; -import { S3Config } from '../src/bindings'; - -describe('S3 Helper', () => { - describe('parseS3Url', () => { - it('parses basic s3:// URL', () => { - const result = parseS3Url('s3://my-bucket/path/to/object.txt'); - expect(result).toEqual({ - bucket: 'my-bucket', - path: '/path/to/object.txt', - }); - }); - - it('throws on non-s3:// URL', () => { - expect(() => parseS3Url('https://example.com/file.txt')).toThrow("URL needs to start with s3://"); - }); - - it('throws on URL without path', () => { - expect(() => parseS3Url('s3://my-bucket')).toThrow("URL needs to contain a '/' after the host"); - }); - }); - - describe('getS3Params - path-style access with endpoint path prefix', () => { - it('includes endpoint path prefix in signed URL for path-style access', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com/some/path/prefix', - region: 'us-east-1', - accessKeyId: 'AKIAIOSFODNN7EXAMPLE', - secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', - }; - - const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); - - // Path should include: endpoint prefix + bucket + object path - expect(params.url).toBe('/some/path/prefix/my-bucket/object.txt'); - // Host should NOT include the path prefix - expect(params.host).toBe('s3.example.com'); - }); - - it('handles endpoint without path prefix', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com', - region: 'us-east-1', - accessKeyId: 'AKIAIOSFODNN7EXAMPLE', - secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', - }; - - const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); - - // Path should include: bucket + object path (no prefix) - expect(params.url).toBe('/my-bucket/object.txt'); - expect(params.host).toBe('s3.example.com'); - }); - - it('handles endpoint with trailing slash', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com/prefix/', - region: 'us-east-1', - accessKeyId: 'AKIAIOSFODNN7EXAMPLE', - secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', - }; - - const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); - - expect(params.url).toBe('/prefix//my-bucket/object.txt'); - expect(params.host).toBe('s3.example.com'); - }); - - it('includes port in host when present', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com:9000/prefix', - region: 'us-east-1', - accessKeyId: 'AKIAIOSFODNN7EXAMPLE', - secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', - }; - - const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); - - expect(params.url).toBe('/prefix/my-bucket/object.txt'); - // Host should include port - expect(params.host).toBe('s3.example.com:9000'); - }); - - it('handles deep path prefixes', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com/api/v1/storage/s3', - region: 'us-east-1', - accessKeyId: 'AKIAIOSFODNN7EXAMPLE', - secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', - }; - - const params = getS3Params(config, 's3://my-bucket/folder/file.txt', 'GET'); - - expect(params.url).toBe('/api/v1/storage/s3/my-bucket/folder/file.txt'); - expect(params.host).toBe('s3.example.com'); - }); - }); - - describe('getS3Params - virtual-hosted style (non-http endpoint)', () => { - it('uses virtual-hosted style for non-http endpoints', () => { - const config: S3Config = { - endpoint: 's3.custom-region.amazonaws.com', - region: 'custom-region', - accessKeyId: 'AKIAIOSFODNN7EXAMPLE', - secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', - }; - - const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); - - // Virtual-hosted style: bucket NOT in path - expect(params.url).toBe('/object.txt'); - // Bucket prepended to hostname - expect(params.host).toBe('my-bucket.s3.custom-region.amazonaws.com'); - }); - - it('uses default AWS S3 URL when no endpoint provided', () => { - const config: S3Config = { - region: 'us-east-1', - accessKeyId: 'AKIAIOSFODNN7EXAMPLE', - secretAccessKey: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', - }; - - const params = getS3Params(config, 's3://my-bucket/object.txt', 'GET'); - - expect(params.url).toBe('/object.txt'); - expect(params.host).toBe('my-bucket.s3.amazonaws.com'); - }); - }); - - describe('getHTTPUrl', () => { - it('constructs full HTTP URL for path-style access with prefix', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com/prefix', - region: 'us-east-1', - }; - - const httpUrl = getHTTPUrl(config, 's3://my-bucket/object.txt'); - - expect(httpUrl).toBe('https://s3.example.com/prefix/my-bucket/object.txt'); - }); - - it('constructs full HTTP URL for virtual-hosted style', () => { - const config: S3Config = { - endpoint: 's3.amazonaws.com', - region: 'us-east-1', - }; - - const httpUrl = getHTTPUrl(config, 's3://my-bucket/object.txt'); - - expect(httpUrl).toBe('https://my-bucket.s3.amazonaws.com/object.txt'); - }); - - it('constructs full HTTP URL with default endpoint', () => { - const config: S3Config = { - region: 'us-east-1', - }; - - const httpUrl = getHTTPUrl(config, 's3://my-bucket/object.txt'); - - expect(httpUrl).toBe('https://my-bucket.s3.amazonaws.com/object.txt'); - }); - }); - - describe('getS3Params - general properties', () => { - it('sets method correctly', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com', - region: 'us-east-1', - accessKeyId: 'test-key', - secretAccessKey: 'test-secret', - }; - - const getParams = getS3Params(config, 's3://bucket/object', 'GET'); - const putParams = getS3Params(config, 's3://bucket/object', 'PUT'); - - expect(getParams.method).toBe('GET'); - expect(putParams.method).toBe('PUT'); - }); - - it('includes session token when provided', () => { - const config: S3Config = { - endpoint: 'https://s3.example.com', - region: 'us-east-1', - accessKeyId: 'test-key', - secretAccessKey: 'test-secret', - sessionToken: 'test-session-token', - }; - - const params = getS3Params(config, 's3://bucket/object', 'GET'); - - expect(params.sessionToken).toBe('test-session-token'); - }); - - it('generates date/datetime fields', () => { - const config: S3Config = { - region: 'us-east-1', - accessKeyId: 'test-key', - secretAccessKey: 'test-secret', - }; - - const params = getS3Params(config, 's3://bucket/object', 'GET'); - - // Date format: YYYYMMDD - expect(params.dateNow).toMatch(/^\d{8}$/); - // Datetime format: YYYYMMDDTHHmmssZ - expect(params.datetimeNow).toMatch(/^\d{8}T\d{6}Z$/); - }); - }); -}); From d820ff8ce6e4b76cad34ecd638e2b1acaa17f5f0 Mon Sep 17 00:00:00 2001 From: Greg Richardson Date: Tue, 9 Dec 2025 15:19:28 -0700 Subject: [PATCH 3/3] refactor: simplify s3 path fixes --- packages/duckdb-wasm/src/utils/s3_helper.ts | 27 +++++++++------------ 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/packages/duckdb-wasm/src/utils/s3_helper.ts b/packages/duckdb-wasm/src/utils/s3_helper.ts index cb2db00f5..303a363e1 100644 --- a/packages/duckdb-wasm/src/utils/s3_helper.ts +++ b/packages/duckdb-wasm/src/utils/s3_helper.ts @@ -20,18 +20,9 @@ export interface S3PayloadParams { contentType: string | null; } -const getEndpointPathPrefix = function (config: S3Config | undefined): string { - if (config?.endpoint?.startsWith('http')) { - const endpointUrl = new URL(config.endpoint); - // Return the path prefix if present, otherwise empty string - return endpointUrl.pathname !== '/' ? endpointUrl.pathname : ''; - } - return ''; -}; - const getHTTPHost = function (config: S3Config | undefined, url: string, bucket: string): string { if (config?.endpoint?.startsWith('http')) { - // Endpoint is a full url, extract just the hostname (no path) + // Endpoint is a full url, extract just the host (no path) const endpointUrl = new URL(config.endpoint); return endpointUrl.host; } else if (config?.endpoint) { @@ -46,14 +37,20 @@ const getHTTPHost = function (config: S3Config | undefined, url: string, bucket: export function getS3Params(config: S3Config | undefined, url: string, method: string): S3Params { const parsedS3Url = parseS3Url(url); - // when using S3 path-style access, the signed URL should also include the bucket name, - // as it is present in the HTTP URL path. + // when using S3 path-style access, the signed URL should also include the endpoint's path + bucket name, + // as they will both be present in the HTTP URL path. // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-bucket-intro.html#path-style-url-ex let path = parsedS3Url.path; if (isPathStyleAccess(config)) { - // Extract endpoint path prefix if present (e.g., "/some/path/prefix" from "https://host/some/path/prefix") - const endpointPathPrefix = getEndpointPathPrefix(config); - path = `${endpointPathPrefix}/${parsedS3Url.bucket}${path}`; + // Extract endpoint path if present (e.g., "/some/path" from "https://host/some/path") + let endpointPath = ''; + if (config?.endpoint) { + const endpointUrl = new URL(config.endpoint); + if (endpointUrl.pathname !== '/') { + endpointPath = endpointUrl.pathname; + } + } + path = `${endpointPath}/${parsedS3Url.bucket}${path}`; } return { url: path,