Commit 8a4c359

Merge branch 'master' into master
2 parents bfcfae0 + 2c7c4b5 commit 8a4c359

26 files changed: +1139 −82

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+---
+name: Bug report
+about: File a report to help us reproduce and fix the problem
+title: ''
+labels: 'bug'
+assignees: ''
+
+---
+
+**PySDK Version**
+- [ ] PySDK V2 (2.x)
+- [ ] PySDK V3 (3.x)
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To reproduce**
+A clear, step-by-step set of instructions to reproduce the bug.
+The provided code needs to be **complete** and **runnable**; if additional data is needed, please include it in the issue.
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots or logs**
+If applicable, add screenshots or logs to help explain your problem.
+
+**System information**
+A description of your system. Please provide:
+- **SageMaker Python SDK version**:
+- **Framework name (e.g. PyTorch) or algorithm (e.g. KMeans)**:
+- **Framework version**:
+- **Python version**:
+- **CPU or GPU**:
+- **Custom Docker image (Y/N)**:
+
+**Additional context**
+Add any other context about the problem here.

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Ask a question
+    url: https://github.com/aws/sagemaker-python-sdk/discussions
+    about: Use GitHub Discussions to ask and answer questions
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+---
+name: Documentation request
+about: Request improved documentation
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**What did you find confusing? Please describe.**
+A clear and concise description of what you found confusing. Ex. I tried to [...] but I didn't understand how to [...]
+
+**Describe how documentation can be improved**
+A clear and concise description of where documentation was lacking and how it can be improved.
+
+**Additional context**
+Add any other context or screenshots about the documentation request here.
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest new functionality for this library
+title: ''
+labels: 'feature request'
+assignees: ''
+
+---
+
+**Describe the feature you'd like**
+A clear and concise description of the functionality you want.
+
+**How would this feature be used? Please describe.**
+A clear and concise description of the use case for this feature. Please provide an example, if possible.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.

sagemaker-core/src/sagemaker/core/local/utils.py

Lines changed: 5 additions & 1 deletion
@@ -137,7 +137,11 @@ def get_child_process_ids(pid):
     Returns:
         (List[int]): Child process ids
     """
-    cmd = f"pgrep -P {pid}".split()
+    if not str(pid).isdigit():
+        raise ValueError("Invalid PID")
+
+    cmd = ["pgrep", "-P", str(pid)]
+
     process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     output, err = process.communicate()
     if err:
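
As a standalone sketch of the hardened pattern (the helper name child_pids_sketch is hypothetical, not the SDK's API; it assumes a Unix host with pgrep on PATH): passing argv as a list means no shell ever parses the PID, and the isdigit() guard rejects values such as "123; rm -rf /" before pgrep runs.

import subprocess

def child_pids_sketch(pid):
    """Return the child PIDs of `pid`, rejecting non-numeric input up front."""
    if not str(pid).isdigit():
        raise ValueError("Invalid PID")
    result = subprocess.run(
        ["pgrep", "-P", str(pid)],  # argv list: the PID stays a literal argument
        capture_output=True,
        text=True,
    )
    # pgrep exits 1 and prints nothing when the process has no children
    return [int(token) for token in result.stdout.split()]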

sagemaker-core/tests/unit/local/test_local_utils.py

Lines changed: 8 additions & 5 deletions
@@ -103,21 +103,24 @@ def test_recursive_copy(copy_tree, m_os_path):
 @patch("sagemaker.core.local.utils.os")
 @patch("sagemaker.core.local.utils.get_child_process_ids")
 def test_kill_child_processes(m_get_child_process_ids, m_os):
-    m_get_child_process_ids.return_value = ["child_pids"]
-    kill_child_processes("pid")
-    m_os.kill.assert_called_with("child_pids", 15)
+    m_get_child_process_ids.return_value = ["345"]
+    kill_child_processes("123")
+    m_os.kill.assert_called_with("345", 15)
 
 
 @patch("sagemaker.core.local.utils.subprocess")
 def test_get_child_process_ids(m_subprocess):
-    cmd = "pgrep -P pid".split()
+    cmd = "pgrep -P 123".split()
     process_mock = Mock()
     attrs = {"communicate.return_value": (b"\n", False), "returncode": 0}
     process_mock.configure_mock(**attrs)
     m_subprocess.Popen.return_value = process_mock
-    get_child_process_ids("pid")
+    get_child_process_ids("123")
     m_subprocess.Popen.assert_called_with(cmd, stdout=m_subprocess.PIPE, stderr=m_subprocess.PIPE)
 
+def test_get_child_process_ids_exception():
+    with pytest.raises(ValueError, match="Invalid PID"):
+        get_child_process_ids("abc")
 
 @patch("sagemaker.core.local.utils.subprocess")
 def test_get_docker_host(m_subprocess):
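
The new exception test relies on pytest.raises(..., match=...), where match is applied as a regular-expression search against the exception message. A self-contained illustration of the idiom (the stub below mimics the validation guard and is not the SDK's code):

import pytest

def get_child_pids_stub(pid):
    # Stand-in for get_child_process_ids; only the error path matters here.
    if not str(pid).isdigit():
        raise ValueError("Invalid PID")
    return []

def test_rejects_non_numeric_pid():
    # match="Invalid PID" is searched (re.search) against str(ValueError(...)).
    with pytest.raises(ValueError, match="Invalid PID"):
        get_child_pids_stub("abc")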

sagemaker-train/src/sagemaker/ai_registry/dataset.py

Lines changed: 29 additions & 16 deletions
@@ -24,6 +24,7 @@
 
 import pandas as pd
 
+from sagemaker.ai_registry.dataset_format_detector import DatasetFormatDetector
 from sagemaker.ai_registry.air_hub import AIRHub
 from sagemaker.ai_registry.air_utils import _determine_new_version, _get_default_bucket
 from sagemaker.ai_registry.air_constants import (
@@ -179,6 +180,21 @@ def _validate_dataset_file(cls, file_path: str) -> None:
         max_size_mb = DATASET_MAX_FILE_SIZE_BYTES / (1024 * 1024)
         raise ValueError(f"File size {file_size_mb:.2f} MB exceeds maximum allowed size of {max_size_mb:.0f} MB")
 
+    @classmethod
+    def _validate_dataset_format(cls, file_path: str) -> None:
+        """Validate dataset format using DatasetFormatDetector.
+
+        Args:
+            file_path: Path to the dataset file (local path)
+
+        Raises:
+            ValueError: If dataset format cannot be detected
+        """
+        detector = DatasetFormatDetector()
+        format_name = detector.validate_dataset(file_path)
+        if format_name is False:
+            raise ValueError(f"Unable to detect format for {file_path}. Please provide a valid dataset file.")
+
     @classmethod
     @_telemetry_emitter(feature=Feature.MODEL_CUSTOMIZATION, func_name="DataSet.get")
     def get(cls, name: str, sagemaker_session=None) -> "DataSet":
@@ -257,28 +273,25 @@ def create(
             s3_prefix = s3_key  # Use full path including filename
             method = DataSetMethod.GENERATED
 
-            # Download and validate if customization technique is provided
-            if customization_technique:
-                with tempfile.NamedTemporaryFile(
-                    delete=False, suffix=os.path.splitext(s3_key)[1]
-                ) as tmp_file:
-                    local_path = tmp_file.name
-
-                try:
-                    AIRHub.download_from_s3(source, local_path)
-                    validate_dataset(local_path, customization_technique.value)
-                finally:
-                    if os.path.exists(local_path):
-                        os.remove(local_path)
+            # Download and validate format
+            with tempfile.NamedTemporaryFile(
+                delete=False, suffix=os.path.splitext(s3_key)[1]
+            ) as tmp_file:
+                local_path = tmp_file.name
+
+            try:
+                AIRHub.download_from_s3(source, local_path)
+                cls._validate_dataset_format(local_path)
+            finally:
+                if os.path.exists(local_path):
+                    os.remove(local_path)
         else:
             # Local file - upload to S3
             bucket_name = _get_default_bucket()
             s3_prefix = _get_default_s3_prefix(name)
             method = DataSetMethod.UPLOADED
 
-            if customization_technique:
-                validate_dataset(source, customization_technique.value)
-
+            cls._validate_dataset_format(source)
             AIRHub.upload_to_s3(bucket_name, s3_prefix, source)
 
         # Create hub content document
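
The net effect of these hunks: format validation now always runs before upload or registration, instead of only when a customization_technique was supplied. A self-contained sketch of the wrapper's contract, with a stubbed detector so it runs on its own (StubDetector and the module-level validate_dataset_format are illustrative, not the SDK's API):

class StubDetector:
    """Stand-in for DatasetFormatDetector; the real one is schema-based."""

    def validate_dataset(self, file_path: str) -> bool:
        # Toy rule for illustration only.
        return file_path.endswith(".jsonl")

def validate_dataset_format(file_path: str) -> None:
    """Mirror of _validate_dataset_format: a False result becomes a ValueError."""
    if StubDetector().validate_dataset(file_path) is False:
        raise ValueError(
            f"Unable to detect format for {file_path}. "
            "Please provide a valid dataset file."
        )

validate_dataset_format("train.jsonl")   # passes silently
try:
    validate_dataset_format("train.csv")  # raises
except ValueError as err:
    print(err)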
sagemaker-train/src/sagemaker/ai_registry/dataset_format_detector.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import json
+from typing import Dict, Any, Optional
+from pathlib import Path
+
+
+class DatasetFormatDetector:
+    """Utility class for detecting dataset formats."""
+
+    # Schema directory
+    SCHEMA_DIR = Path(__file__).parent / "schemas"
+
+    @staticmethod
+    def _load_schema(format_name: str) -> Dict[str, Any]:
+        """Load JSON schema for a format."""
+        schema_path = DatasetFormatDetector.SCHEMA_DIR / f"{format_name}.json"
+        if schema_path.exists():
+            with open(schema_path) as f:
+                return json.load(f)
+        return {}
+
+    @staticmethod
+    def validate_dataset(file_path: str) -> bool:
+        """
+        Validate if the dataset adheres to any known format.
+
+        Args:
+            file_path: Path to the JSONL file
+
+        Returns:
+            True if dataset is valid according to any known format, False otherwise
+        """
+        import jsonschema
+
+        # Schema-based formats
+        schema_formats = [
+            "dpo", "converse", "hf_preference", "hf_prompt_completion",
+            "verl", "openai_chat", "genqa"
+        ]
+
+        try:
+            with open(file_path, 'r') as f:
+                for line in f:
+                    line = line.strip()
+                    if line:
+                        data = json.loads(line)
+
+                        # Try schema validation first
+                        for format_name in schema_formats:
+                            schema = DatasetFormatDetector._load_schema(format_name)
+                            if schema:
+                                try:
+                                    jsonschema.validate(instance=data, schema=schema)
+                                    return True
+                                except jsonschema.exceptions.ValidationError:
+                                    continue
+
+                        # Check for RFT-style format (messages + additional fields)
+                        if DatasetFormatDetector._is_rft_format(data):
+                            return True
+                        break
+            return False
+        except (json.JSONDecodeError, FileNotFoundError, IOError):
+            return False
+
+    @staticmethod
+    def _is_rft_format(data: Dict[str, Any]) -> bool:
+        """Check if data matches RFT format pattern."""
+        if not isinstance(data, dict) or "messages" not in data:
+            return False
+
+        messages = data["messages"]
+        if not isinstance(messages, list) or not messages:
+            return False
+
+        # Check message structure
+        for msg in messages:
+            if not isinstance(msg, dict):
+                return False
+            if "role" not in msg or "content" not in msg:
+                return False
+            if not isinstance(msg["role"], str) or not isinstance(msg["content"], str):
+                return False
+
+        return True
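
A usage sketch of the RFT fallback path, written against the class above (the record contents and the reward_fn key are illustrative; it assumes jsonschema is installed, since validate_dataset imports it, and that no schema files are present, so detection falls through to _is_rft_format):

import json
import tempfile

record = {
    "messages": [
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "4"},
    ],
    "reward_fn": "exact_match",  # extra keys are allowed; only "messages" is checked
}

with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write(json.dumps(record) + "\n")
    path = f.name

print(DatasetFormatDetector.validate_dataset(path))  # True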
