
Commit 2cdb2d4

fixing unit tests and adding integration test
1 parent 1d6c559 commit 2cdb2d4

File tree

16 files changed: +1492 -0 lines changed

Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
# flake8: noqa
import argparse
import numpy as np
import os
import sys
import logging
import json
import shutil
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pytorch_model_def import get_model

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))
current_dir = os.path.dirname(os.path.abspath(__file__))


def get_train_data(train_dir):
    """
    Get the training data and convert to tensors
    """

    x_train = np.load(os.path.join(train_dir, "x_train.npy"))
    y_train = np.load(os.path.join(train_dir, "y_train.npy"))
    logger.info(f"x train: {x_train.shape}, y train: {y_train.shape}")

    return torch.from_numpy(x_train), torch.from_numpy(y_train)


def get_test_data(test_dir):
    """
    Get the testing data and convert to tensors
    """

    x_test = np.load(os.path.join(test_dir, "x_test.npy"))
    y_test = np.load(os.path.join(test_dir, "y_test.npy"))
    logger.info(f"x test: {x_test.shape}, y test: {y_test.shape}")

    return torch.from_numpy(x_test), torch.from_numpy(y_test)


def model_fn(model_dir):
    """
    Load the model for inference
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_model()
    # map_location keeps loading robust when the weights were saved on a GPU instance
    model.load_state_dict(torch.load(model_dir + "/model.pth", map_location=device))
    model.eval()
    return model.to(device)


def input_fn(request_body, request_content_type):
    """
    Deserialize and prepare the prediction input
    """

    if request_content_type == "application/json":
        request = json.loads(request_body)
        train_inputs = torch.tensor(request)
        return train_inputs
    # Fail loudly instead of silently returning None for unsupported content types
    raise ValueError(f"Unsupported content type: {request_content_type}")


def predict_fn(input_data, model):
    """
    Apply model to the incoming request
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
        # Move the input to the model's device and bring the result back to CPU before numpy()
        return model(input_data.float().to(device)).cpu().numpy()[0]


def parse_args():
    """
    Parse the command line arguments
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-dir",
        type=str,
        default=os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model")),
        help="Directory to save the model",
    )
    parser.add_argument(
        "--train-dir",
        type=str,
        default=os.environ.get("SM_CHANNEL_TRAIN", os.path.join(current_dir, "data/train")),
        help="Directory containing training data",
    )
    parser.add_argument(
        "--test-dir",
        type=str,
        default=os.environ.get("SM_CHANNEL_TEST", os.path.join(current_dir, "data/test")),
        help="Directory containing testing data",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        help="Batch size for training",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=1,
        help="Number of epochs for training",
    )
    parser.add_argument(
        "--learning-rate",
        type=float,
        default=0.1,
        help="Learning rate for training",
    )
    return parser.parse_args()


def train():
    """
    Train the PyTorch model
    """
    args = parse_args()
    # Resolve the device here so train() also works when imported rather than run as a script
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Directories: train, test and model
    train_dir = args.train_dir
    test_dir = args.test_dir
    model_dir = args.model_dir

    # Load the training and testing data
    x_train, y_train = get_train_data(train_dir)
    x_test, y_test = get_test_data(test_dir)
    train_ds = TensorDataset(x_train, y_train)

    # Training parameters - used to configure the training loop
    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    logger.info(
        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
    )

    train_dl = DataLoader(train_ds, batch_size, shuffle=True)

    # Define the model, loss function and optimizer
    model = get_model()
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(epochs):
        for x_train_batch, y_train_batch in train_dl:
            # Keep the batch on the same device as the model
            x_train_batch = x_train_batch.to(device)
            y_train_batch = y_train_batch.to(device)
            y = model(x_train_batch.float())
            loss = criterion(y.flatten(), y_train_batch.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logger.info(f"epoch: {epoch + 1} -> loss: {loss}")

    # Test the model
    with torch.no_grad():
        y = model(x_test.float().to(device)).flatten().cpu()
        mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
        print("\nTest MSE:", mse.numpy())

    # Save the model
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), model_dir + "/model.pth")
    inference_code_path = model_dir + "/code/"

    if not os.path.exists(inference_code_path):
        os.mkdir(inference_code_path)
        logger.info("Created a folder at {}!".format(inference_code_path))

    shutil.copy("custom_script.py", inference_code_path)
    shutil.copy("pytorch_model_def.py", inference_code_path)
    logger.info("Saving model files to {}".format(inference_code_path))


if __name__ == "__main__":
    print("Running the training job ...\n")

    train()
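
For reference, model_fn, input_fn, and predict_fn above follow the SageMaker PyTorch inference handler contract: load the model, deserialize the JSON request, run the forward pass. A minimal local smoke test might look like the sketch below; it assumes the training script is saved as custom_script.py (as the shutil.copy call suggests) and uses a throwaway temporary directory and a random input purely for illustration.

# Hypothetical local smoke test for the inference handlers above; not part of the commit.
import json
import tempfile

import numpy as np
import torch

import custom_script  # assumes the script above is saved as custom_script.py
from pytorch_model_def import get_model

with tempfile.TemporaryDirectory() as model_dir:
    # Save an (untrained) model the same way train() does, then reload it via model_fn
    torch.save(get_model().state_dict(), model_dir + "/model.pth")
    model = custom_script.model_fn(model_dir)

    # One fake JSON request with a single 8-feature row, matching NeuralNet's input size
    body = json.dumps([np.random.rand(8).tolist()])
    inputs = custom_script.input_fn(body, "application/json")
    print("prediction:", custom_script.predict_fn(inputs, model))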
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
# flake8: noqa
import torch
import torch.nn as nn


class NeuralNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 8)
        self.fc2 = nn.Linear(8, 6)
        self.fc3 = nn.Linear(6, 1)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = self.fc3(x)
        return x


def get_model():
    model = NeuralNet()
    return model
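
As a quick sanity check (illustrative only, not part of the commit), the network maps an 8-feature input to a single regression output:

import torch
from pytorch_model_def import get_model

model = get_model()
out = model(torch.randn(4, 8))  # a batch of 4 rows with 8 features each
print(out.shape)  # torch.Size([4, 1])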
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
numpy
-f https://download.pytorch.org/whl/torch_stable.html
torch==2.7.0

sagemaker-train/tests/integ/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -12,3 +12,7 @@
# language governing permissions and limitations under the License.
"""This module contains the Integ Tests for SageMaker PySDK Training."""
from __future__ import absolute_import

import os

DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""AWS Batch integration tests"""
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import time
16+
17+
18+
class BatchTestResourceManager:
19+
20+
def __init__(
21+
self,
22+
batch_client,
23+
queue_name="pysdk-test-queue",
24+
service_env_name="pysdk-test-queue-service-environment",
25+
):
26+
self.batch_client = batch_client
27+
self.queue_name = queue_name
28+
self.service_environment_name = service_env_name
29+
30+
def _create_or_get_service_environment(self, service_environment_name):
31+
print(f"Creating service environment: {service_environment_name}")
32+
try:
33+
response = self.batch_client.create_service_environment(
34+
serviceEnvironmentName=service_environment_name,
35+
serviceEnvironmentType="SAGEMAKER_TRAINING",
36+
capacityLimits=[{"maxCapacity": 10, "capacityUnit": "NUM_INSTANCES"}],
37+
)
38+
print(f"Service environment {service_environment_name} created successfully.")
39+
return response
40+
except Exception as e:
41+
if "Object already exists" in str(e):
42+
print("Resource already exists. Fetching existing resource.")
43+
response = self.batch_client.describe_service_environments(
44+
serviceEnvironments=[service_environment_name]
45+
)
46+
return response["serviceEnvironments"][0]
47+
else:
48+
print(f"Error creating service environment: {e}")
49+
raise
50+
51+
def _create_or_get_queue(self, queue_name, service_environment_arn):
52+
53+
print(f"Creating job queue: {queue_name}")
54+
try:
55+
response = self.batch_client.create_job_queue(
56+
jobQueueName=queue_name,
57+
priority=1,
58+
computeEnvironmentOrder=[],
59+
serviceEnvironmentOrder=[
60+
{
61+
"order": 1,
62+
"serviceEnvironment": service_environment_arn,
63+
},
64+
],
65+
jobQueueType="SAGEMAKER_TRAINING",
66+
)
67+
print(f"Job queue {queue_name} created successfully.")
68+
return response
69+
except Exception as e:
70+
if "Object already exists" in str(e):
71+
print("Resource already exists. Fetching existing resource.")
72+
response = self.batch_client.describe_job_queues(jobQueues=[queue_name])
73+
return response["jobQueues"][0]
74+
else:
75+
print(f"Error creating job queue: {e}")
76+
raise
77+
78+
def _update_queue_state(self, queue_name, state):
79+
try:
80+
print(f"Updating queue {queue_name} to state {state}")
81+
response = self.batch_client.update_job_queue(jobQueue=queue_name, state=state)
82+
return response
83+
except Exception as e:
84+
print(f"Error updating queue: {e}")
85+
86+
def _update_service_environment_state(self, service_environment_name, state):
87+
print(f"Updating service environment {service_environment_name} to state {state}")
88+
try:
89+
response = self.batch_client.update_service_environment(
90+
serviceEnvironment=service_environment_name, state=state
91+
)
92+
return response
93+
except Exception as e:
94+
print(f"Error updating service environment: {e}")
95+
96+
def _wait_for_queue_state(self, queue_name, state):
97+
print(f"Waiting for queue {queue_name} to be {state}...")
98+
while True:
99+
response = self.batch_client.describe_job_queues(jobQueues=[queue_name])
100+
print(f"Current state: {response}")
101+
if response["jobQueues"][0]["state"] == state:
102+
break
103+
time.sleep(5)
104+
print(f"Queue {queue_name} is now {state}.")
105+
106+
def _wait_for_service_environment_state(self, service_environment_name, state):
107+
print(f"Waiting for service environment {service_environment_name} to be {state}...")
108+
while True:
109+
response = self.batch_client.describe_service_environments(
110+
serviceEnvironments=[service_environment_name]
111+
)
112+
print(f"Current state: {response}")
113+
if response["serviceEnvironments"][0]["state"] == state:
114+
break
115+
time.sleep(5)
116+
print(f"Service environment {service_environment_name} is now {state}.")
117+
118+
def get_or_create_resources(self, queue_name=None, service_environment_name=None):
119+
queue_name = queue_name or self.queue_name
120+
service_environment_name = service_environment_name or self.service_environment_name
121+
122+
service_environment = self._create_or_get_service_environment(service_environment_name)
123+
if service_environment.get("state") != "ENABLED":
124+
self._update_service_environment_state(service_environment_name, "ENABLED")
125+
self._wait_for_service_environment_state(service_environment_name, "ENABLED")
126+
time.sleep(10)
127+
128+
queue = self._create_or_get_queue(queue_name, service_environment["serviceEnvironmentArn"])
129+
if queue.get("state") != "ENABLED":
130+
self._update_queue_state(queue_name, "ENABLED")
131+
self._wait_for_queue_state(queue_name, "ENABLED")
132+
time.sleep(10)
133+
return queue, service_environment
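
A rough sketch of how an integration test might drive BatchTestResourceManager; the boto3 client construction and region are assumptions for illustration, not part of this commit.

# Hypothetical usage from an integ test; region and client setup are assumed.
import boto3

batch_client = boto3.client("batch", region_name="us-west-2")
resource_manager = BatchTestResourceManager(batch_client)

# Ensures the SAGEMAKER_TRAINING service environment and job queue exist and are ENABLED
queue, service_environment = resource_manager.get_or_create_resources()
print(queue.get("jobQueueArn"), service_environment.get("serviceEnvironmentArn"))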
