
Commit 9fb6a3d

Multi-GPU training (#366)
* Added code to run multi-node, multi-GPU training with Dask.
1 parent b1a3cf0 commit 9fb6a3d

15 files changed: +315 additions, −9 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ __pycache__
 .coverage*
 .mypy_cache/
 .idea/
+.DS_Store

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@ Pillow==9.1.1
 boto3==1.17.52
 botocore==1.20.52
 cryptography==35.0.0
+dask==2022.11.1
+dask-cuda==22.12.0
 gunicorn==19.10.0
 itsdangerous==2.0.1
 matplotlib==3.4.1

src/sagemaker_xgboost_container/algorithm_mode/train.py

Lines changed: 10 additions & 9 deletions
@@ -29,6 +29,7 @@
 from sagemaker_xgboost_container.constants.sm_env_constants import SM_OUTPUT_DATA_DIR
 from sagemaker_xgboost_container.constants.xgb_constants import (
     CUSTOMER_ERRORS,
+    MODEL_NAME,
     XGB_MAXIMIZE_METRICS,
 )
 from sagemaker_xgboost_container.data_utils import (
@@ -40,8 +41,6 @@
 )
 from sagemaker_xgboost_container.prediction_utils import ValidationPredictionRecorder

-MODEL_NAME = "xgboost-model"
-
 logger = logging.getLogger(__name__)


@@ -157,10 +156,6 @@ def sagemaker_train(

     validation_channel = validated_data_config.get("validation", None)
     combine_train_val = "_kfold" in validated_train_config
-    train_dmatrix, val_dmatrix, train_val_dmatrix = get_validated_dmatrices(
-        train_path, val_path, file_type, csv_weights, is_pipe, combine_train_val
-    )
-    checkpoint_dir = checkpoint_config.get("LocalPath", None)
     if val_path is not None:
         if train_path == val_path or os.path.basename(train_path) == os.path.basename(val_path):
             logger.warning(
@@ -170,6 +165,15 @@ def sagemaker_train(
         elif not is_pipe:
             # Check if there is potential data redundancy between training and validation sets
             check_data_redundancy(train_path, val_path)
+
+    # Obtain information about training resources to determine which distributed setup to use, if needed.
+    num_hosts = len(sm_hosts)
+
+    train_dmatrix, val_dmatrix, train_val_dmatrix = get_validated_dmatrices(
+        train_path, val_path, file_type, csv_weights, is_pipe, combine_train_val
+    )
+    checkpoint_dir = checkpoint_config.get("LocalPath", None)
+
     train_args = dict(
         train_cfg=validated_train_config,
         train_dmatrix=train_dmatrix,
@@ -179,9 +183,6 @@ def sagemaker_train(
         checkpoint_dir=checkpoint_dir,
     )

-    # Obtain information about training resources to determine whether to set up Rabit or not
-    num_hosts = len(sm_hosts)
-
     if num_hosts > 1:
         # Wait for hosts to find each other
         logging.info("Distributed node training with {} hosts: {}".format(num_hosts, sm_hosts))

src/sagemaker_xgboost_container/constants/sm_env_constants.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 # Resource related constants
 SM_CURRENT_HOST = "SM_CURRENT_HOST"
 SM_HOSTS = "SM_HOSTS"
+SM_NUM_GPUS = "SM_NUM_GPUS"

 # Data related constants
 SM_CHANNEL_TRAIN = "SM_CHANNEL_TRAIN"

src/sagemaker_xgboost_container/constants/xgb_constants.py

Lines changed: 2 additions & 0 deletions
@@ -91,3 +91,5 @@
 BINARY_HINGE = "binary:hinge"
 MULTI_SOFTMAX = "multi:softmax"
 MULTI_SOFTPROB = "multi:softprob"
+
+MODEL_NAME = "xgboost-model"

src/sagemaker_xgboost_container/distributed_gpu/__init__.py

Whitespace-only changes.
src/sagemaker_xgboost_container/distributed_gpu/dask_cluster_utils.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License'). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the 'license' file accompanying this file. This file is
+# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import socket
+from subprocess import Popen
+
+from dask.distributed import Client
+
+from sagemaker_algorithm_toolkit.exceptions import AlgorithmError, PlatformError
+
+SCHEDULER_EXEC_PATH = "/miniconda3/bin/dask-scheduler"
+CUDA_WORKER_EXEC_PATH = "/miniconda3/bin/dask-cuda-worker"
+
+SCHEDULER_CONN_TIMEOUT = "20s"
+
+
+def start_daemons_in_current_instance(scheduler_address: str, is_scheduler_host: bool):
+    # Dask distributed scheduler API doc: https://docs.dask.org/en/stable/deploying-cli.html
+    scheduler_cli_command = [SCHEDULER_EXEC_PATH, "--no-dashboard"]
+    scheduler_conn_string = f"tcp://{scheduler_address}"
+    # Dask cuda worker API doc: https://docs.rapids.ai/api/dask-cuda/nightly/api.html
+    worker_cli_command = [CUDA_WORKER_EXEC_PATH, scheduler_conn_string, "--no-dashboard"]
+    if is_scheduler_host:
+        Popen(scheduler_cli_command)
+    try:
+        # Ensure that the scheduler is up before starting workers.
+        with Client(scheduler_address, timeout=SCHEDULER_CONN_TIMEOUT):
+            Popen(worker_cli_command)
+    except TimeoutError as e:
+        raise AlgorithmError(
+            f"Couldn't connect to scheduler after {SCHEDULER_CONN_TIMEOUT}. Please try re-running the training job."
+            f" Exception: {e}"
+        )
+
+
+def get_host_ip(host_name: str) -> str:
+    try:
+        host_ip = socket.gethostbyname(host_name)
+    except socket.gaierror as e:
+        # This shouldn't have happened, and it's not the user's fault.
+        raise PlatformError(f"Failed hostname resolution for host '{host_name}', exception: {e}")
+    return host_ip
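
For orientation, a minimal sketch of how the two helpers above might be wired together on each training host. The environment-variable parsing and the 8786 scheduler port mirror values used elsewhere in this commit, but the snippet itself is illustrative and not part of the change.

# Illustrative sketch (not part of this commit): drive the cluster helpers on each host.
import json
import os

from sagemaker_xgboost_container.distributed_gpu.dask_cluster_utils import (
    get_host_ip,
    start_daemons_in_current_instance,
)

# SM_HOSTS is a JSON-encoded list of host names; defaults here are example values.
sm_hosts = json.loads(os.environ.get("SM_HOSTS", '["algo-1"]'))
current_host = os.environ.get("SM_CURRENT_HOST", "algo-1")

scheduler_host = sm_hosts[0]
scheduler_address = f"{get_host_ip(scheduler_host)}:8786"

# The scheduler runs only on the first host; every host starts a dask-cuda worker.
start_daemons_in_current_instance(scheduler_address, current_host == scheduler_host)
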
src/sagemaker_xgboost_container/distributed_gpu/dask_data_utils.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License'). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the 'license' file accompanying this file. This file is
+# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import os
+
+import dask.dataframe as dask_dataframe
+from dask.dataframe import DataFrame, Series
+from dask.distributed import Client, wait
+from xgboost.dask import DaskDMatrix
+
+from sagemaker_algorithm_toolkit.exceptions import AlgorithmError, UserError
+from sagemaker_xgboost_container.data_utils import CSV, PARQUET
+
+
+def _read_data(local_path: str, content_type: str) -> (DataFrame, Series):
+    if content_type == CSV:
+        dataframe = dask_dataframe.read_csv(os.path.join(local_path, "*.csv"), header=None)
+    elif content_type == PARQUET:
+        dataframe = dask_dataframe.read_parquet(local_path)
+    else:
+        raise UserError(f"Unexpected content type '{content_type}'. Supported content types are CSV and PARQUET.")
+
+    target_column = dataframe.columns[0]
+    labels = dataframe[target_column]
+    features = dataframe[dataframe.columns.difference([target_column])]
+
+    return features, labels
+
+
+def get_dataframe_dimensions(dataframe: DataFrame) -> (int, int):
+    df_shape = dataframe.shape
+    # Note that dataframe.shape[0].compute() is an expensive operation.
+    rows = df_shape[0].compute()
+    cols = df_shape[1]
+    return rows, cols
+
+
+def load_data_into_memory(client: Client, local_data_path: str, content_type: str) -> (DataFrame, Series):
+    try:
+        features, labels = _read_data(local_data_path, content_type)
+        # Due to the lazy nature of Dask collections,
+        # most data related errors will likely show up once data load is started here.
+        features, labels = client.persist([features, labels])
+        wait([features, labels])
+    except Exception as e:
+        raise UserError(f"Failed to load data. Exception: {e}")
+    return features, labels
+
+
+def create_dask_dmatrix(client: Client, features: DataFrame, labels: Series) -> DaskDMatrix:
+    try:
+        dmatrix = DaskDMatrix(client, features, labels)
+    except Exception as e:
+        raise AlgorithmError(f"Failed to create DaskDMatrix with given data. Exception: {e}")
+    return dmatrix
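
As a quick orientation, a hedged sketch of how these data helpers might be exercised against a local Dask client; the channel path and the use of the CSV constant are example assumptions, not something prescribed by this commit.

# Illustrative sketch (not part of this commit): load a channel and build a DaskDMatrix.
from dask.distributed import Client

from sagemaker_xgboost_container.data_utils import CSV
from sagemaker_xgboost_container.distributed_gpu.dask_data_utils import (
    create_dask_dmatrix,
    get_dataframe_dimensions,
    load_data_into_memory,
)

with Client() as client:  # in-process local cluster; no GPUs needed for this check
    features, labels = load_data_into_memory(client, "/opt/ml/input/data/train", CSV)
    rows, cols = get_dataframe_dimensions(features)  # the row count triggers a compute()
    print(f"Loaded {rows} rows and {cols} feature columns")
    dmatrix = create_dask_dmatrix(client, features, labels)
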
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License'). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the 'license' file accompanying this file. This file is
+# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import logging
+import os
+import socket
+import time
+from typing import Dict
+
+import xgboost as xgb
+from dask.distributed import Client
+
+from sagemaker_algorithm_toolkit import exceptions as exc
+from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME
+from sagemaker_xgboost_container.distributed_gpu.dask_cluster_utils import (
+    get_host_ip,
+    start_daemons_in_current_instance,
+)
+from sagemaker_xgboost_container.distributed_gpu.dask_data_utils import (
+    create_dask_dmatrix,
+    get_dataframe_dimensions,
+    load_data_into_memory,
+)
+
+logger = logging.getLogger(__name__)
+
+SCHEDULER_PORT = "8786"
+WAIT_FOR_ALL_WORKERS_TIMEOUT_SEC = 20
+WORKER_STAY_ALIVE_CHECK_FREQ_SEC = 10
+
+
+def run_training_with_dask(
+    hyperparameters: Dict,
+    train_path: str,
+    validation_path: str,
+    model_dir: str,
+    content_type: str,
+    sm_hosts: [str],
+    current_host: str,
+    num_gpus: int,
+):
+    scheduler_host = sm_hosts[0]
+    scheduler_host_ip = get_host_ip(scheduler_host)
+
+    scheduler_address = f"{scheduler_host_ip}:{SCHEDULER_PORT}"
+    is_scheduler_host = current_host == scheduler_host
+
+    start_daemons_in_current_instance(scheduler_address, is_scheduler_host)
+
+    total_num_workers = len(sm_hosts) * num_gpus
+
+    # We only need to submit the job from one instance.
+    if is_scheduler_host:
+        with Client(scheduler_address) as client:
+            # We ensure that all workers are present before proceeding.
+            client.wait_for_workers(total_num_workers, WAIT_FOR_ALL_WORKERS_TIMEOUT_SEC)
+
+            logging.info("Starting to read training data...")
+            watchlist = []
+
+            X_train, y_train = load_data_into_memory(client, train_path, content_type)
+
+            dtrain = create_dask_dmatrix(client, X_train, y_train)
+
+            # Log train data dimension for sanity check.
+            train_num_rows, train_num_cols = get_dataframe_dimensions(X_train)
+            logging.info(f"Train features matrix has {train_num_rows} rows and {train_num_cols} columns")
+
+            watchlist.append((dtrain, "train"))
+
+            if validation_path is not None:
+                X_valid, y_valid = load_data_into_memory(client, validation_path, content_type)
+                dvalid = create_dask_dmatrix(client, X_valid, y_valid)
+                watchlist.append((dvalid, "validation"))
+
+            logging.info("Data load complete. Starting training...")
+
+            try:
+                output = xgb.dask.train(
+                    client, hyperparameters, dtrain, num_boost_round=hyperparameters["num_round"], evals=watchlist
+                )
+                booster = output["booster"]
+
+                logging.info("Training complete. Saving model...")
+                booster.save_model(os.path.join(model_dir, MODEL_NAME))
+            except Exception as e:
+                exception_prefix = "XGB train call failed with exception"
+                raise exc.AlgorithmError(f"{exception_prefix}:\n {str(e)}")
+
+            logging.info("Terminating cluster...")
+
+    else:
+        scheduler = (scheduler_host_ip, int(SCHEDULER_PORT))
+        # Do not exit till the job is done.
+        while True:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as alive_socket:
+                alive_check = alive_socket.connect_ex(scheduler)
+                if alive_check != 0:
+                    logging.info("Received a shutdown signal from scheduler. Exiting...")
+                    break
+            time.sleep(WORKER_STAY_ALIVE_CHECK_FREQ_SEC)
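
For context, a hedged sketch of how run_training_with_dask (defined above) might be invoked from the training entry point. The hyperparameters and channel paths are example values, and the direct environment-variable reads stand in for the container's usual argument plumbing; none of this is taken from the commit itself.

# Illustrative sketch (not part of this commit): invoke the Dask GPU training path.
# run_training_with_dask is assumed to be importable from the new module shown above
# (its file name is not visible in this view).
import json
import os

from sagemaker_xgboost_container.data_utils import CSV

hyperparameters = {
    "objective": "binary:logistic",
    "tree_method": "gpu_hist",  # example setting for GPU training
    "num_round": 100,  # read inside run_training_with_dask as num_boost_round
}

run_training_with_dask(
    hyperparameters=hyperparameters,
    train_path="/opt/ml/input/data/train",
    validation_path="/opt/ml/input/data/validation",
    model_dir="/opt/ml/model",
    content_type=CSV,
    sm_hosts=json.loads(os.environ["SM_HOSTS"]),
    current_host=os.environ["SM_CURRENT_HOST"],
    num_gpus=int(os.environ["SM_NUM_GPUS"]),
)
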
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+0,1,0,0,0,0
+0,1,0,0,0,0
+0,1,0,0,0,0
+0,1,0,0,0,0
+1,0,1,0,0,0

0 commit comments
