Address review comments for ordered-categorical corr (docstring and tests)

pandeconscious · pandeconscious · commit 259424e6ba56 · 2025-11-18T23:29:04.000Z
diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py
@@ -16,8 +16,7 @@
 
 def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame:
     """
-    any ordered categorical columns are transformed to the respective
-    categorical codes while other columns remain untouched
+    Replace ordered categoricals with their codes, making a shallow copy if necessary.
     """
 
     result = df
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -262,31 +262,21 @@ def test_corr_rank_ordered_categorical(
     ):
         df = DataFrame(
             {
-                "ord_cat": Series(
-                    pd.Categorical(
-                        ["low", "m", "h", "vh"],
-                        categories=["low", "m", "h", "vh"],
-                        ordered=True,
-                    )
+                "ord_cat": pd.Categorical(
+                    ["low", "m", "h", "vh"],
+                    categories=["low", "m", "h", "vh"],
+                    ordered=True,
                 ),
-                "ord_cat_none": Series(
-                    pd.Categorical(
-                        ["low", "m", "h", None],
-                        categories=["low", "m", "h"],
-                        ordered=True,
-                    )
+                "ord_cat_none": pd.Categorical(
+                    ["low", "m", "h", None],
+                    categories=["low", "m", "h"],
+                    ordered=True,
                 ),
-                "ord_int": Series([0, 1, 2, 3]),
-                "ord_float": Series([2.0, 3.0, 4.5, 6.5]),
-                "ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
-                "ord_cat_shuff": Series(
-                    pd.Categorical(
-                        ["m", "h", "vh", "low"],
-                        categories=["low", "m", "h", "vh"],
-                        ordered=True,
-                    )
+                "ord_cat_shuff": pd.Categorical(
+                    ["m", "h", "vh", "low"],
+                    categories=["low", "m", "h", "vh"],
+                    ordered=True,
                 ),
-                "ord_int_shuff": Series([2, 3, 0, 1]),
             }
         )
         corr_calc = df.corr(method=method)
@@ -300,24 +290,16 @@ def test_corr_rank_ordered_categorical_duplicate_columns(
         self,
         method,
     ):
+        cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
         df = DataFrame(
             {
-                "a": [1, 2, 3, 4],
-                "b": [4, 3, 2, 1],
+                "a": pd.array([1, 2, 3, 4], dtype=cat),
+                "b": pd.array([4, 3, 2, 1], dtype=cat),
                 "c": [4, 3, 2, 1],
                 "d": [10, 20, 30, 40],
                 "e": [100, 200, 300, 400],
             }
         )
-        df["a"] = (
-            df["a"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
-        )
-        df["b"] = (
-            df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
-        )
-        df["c"] = (
-            df["c"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
-        )
         df.columns = ["a", "a", "c", "c", "e"]
 
         corr_calc = df.corr(method=method)
diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py
@@ -2,9 +2,14 @@
 Tests for core/methods/corr.py
 """
 
-import pytest
 import numpy as np
-from pandas import DataFrame, Series, Categorical
+import pytest
+
+from pandas import (
+    Categorical,
+    DataFrame,
+    Series,
+)
 import pandas._testing as tm
 from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols
 
@@ -75,22 +80,22 @@
             # second 'dup' is non-categorical
             DataFrame(
                 {
-                    "dup": Series(
+                    "dup_1": Series(
                         Categorical(
                             ["low", "m", "h"],
                             categories=["low", "m", "h"],
                             ordered=True,
                         )
                     ),
-                    "dup": Series([5, 6, 7]),  # duplicate name, later column
+                    "dup_2": Series([5, 6, 7]),  # renamed to "dup" in the test
                 }
             ),
             DataFrame(
                 {
                     # After transform: position 0 (ordered cat) becomes codes [0,1,2],
                     # position 1 remains untouched numbers [5,6,7].
-                    "dup": Series([0, 1, 2], dtype="int8"),
-                    "dup": Series([5, 6, 7]),
+                    "dup_1": Series([0, 1, 2], dtype="int8"),
+                    "dup_2": Series([5, 6, 7]),
                 }
             ),
             id="duplicate-names-ordered-first",
@@ -100,15 +105,15 @@
             # second 'dup' is ordered categorical, third 'dup' is ordered categorical
             DataFrame(
                 {
-                    "dup": Series(["a", "b", "c"]),  # non-categorical (object)
-                    "dup": Series(
+                    "dup_1": Series(["a", "b", "c"]),  # non-categorical (object)
+                    "dup_2": Series(
                         Categorical(
                             ["p", "q", None],
                             categories=["p", "q"],
                             ordered=True,
                         )
                     ),
-                    "dup": Series(
+                    "dup_3": Series(
                         Categorical(
                             ["low", "m", "h"],
                             categories=["low", "m", "h"],
@@ -121,16 +126,21 @@
                 {
                     # First stays object; second turns into codes [0, 1, NaN]
                     # and third changes into codes [0, 1, 2]
-                    "dup": Series(["a", "b", "c"]),
-                    "dup": Series([0.0, 1.0, np.nan]),
-                    "dup": Series([0, 1, 2], dtype="int8"),
+                    "dup_1": Series(["a", "b", "c"]),
+                    "dup_2": Series([0.0, 1.0, np.nan]),
+                    "dup_3": Series([0, 1, 2], dtype="int8"),
                 }
             ),
             id="duplicate-names-ordered-and-non-categorical-and-none",
         ),
     ],
 )
 def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df):
+    # Rename "dup_N" placeholders to "dup" (dict keys cannot be duplicated)
+    if "dup_1" in input_df.columns:
+        input_df.columns = ["dup" for _ in range(len(input_df.columns))]
+        expected_df.columns = ["dup" for _ in range(len(expected_df.columns))]
+
     out_df = transform_ord_cat_cols_to_coded_cols(input_df)
     assert list(out_df.columns) == list(expected_df.columns)
     for i, col in enumerate(out_df.columns):
diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py
@@ -187,19 +187,19 @@ def test_corr_callable_method(self, datetime_series):
 
     @pytest.mark.parametrize("method", ["kendall", "spearman"])
     @pytest.mark.parametrize(
-        "ord_cat_series",
+        "cat_series",
         [
-            Series(  # ordered categorical series
-                pd.Categorical(
-                    ["low", "med", "high", "very_high"],
-                    categories=["low", "med", "high", "very_high"],
+            Series(
+                pd.Categorical(  # ordered cat series
+                    ["low", "medium", "high"],
+                    categories=["low", "medium", "high"],
                     ordered=True,
                 )
             ),
-            Series(  # ordered categorical series with nan and a different ranking
-                pd.Categorical(
-                    ["h", "low", "vh", None],
-                    categories=["low", "m", "h", "vh"],
+            Series(
+                pd.Categorical(  # ordered cat series with NA
+                    ["low", "medium", "high", None],
+                    categories=["low", "medium", "high"],
                     ordered=True,
                 )
             ),
@@ -208,36 +208,23 @@ def test_corr_callable_method(self, datetime_series):
     @pytest.mark.parametrize(
         "other_series",
         [
-            Series(  # int series against which tord cat series is correlated
-                [0, 1, 2, 3]
-            ),
-            Series(  # float series against which ord cat series is correlated
-                [2.0, 3.0, 4.5, 6.5]
-            ),
-            Series(  # other ord cat series against which ord cat series is correlated
+            Series(  # other ordered categorical series
                 pd.Categorical(
-                    ["high", "low", "very_high", "med"],
-                    categories=["low", "med", "high", "very_high"],
+                    ["m", "l", "h"],
+                    categories=["l", "m", "h"],
                     ordered=True,
                 )
             ),
+            # other non-categorical series
+            Series([2, 1, 3]),
         ],
     )
     def test_corr_rank_ordered_categorical(
         self,
         method,
-        ord_cat_series,
+        cat_series,
         other_series,
     ):
-        stats = pytest.importorskip("scipy.stats")
-        method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
-        ord_ser_cat_codes = ord_cat_series.cat.codes.replace(-1, np.nan)
-
-        if other_series.dtype == "category" and other_series.cat.ordered:
-            other_series = other_series.cat.codes.replace(-1, np.nan)
-
-        corr_calc = ord_cat_series.corr(other_series, method=method)
-        corr_expected = method_scipy_func[method](
-            ord_ser_cat_codes, other_series, nan_policy="omit"
-        )[0]
-        tm.assert_almost_equal(corr_calc, corr_expected)
+        expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5}
+        corr_calc = cat_series.corr(other_series, method=method)
+        tm.assert_almost_equal(corr_calc, expected_corr[method])