Skip to content

Commit 259424e

Browse files
addressing review comments
1 parent a625520 commit 259424e

File tree

4 files changed

+56
-78
lines changed

4 files changed

+56
-78
lines changed

pandas/core/methods/corr.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,7 @@
1616

1717
def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame:
1818
"""
19-
any ordered categorical columns are transformed to the respective
20-
categorical codes while other columns remain untouched
19+
Replace ordered categoricals with their codes, making a shallow copy if necessary.
2120
"""
2221

2322
result = df

pandas/tests/frame/methods/test_cov_corr.py

Lines changed: 15 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -262,31 +262,21 @@ def test_corr_rank_ordered_categorical(
262262
):
263263
df = DataFrame(
264264
{
265-
"ord_cat": Series(
266-
pd.Categorical(
267-
["low", "m", "h", "vh"],
268-
categories=["low", "m", "h", "vh"],
269-
ordered=True,
270-
)
265+
"ord_cat": pd.Categorical(
266+
["low", "m", "h", "vh"],
267+
categories=["low", "m", "h", "vh"],
268+
ordered=True,
271269
),
272-
"ord_cat_none": Series(
273-
pd.Categorical(
274-
["low", "m", "h", None],
275-
categories=["low", "m", "h"],
276-
ordered=True,
277-
)
270+
"ord_cat_none": pd.Categorical(
271+
["low", "m", "h", None],
272+
categories=["low", "m", "h"],
273+
ordered=True,
278274
),
279-
"ord_int": Series([0, 1, 2, 3]),
280-
"ord_float": Series([2.0, 3.0, 4.5, 6.5]),
281-
"ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
282-
"ord_cat_shuff": Series(
283-
pd.Categorical(
284-
["m", "h", "vh", "low"],
285-
categories=["low", "m", "h", "vh"],
286-
ordered=True,
287-
)
275+
"ord_cat_shuff": pd.Categorical(
276+
["m", "h", "vh", "low"],
277+
categories=["low", "m", "h", "vh"],
278+
ordered=True,
288279
),
289-
"ord_int_shuff": Series([2, 3, 0, 1]),
290280
}
291281
)
292282
corr_calc = df.corr(method=method)
@@ -300,24 +290,16 @@ def test_corr_rank_ordered_categorical_duplicate_columns(
300290
self,
301291
method,
302292
):
293+
cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
303294
df = DataFrame(
304295
{
305-
"a": [1, 2, 3, 4],
306-
"b": [4, 3, 2, 1],
296+
"a": pd.array([1, 2, 3, 4], dtype=cat),
297+
"b": pd.array([4, 3, 2, 1], dtype=cat),
307298
"c": [4, 3, 2, 1],
308299
"d": [10, 20, 30, 40],
309300
"e": [100, 200, 300, 400],
310301
}
311302
)
312-
df["a"] = (
313-
df["a"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
314-
)
315-
df["b"] = (
316-
df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
317-
)
318-
df["c"] = (
319-
df["c"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
320-
)
321303
df.columns = ["a", "a", "c", "c", "e"]
322304

323305
corr_calc = df.corr(method=method)

pandas/tests/methods/corr.py

Lines changed: 22 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,14 @@
22
Tests for core/methods/corr.py
33
"""
44

5-
import pytest
65
import numpy as np
7-
from pandas import DataFrame, Series, Categorical
6+
import pytest
7+
8+
from pandas import (
9+
Categorical,
10+
DataFrame,
11+
Series,
12+
)
813
import pandas._testing as tm
914
from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols
1015

@@ -75,22 +80,22 @@
7580
# second 'dup' is non-categorical
7681
DataFrame(
7782
{
78-
"dup": Series(
83+
"dup_1": Series(
7984
Categorical(
8085
["low", "m", "h"],
8186
categories=["low", "m", "h"],
8287
ordered=True,
8388
)
8489
),
85-
"dup": Series([5, 6, 7]), # duplicate name, later column
90+
"dup_2": Series([5, 6, 7]), # duplicate name, later column
8691
}
8792
),
8893
DataFrame(
8994
{
9095
# After transform: position 0 (ordered cat) becomes codes [0,1,2],
9196
# position 1 remains untouched numbers [5,6,7].
92-
"dup": Series([0, 1, 2], dtype="int8"),
93-
"dup": Series([5, 6, 7]),
97+
"dup_1": Series([0, 1, 2], dtype="int8"),
98+
"dup_2": Series([5, 6, 7]),
9499
}
95100
),
96101
id="duplicate-names-ordered-first",
@@ -100,15 +105,15 @@
100105
# second 'dup' is ordered categorical, third 'dup' is ordered categorical
101106
DataFrame(
102107
{
103-
"dup": Series(["a", "b", "c"]), # non-categorical (object)
104-
"dup": Series(
108+
"dup_1": Series(["a", "b", "c"]), # non-categorical (object)
109+
"dup_2": Series(
105110
Categorical(
106111
["p", "q", None],
107112
categories=["p", "q"],
108113
ordered=True,
109114
)
110115
),
111-
"dup": Series(
116+
"dup_3": Series(
112117
Categorical(
113118
["low", "m", "h"],
114119
categories=["low", "m", "h"],
@@ -121,16 +126,21 @@
121126
{
122127
# First stays object; second turns into codes [0, 1, NaN]
123128
# and third changes into codes [0, 1, 2]
124-
"dup": Series(["a", "b", "c"]),
125-
"dup": Series([0.0, 1.0, np.nan]),
126-
"dup": Series([0, 1, 2], dtype="int8"),
129+
"dup_1": Series(["a", "b", "c"]),
130+
"dup_2": Series([0.0, 1.0, np.nan]),
131+
"dup_3": Series([0, 1, 2], dtype="int8"),
127132
}
128133
),
129134
id="duplicate-names-ordered-and-non-categorical-and-none",
130135
),
131136
],
132137
)
133138
def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df):
139+
# Dict keys must be unique, so rename the placeholder "dup_N" columns to a shared "dup" label to create duplicate column names.
140+
if "dup_1" in input_df.columns:
141+
input_df.columns = ["dup" for _ in range(len(input_df.columns))]
142+
expected_df.columns = ["dup" for _ in range(len(expected_df.columns))]
143+
134144
out_df = transform_ord_cat_cols_to_coded_cols(input_df)
135145
assert list(out_df.columns) == list(expected_df.columns)
136146
for i, col in enumerate(out_df.columns):

pandas/tests/series/methods/test_cov_corr.py

Lines changed: 18 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -187,19 +187,19 @@ def test_corr_callable_method(self, datetime_series):
187187

188188
@pytest.mark.parametrize("method", ["kendall", "spearman"])
189189
@pytest.mark.parametrize(
190-
"ord_cat_series",
190+
"cat_series",
191191
[
192-
Series( # ordered categorical series
193-
pd.Categorical(
194-
["low", "med", "high", "very_high"],
195-
categories=["low", "med", "high", "very_high"],
192+
Series(
193+
pd.Categorical( # ordered cat series
194+
["low", "medium", "high"],
195+
categories=["low", "medium", "high"],
196196
ordered=True,
197197
)
198198
),
199-
Series( # ordered categorical series with nan and a different ranking
200-
pd.Categorical(
201-
["h", "low", "vh", None],
202-
categories=["low", "m", "h", "vh"],
199+
Series(
200+
pd.Categorical( # ordered cat series with NA
201+
["low", "medium", "high", None],
202+
categories=["low", "medium", "high"],
203203
ordered=True,
204204
)
205205
),
@@ -208,36 +208,23 @@ def test_corr_callable_method(self, datetime_series):
208208
@pytest.mark.parametrize(
209209
"other_series",
210210
[
211-
Series( # int series against which tord cat series is correlated
212-
[0, 1, 2, 3]
213-
),
214-
Series( # float series against which ord cat series is correlated
215-
[2.0, 3.0, 4.5, 6.5]
216-
),
217-
Series( # other ord cat series against which ord cat series is correlated
211+
Series( # other cat ordered series
218212
pd.Categorical(
219-
["high", "low", "very_high", "med"],
220-
categories=["low", "med", "high", "very_high"],
213+
["m", "l", "h"],
214+
categories=["l", "m", "h"],
221215
ordered=True,
222216
)
223217
),
218+
# other non-categorical series
219+
Series([2, 1, 3]),
224220
],
225221
)
226222
def test_corr_rank_ordered_categorical(
227223
self,
228224
method,
229-
ord_cat_series,
225+
cat_series,
230226
other_series,
231227
):
232-
stats = pytest.importorskip("scipy.stats")
233-
method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
234-
ord_ser_cat_codes = ord_cat_series.cat.codes.replace(-1, np.nan)
235-
236-
if other_series.dtype == "category" and other_series.cat.ordered:
237-
other_series = other_series.cat.codes.replace(-1, np.nan)
238-
239-
corr_calc = ord_cat_series.corr(other_series, method=method)
240-
corr_expected = method_scipy_func[method](
241-
ord_ser_cat_codes, other_series, nan_policy="omit"
242-
)[0]
243-
tm.assert_almost_equal(corr_calc, corr_expected)
228+
expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5}
229+
corr_calc = cat_series.corr(other_series, method=method)
230+
tm.assert_almost_equal(corr_calc, expected_corr[method])

0 commit comments

Comments
 (0)