Skip to content

Commit e997747

Browse files
clean up
1 parent 216475c commit e997747

File tree

3 files changed

+86
-68
lines changed

3 files changed

+86
-68
lines changed

pandas/core/frame.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12017,21 +12017,23 @@ def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame:
1201712017

1201812018
data = self.copy(deep=False)
1201912019
cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns.unique()
12020-
single_cols = [col for col in cols_convert if isinstance(data[col], Series)]
12021-
duplicated_cols = [
12020+
ser_generating_cols = [
12021+
col for col in cols_convert if isinstance(data[col], Series)
12022+
]
12023+
df_generating_cols = [
1202212024
col for col in cols_convert if isinstance(data[col], DataFrame)
1202312025
]
1202412026

12025-
if not single_cols and not duplicated_cols:
12027+
if not ser_generating_cols and not df_generating_cols:
1202612028
return self
1202712029

12028-
if single_cols:
12029-
data[single_cols] = data[single_cols].apply(
12030+
if ser_generating_cols:
12031+
data[ser_generating_cols] = data[ser_generating_cols].apply(
1203012032
lambda x: x.cat.codes.replace(-1, np.nan)
1203112033
)
1203212034

12033-
if duplicated_cols:
12034-
data[duplicated_cols] = data[duplicated_cols].apply(
12035+
for df_col in df_generating_cols:
12036+
data[df_col] = data[df_col].apply(
1203512037
lambda x: x.cat.codes.replace(-1, np.nan)
1203612038
if isinstance(x.dtype, CategoricalDtype) and bool(x.dtype.ordered)
1203712039
else x

pandas/tests/frame/methods/test_cov_corr.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,39 @@ def test_corr_rank_ordered_categorical(
294294
corr_expected = df[col1].corr(df[col2], method=method)
295295
tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)
296296

297+
@pytest.mark.parametrize("method", ["kendall", "spearman"])
298+
@td.skip_if_no("scipy")
299+
def test_corr_rank_ordered_categorical_duplicate_columns(
300+
self,
301+
method,
302+
):
303+
df = DataFrame(
304+
{
305+
"a": [1, 2, 3, 4],
306+
"b": [4, 3, 2, 1],
307+
"c": [4, 3, 2, 1],
308+
"d": [10, 20, 30, 40],
309+
"e": [100, 200, 300, 400],
310+
}
311+
)
312+
df["a"] = (
313+
df["a"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
314+
)
315+
df["b"] = (
316+
df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
317+
)
318+
df["c"] = (
319+
df["c"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
320+
)
321+
df.columns = ["a", "a", "c", "c", "e"]
322+
323+
corr_calc = df.corr(method=method)
324+
for col1_idx, col2_idx in combinations(range(len(df.columns)), r=2):
325+
corr_expected = df.iloc[:, col1_idx].corr(
326+
df.iloc[:, col2_idx], method=method
327+
)
328+
tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected)
329+
297330

298331
class TestDataFrameCorrWith:
299332
@pytest.mark.parametrize(

pandas/tests/series/methods/test_cov_corr.py

Lines changed: 44 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -186,75 +186,58 @@ def test_corr_callable_method(self, datetime_series):
186186
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)
187187

188188
@pytest.mark.parametrize("method", ["kendall", "spearman"])
189+
@pytest.mark.parametrize(
190+
"ord_cat_series",
191+
[
192+
Series( # ordered categorical series
193+
pd.Categorical(
194+
["low", "med", "high", "very_high"],
195+
categories=["low", "med", "high", "very_high"],
196+
ordered=True,
197+
)
198+
),
199+
Series( # ordered categorical series with nan and a different ranking
200+
pd.Categorical(
201+
["h", "low", "vh", None],
202+
categories=["low", "m", "h", "vh"],
203+
ordered=True,
204+
)
205+
),
206+
],
207+
)
208+
@pytest.mark.parametrize(
209+
"other_series",
210+
[
211+
Series( # int series against which ord cat series is correlated
212+
[0, 1, 2, 3]
213+
),
214+
Series( # float series against which ord cat series is correlated
215+
[2.0, 3.0, 4.5, 6.5]
216+
),
217+
Series( # other ord cat series against which ord cat series is correlated
218+
pd.Categorical(
219+
["high", "low", "very_high", "med"],
220+
categories=["low", "med", "high", "very_high"],
221+
ordered=True,
222+
)
223+
),
224+
],
225+
)
189226
def test_corr_rank_ordered_categorical(
190227
self,
191228
method,
229+
ord_cat_series,
230+
other_series,
192231
):
193232
stats = pytest.importorskip("scipy.stats")
194233
method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
195-
ser_ord_cat = Series(
196-
pd.Categorical(
197-
["low", "med", "high", "very_high"],
198-
categories=["low", "med", "high", "very_high"],
199-
ordered=True,
200-
)
201-
)
202-
ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan)
203-
ser_ord_int = Series([0, 1, 2, 3])
204-
ser_ord_float = Series([2.0, 3.0, 4.5, 6.5])
205-
206-
corr_calc = ser_ord_cat.corr(ser_ord_int, method=method)
207-
corr_expected = method_scipy_func[method](
208-
ser_ord_cat_codes, ser_ord_int, nan_policy="omit"
209-
)[0]
210-
tm.assert_almost_equal(corr_calc, corr_expected)
211-
212-
corr_calc = ser_ord_cat.corr(ser_ord_float, method=method)
213-
corr_expected = method_scipy_func[method](
214-
ser_ord_cat_codes, ser_ord_float, nan_policy="omit"
215-
)[0]
216-
tm.assert_almost_equal(corr_calc, corr_expected)
217-
218-
corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method)
219-
corr_expected = method_scipy_func[method](
220-
ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit"
221-
)[0]
222-
tm.assert_almost_equal(corr_calc, corr_expected)
223-
224-
ser_ord_cat_shuff = Series(
225-
pd.Categorical(
226-
["high", "low", "very_high", "med"],
227-
categories=["low", "med", "high", "very_high"],
228-
ordered=True,
229-
)
230-
)
231-
ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan)
232-
233-
corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method)
234-
corr_expected = method_scipy_func[method](
235-
ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit"
236-
)[0]
237-
tm.assert_almost_equal(corr_calc, corr_expected)
234+
ord_ser_cat_codes = ord_cat_series.cat.codes.replace(-1, np.nan)
238235

239-
corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method)
240-
corr_expected = method_scipy_func[method](
241-
ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit"
242-
)[0]
243-
tm.assert_almost_equal(corr_calc, corr_expected)
236+
if other_series.dtype == "category" and other_series.cat.ordered:
237+
other_series = other_series.cat.codes.replace(-1, np.nan)
244238

245-
ser_ord_cat_with_nan = Series(
246-
pd.Categorical(
247-
["h", "low", "vh", None, "m"],
248-
categories=["low", "m", "h", "vh"],
249-
ordered=True,
250-
)
251-
)
252-
ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace(
253-
-1, np.nan
254-
)
255-
ser_ord_int = Series([2, 0, 1, 3, None])
256-
corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method)
239+
corr_calc = ord_cat_series.corr(other_series, method=method)
257240
corr_expected = method_scipy_func[method](
258-
ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit"
241+
ord_ser_cat_codes, other_series, nan_policy="omit"
259242
)[0]
260243
tm.assert_almost_equal(corr_calc, corr_expected)

0 commit comments

Comments
 (0)