Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,87 @@
import numpy as np
import pytest
from pandas._libs import index as libindex
from pandas.errors import SettingWithCopyError
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
MultiIndex,
Series,
)
import pandas._testing as tm
def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write):
    """Check how an in-place ``fillna`` reached through chained indexing is reported.

    Inplace ops, originally from:
    https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug

    Parameters
    ----------
    using_copy_on_write, warn_copy_on_write
        Mode flags; each selects which warning or error the chained call
        is expected to produce.
    """
    column_values = {
        ("eyes", "left"): [12, 23],
        ("eyes", "right"): [123, None],
        ("ears", "left"): [1234, 2345],
        ("ears", "right"): [12345, 23456],
    }
    column_index = MultiIndex.from_tuples(list(column_values), names=["part", "side"])
    frame = DataFrame(column_values, index=["a", "b"], columns=column_index)

    if using_copy_on_write:
        with tm.raises_chained_assignment_error():
            frame["eyes"]["right"].fillna(value=555, inplace=True)
        return

    if warn_copy_on_write:
        with tm.assert_produces_warning(None):
            frame["eyes"]["right"].fillna(value=555, inplace=True)
        return

    # Neither flag set: SettingWithCopyError is raised and no warning is emitted.
    expected_msg = "A value is trying to be set on a copy of a slice from a DataFrame"
    with pytest.raises(SettingWithCopyError, match=expected_msg), tm.assert_produces_warning(
        None
    ):
        frame["eyes"]["right"].fillna(value=555, inplace=True)
@td.skip_array_manager_invalid_test  # with ArrayManager df.loc[0] is not a view
def test_cache_updating(using_copy_on_write, warn_copy_on_write):
    """Issue 5216: make sure we don't try to set a dead cache.

    A chained assignment is attempted first, then a direct ``.loc``
    assignment; the stored value is checked after each.
    """
    values = np.random.default_rng(2).random((10, 3))
    df = DataFrame(values, columns=["x", "y", "z"])
    df_original = df.copy()
    df.index = MultiIndex.from_tuples(
        [(outer, inner) for outer in range(5) for inner in range(2)]
    )

    # setting via chained assignment
    # but actually works, since everything is a view
    with tm.raises_chained_assignment_error():
        df.loc[0]["z"].iloc[0] = 1.0

    # Under copy-on-write the value stays as it was; otherwise the write landed.
    expected_after_chain = df_original.loc[0, "z"] if using_copy_on_write else 1
    assert df.loc[(0, 0), "z"] == expected_after_chain

    # correct setting
    df.loc[(0, 0), "z"] = 2
    assert df.loc[(0, 0), "z"] == 2
def test_indexer_caching(monkeypatch):
    """GH5727: boolean-mask ``__setitem__`` on a MultiIndex-backed Series.

    ``libindex._SIZE_CUTOFF`` is lowered to 20 so that a 20-row index
    reaches the cutoff. Every element is then set through a boolean mask
    and the result is compared with a Series of ones.

    Parameters
    ----------
    monkeypatch
        pytest's ``monkeypatch`` fixture.
    """
    # make sure that indexers are in the _internal_names_set
    size_cutoff = 20

    # Patch through the MonkeyPatch yielded by ``context()`` so the override
    # is undone when the block exits. Patching through the outer fixture
    # inside the block would leave the context manager with nothing to undo.
    with monkeypatch.context() as patch:
        patch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
        index = MultiIndex.from_arrays([np.arange(size_cutoff), np.arange(size_cutoff)])
        s = Series(np.zeros(size_cutoff), index=index)

        # setitem
        s[s == 0] = 1

        # Compare while the patch is still active, as it was in the original.
        expected = Series(np.ones(size_cutoff), index=index)
        tm.assert_series_equal(s, expected)

View File

@@ -0,0 +1,50 @@
from datetime import datetime
import numpy as np
from pandas import (
DataFrame,
Index,
MultiIndex,
Period,
Series,
period_range,
to_datetime,
)
import pandas._testing as tm
def test_multiindex_period_datetime():
    """GH4861: using datetime in period of multiindex raises exception.

    Looks up the first row with a ``Period`` key and then with the
    equivalent ``datetime`` key; both must return the same scalar.
    """
    labels = Index(["a", "a", "a", "b", "b"])
    periods = period_range("2012-01", periods=len(labels), freq="M")
    values = np.random.default_rng(2).standard_normal(len(labels))
    ser = Series(values, [labels, periods])

    first_value = ser.iloc[0]

    # try Period as index, then datetime as index
    for second_level_key in (Period("2012-01"), datetime(2012, 1, 1)):
        assert ser.loc["a", second_level_key] == first_value
def test_multiindex_datetime_columns():
    """GH35015: using datetime as column indices raises exception.

    An empty frame whose columns come from ``MultiIndex.from_tuples`` must
    equal one whose columns come from ``MultiIndex.from_arrays``.
    """
    first, second = to_datetime("02/29/2020"), to_datetime("03/01/2020")

    from_tuples = MultiIndex.from_tuples([(first, second)], names=["a", "b"])
    from_arrays = MultiIndex.from_arrays([[first], [second]], names=["a", "b"])

    result = DataFrame([], columns=from_tuples)
    expected = DataFrame([], columns=from_arrays)
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,410 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
from pandas.core.indexing import IndexingError
# ----------------------------------------------------------------------------
# test indexing of Series with multi-level Index
# ----------------------------------------------------------------------------
@pytest.mark.parametrize(
    "access_method",
    [
        lambda s, x: s[:, x],
        lambda s, x: s.loc[:, x],
        lambda s, x: s.xs(x, level=1),
    ],
)
@pytest.mark.parametrize(
    "level1_value, expected",
    [
        (0, Series([1], index=[0])),
        (1, Series([2, 3], index=[1, 2])),
    ],
)
def test_series_getitem_multiindex(access_method, level1_value, expected):
    """GH 6018: series regression getitem with a multi-index.

    Selecting on the second level through ``[]``, ``.loc`` or ``.xs`` must
    return the matching rows keyed by the first level ("A").
    """
    index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)], names=["A", "B"])
    ser = Series([1, 2, 3], index=index)

    # The parametrized expectation is built with an unnamed index.
    expected.index.name = "A"

    tm.assert_series_equal(access_method(ser, level1_value), expected)
@pytest.mark.parametrize("level0_value", ["D", "A"])
def test_series_getitem_duplicates_multiindex(level0_value):
    """GH 5725: missing first-level keys raise ``KeyError``.

    Per the original note, 'A' happens to be a valid Timestamp, so the
    appropriate error was not raised for it (only in PY3).
    """
    mi = MultiIndex(
        levels=[[level0_value, "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]],
        codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
        names=["tag", "day"],
    )
    values = np.random.default_rng(2).standard_normal((len(mi), 1))
    df = DataFrame(values, index=mi, columns=["val"])

    # confirm indexing on missing value raises KeyError;
    # "A" is only missing when it is not the parametrized first-level label
    missing = [("X", r"^'X'$")]
    if level0_value != "A":
        missing.insert(0, ("A", r"^'A'$"))
    for key, pattern in missing:
        with pytest.raises(KeyError, match=pattern):
            df.val[key]

    expected = Series(
        values[:3, 0], name="val", index=Index([26, 37, 57], name="day")
    )
    tm.assert_series_equal(df.val[level0_value], expected)
def test_series_getitem(multiindex_year_month_day_dataframe_random_data, indexer_sl):
s = multiindex_year_month_day_dataframe_random_data["A"]
expected = s.reindex(s.index[42:65])
expected.index = expected.index.droplevel(0).droplevel(0)
result = indexer_sl(s)[2000, 3]
tm.assert_series_equal(result, expected)
def test_series_getitem_returns_scalar(
multiindex_year_month_day_dataframe_random_data, indexer_sl
):
s = multiindex_year_month_day_dataframe_random_data["A"]
expected = s.iloc[49]
result = indexer_sl(s)[2000, 3, 10]
assert result == expected
@pytest.mark.parametrize(
    "indexer,expected_error,expected_error_msg",
    [
        # Missing three-level key via __getitem__, [] and .loc.
        (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^\(2000, 3, 4\)$"),
        (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
        (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
        # Four indexers given to .loc.
        (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"),
        # Integer key equal to len(s).
        (lambda s: s.__getitem__(len(s)), KeyError, ""),  # match should include len(s)
        (lambda s: s[len(s)], KeyError, ""),  # match should include len(s)
        (
            lambda s: s.iloc[len(s)],
            IndexError,
            "single positional indexer is out-of-bounds",
        ),
    ],
)
def test_series_getitem_indexing_errors(
    multiindex_year_month_day_dataframe_random_data,
    indexer,
    expected_error,
    expected_error_msg,
):
    """Each invalid lookup on column "A" raises the expected error and message."""
    ser = multiindex_year_month_day_dataframe_random_data["A"]

    with pytest.raises(expected_error, match=expected_error_msg):
        indexer(ser)
def test_series_getitem_corner_generator(
multiindex_year_month_day_dataframe_random_data,
):
s = multiindex_year_month_day_dataframe_random_data["A"]
result = s[(x > 0 for x in s)]
expected = s[s > 0]
tm.assert_series_equal(result, expected)
# ----------------------------------------------------------------------------
# test indexing of DataFrame with multi-level Index
# ----------------------------------------------------------------------------
def test_getitem_simple(multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data.T
expected = df.values[:, 0]
result = df["foo", "one"].values
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize(
    "indexer,expected_error_msg",
    [
        (lambda df: df[("foo", "four")], r"^\('foo', 'four'\)$"),
        (lambda df: df["foobar"], r"^'foobar'$"),
    ],
)
def test_frame_getitem_simple_key_error(
    multiindex_dataframe_random_data, indexer, expected_error_msg
):
    """A column key absent from the transposed frame raises ``KeyError`` naming that key."""
    transposed = multiindex_dataframe_random_data.T

    with pytest.raises(KeyError, match=expected_error_msg):
        indexer(transposed)
def test_tuple_string_column_names():
    """GH#50372: select a tuple label and a string label together from flat columns."""
    columns = MultiIndex.from_tuples(
        [("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")]
    )
    df = DataFrame([range(start, start + 4) for start in range(3)], columns=columns)
    df["single_index"] = 0

    # Flatten the MultiIndex columns into a plain Index of tuples.
    flat = df.copy()
    flat.columns = flat.columns.to_flat_index()
    flat["new_single_index"] = 0

    wanted = [("a", "aa"), "new_single_index"]
    result = flat[wanted]

    expected = DataFrame(
        [[0, 0], [1, 0], [2, 0]], columns=Index([("a", "aa"), "new_single_index"])
    )
    tm.assert_frame_equal(result, expected)
def test_frame_getitem_multicolumn_empty_level():
    """Selecting a first-level label whose second level is "" yields the third-level column."""
    df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]})

    level1 = ["level1 item1", "level1 item2"]
    level2 = ["", "level2 item2"]
    level3 = ["level3 item1", "level3 item2"]
    df.columns = [level1, level2, level3]

    result = df["level1 item1"]

    expected = DataFrame(
        [[value] for value in ["1", "2", "3"]],
        index=df.index,
        columns=["level3 item1"],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "indexer,expected_slice",
    [
        (lambda df: df["foo"], slice(3)),
        (lambda df: df["bar"], slice(3, 5)),
        (lambda df: df.loc[:, "bar"], slice(3, 5)),
    ],
)
def test_frame_getitem_toplevel(
    multiindex_dataframe_random_data, indexer, expected_slice
):
    """Selecting a top-level column label returns that column slice without the top level."""
    transposed = multiindex_dataframe_random_data.T

    expected = transposed.reindex(columns=transposed.columns[expected_slice])
    expected.columns = expected.columns.droplevel(0)

    tm.assert_frame_equal(indexer(transposed), expected)
def test_frame_mixed_depth_get():
    """A partial column key padded with "" levels returns the same Series as the full key."""
    level_values = [
        ["a", "top", "top", "routine1", "routine1", "routine2"],
        ["", "OD", "OD", "result1", "result2", "result1"],
        ["", "wx", "wy", "", "", ""],
    ]
    columns = MultiIndex.from_tuples(sorted(zip(*level_values)))
    df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=columns)

    # (partial key, equivalent fully padded key); the result is named after the partial key.
    cases = [
        ("a", ("a", "", "")),
        (("routine1", "result1"), ("routine1", "result1", "")),
    ]
    for partial_key, full_key in cases:
        result = df[partial_key]
        expected = df[full_key].rename(partial_key)
        tm.assert_series_equal(result, expected)
def test_frame_getitem_nan_multiindex(nulls_fixture):
# GH#29751
# loc on a multiindex containing nan values
n = nulls_fixture # for code readability
cols = ["a", "b", "c"]
df = DataFrame(
[[11, n, 13], [21, n, 23], [31, n, 33], [41, n, 43]],
columns=cols,
).set_index(["a", "b"])
df["c"] = df["c"].astype("int64")
idx = (21, n)
result = df.loc[:idx]
expected = DataFrame([[11, n, 13], [21, n, 23]], columns=cols).set_index(["a", "b"])
expected["c"] = expected["c"].astype("int64")
tm.assert_frame_equal(result, expected)
result = df.loc[idx:]
expected = DataFrame(
[[21, n, 23], [31, n, 33], [41, n, 43]], columns=cols
).set_index(["a", "b"])
expected["c"] = expected["c"].astype("int64")
tm.assert_frame_equal(result, expected)
idx1, idx2 = (21, n), (31, n)
result = df.loc[idx1:idx2]
expected = DataFrame([[21, n, 23], [31, n, 33]], columns=cols).set_index(["a", "b"])
expected["c"] = expected["c"].astype("int64")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "indexer,expected",
    [
        (
            (["b"], ["bar", np.nan]),
            DataFrame(
                [[2, 3], [5, 6]],
                columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]),
                dtype="int64",
            ),
        ),
        (
            ["a", "b"],
            DataFrame(
                [[1, 2, 3], [4, 5, 6]],
                columns=MultiIndex.from_tuples(
                    [("a", "foo"), ("b", "bar"), ("b", np.nan)]
                ),
                dtype="int64",
            ),
        ),
        (
            ["b"],
            DataFrame(
                [[2, 3], [5, 6]],
                columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]),
                dtype="int64",
            ),
        ),
        (
            (["b"], ["bar"]),
            DataFrame(
                [[2], [5]],
                columns=MultiIndex.from_tuples([("b", "bar")]),
                dtype="int64",
            ),
        ),
        (
            (["b"], [np.nan]),
            DataFrame(
                [[3], [6]],
                columns=MultiIndex(
                    codes=[[1], [-1]], levels=[["a", "b"], ["bar", "foo"]]
                ),
                dtype="int64",
            ),
        ),
        (("b", np.nan), Series([3, 6], dtype="int64", name=("b", np.nan))),
    ],
)
def test_frame_getitem_nan_cols_multiindex(
    indexer,
    expected,
    nulls_fixture,
):
    """Column selection on a MultiIndex whose second level contains a null.

    Slicing MultiIndex including levels with nan values, for more
    information see GH#25154.
    """
    columns = MultiIndex.from_tuples(
        [("a", "foo"), ("b", "bar"), ("b", nulls_fixture)]
    )
    df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=columns, dtype="int64")

    tm.assert_equal(df.loc[:, indexer], expected)
# ----------------------------------------------------------------------------
# test indexing of DataFrame with multi-level Index with duplicates
# ----------------------------------------------------------------------------
@pytest.fixture
def dataframe_with_duplicate_index():
    """Fixture for DataFrame used in tests for gh-4145 and gh-4146"""
    row_labels = ["h1", "h3", "h5"]
    rows = [
        ["a", "d", "e", "c", "f", "b"],
        [1, 4, 5, 3, 6, 2],
        [1, 4, 5, 3, 6, 2],
    ]
    # ("A", "B2") appears twice, so the column index has a duplicate.
    column_index = MultiIndex(
        levels=[["A", "B"], ["A1", "A2", "B1", "B2"]],
        codes=[[0, 0, 0, 1, 1, 1], [0, 3, 3, 0, 1, 2]],
        names=["main", "sub"],
    )
    return DataFrame(rows, index=row_labels, columns=column_index)
@pytest.mark.parametrize(
    "indexer",
    [
        lambda df: df[("A", "A1")],
        lambda df: df.loc[:, ("A", "A1")],
    ],
)
def test_frame_mi_access(dataframe_with_duplicate_index, indexer):
    """GH 4145: selecting ("A", "A1") returns a one-column frame that keeps both column levels."""
    row_labels = Index(["h1", "h3", "h5"])
    column_labels = MultiIndex.from_tuples([("A", "A1")], names=["main", "sub"])
    # Built transposed, then flipped, as in the original expectation.
    expected = DataFrame([["a", 1, 1]], index=column_labels, columns=row_labels).T

    result = indexer(dataframe_with_duplicate_index)

    tm.assert_frame_equal(result, expected)
def test_frame_mi_access_returns_series(dataframe_with_duplicate_index):
# GH 4146, not returning a block manager when selecting a unique index
# from a duplicate index
# as of 4879, this returns a Series (which is similar to what happens
# with a non-unique)
df = dataframe_with_duplicate_index
expected = Series(["a", 1, 1], index=["h1", "h3", "h5"], name="A1")
result = df["A"]["A1"]
tm.assert_series_equal(result, expected)
def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index):
# selecting a non_unique from the 2nd level
df = dataframe_with_duplicate_index
expected = DataFrame(
[["d", 4, 4], ["e", 5, 5]],
index=Index(["B2", "B2"], name="sub"),
columns=["h1", "h3", "h5"],
).T
result = df["A"]["B2"]
tm.assert_frame_equal(result, expected)
def test_frame_mi_empty_slice():
    """GH 15454: selecting an empty list of columns keeps the levels and has no codes."""
    columns = MultiIndex.from_product([[1], [2]])
    df = DataFrame(0, index=range(2), columns=columns)

    result = df[[]]

    empty_columns = MultiIndex(levels=[[1], [2]], codes=[[], []])
    expected = DataFrame(index=[0, 1], columns=empty_columns)
    tm.assert_frame_equal(result, expected)
def test_loc_empty_multiindex():
    """GH#36936: ``.loc`` with an empty MultiIndex selects nothing and sets nothing."""
    index = MultiIndex.from_arrays(
        [["a", "a", "b", "a"], ["a", "a", "b", "b"]], names=("idx1", "idx2")
    )
    df = DataFrame([1, 2, 3, 4], index=index, columns=["value"])

    # loc on empty multiindex == loc with False mask
    no_match = df.loc[:, "value"] == 0
    empty_multiindex = df.loc[no_match, :].index
    all_false = [False] * len(df.index)
    tm.assert_frame_equal(df.loc[empty_multiindex, :], df.loc[all_false, :])

    # replacing value with loc on empty multiindex
    df.loc[df.loc[df.loc[:, "value"] == 0].index, "value"] = 5
    unchanged = DataFrame([1, 2, 3, 4], index=index, columns=["value"])
    tm.assert_frame_equal(df, unchanged)

View File

@@ -0,0 +1,171 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
MultiIndex,
Series,
)
import pandas._testing as tm
@pytest.fixture
def simple_multiindex_dataframe():
    """
    Return a 3 x 3 DataFrame of seeded random data whose rows and
    columns are both two-level MultiIndexes.
    """
    row_levels = [[4, 4, 8], [8, 10, 12]]
    column_levels = [[2, 2, 4], [6, 8, 10]]
    values = np.random.default_rng(2).standard_normal((3, 3))
    return DataFrame(values, columns=column_levels, index=row_levels)
@pytest.mark.parametrize(
    "indexer, expected",
    [
        # first row
        (
            lambda df: df.iloc[0],
            lambda arr: Series(arr[0], index=[[2, 2, 4], [6, 8, 10]], name=(4, 8)),
        ),
        # last row
        (
            lambda df: df.iloc[2],
            lambda arr: Series(arr[2], index=[[2, 2, 4], [6, 8, 10]], name=(8, 12)),
        ),
        # last column
        (
            lambda df: df.iloc[:, 2],
            lambda arr: Series(arr[:, 2], index=[[4, 4, 8], [8, 10, 12]], name=(4, 10)),
        ),
    ],
)
def test_iloc_returns_series(indexer, expected, simple_multiindex_dataframe):
    """A single positional row or column comes back as a Series named by its label tuple."""
    frame = simple_multiindex_dataframe
    raw_values = frame.values

    result = indexer(frame)

    tm.assert_series_equal(result, expected(raw_values))
def test_iloc_returns_dataframe(simple_multiindex_dataframe):
df = simple_multiindex_dataframe
result = df.iloc[[0, 1]]
expected = df.xs(4, drop_level=False)
tm.assert_frame_equal(result, expected)
def test_iloc_returns_scalar(simple_multiindex_dataframe):
df = simple_multiindex_dataframe
arr = df.values
result = df.iloc[2, 2]
expected = arr[2, 2]
assert result == expected
def test_iloc_getitem_multiple_items():
    """GH 5528: rows 2 and 3 by position equal the rows under first-level label "b"."""
    label_pairs = zip(["a", "a", "b", "b"], ["x", "y", "x", "y"])
    row_index = MultiIndex.from_tuples(label_pairs)
    values = np.random.default_rng(2).standard_normal((4, 4))
    df = DataFrame(values, index=row_index)

    tm.assert_frame_equal(df.iloc[[2, 3]], df.xs("b", drop_level=False))
def test_iloc_getitem_labels():
    """Positional scalar access on a frame with repeated labels on both axes.

    This is basically regular indexing.
    """
    values = np.random.default_rng(2).standard_normal((4, 3))
    row_levels = [["i", "i", "j", "k"], ["X", "X", "Y", "Y"]]
    column_levels = [["i", "i", "j"], ["A", "A", "B"]]
    df = DataFrame(values, columns=column_levels, index=row_levels)

    assert df.iloc[2, 2] == values[2, 2]
def test_frame_getitem_slice(multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data
result = df.iloc[:4]
expected = df[:4]
tm.assert_frame_equal(result, expected)
def test_frame_setitem_slice(multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data
df.iloc[:4] = 0
assert (df.values[:4] == 0).all()
assert (df.values[4:] != 0).all()
def test_indexing_ambiguity_bug_1678():
    """GH 1678: positional column 1 is the ("Ohio", "Red") column."""
    column_index = MultiIndex.from_tuples(
        [("Ohio", "Green"), ("Ohio", "Red"), ("Colorado", "Green")]
    )
    row_index = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
    values = np.arange(12).reshape((4, 3))
    df = DataFrame(values, index=row_index, columns=column_index)

    by_position = df.iloc[:, 1]
    by_label = df.loc[:, ("Ohio", "Red")]

    tm.assert_series_equal(by_position, by_label)
def test_iloc_integer_locations():
    """GH 13797: scalar ``iloc`` reads rebuild the data even with duplicate index entries."""
    cells = [
        ["str00", "str01"],
        ["str10", "str11"],
        ["str20", "srt21"],
        ["str30", "str31"],
        ["str40", "str41"],
    ]
    # ("CC", "B") appears twice in the row index.
    row_index = MultiIndex.from_tuples(
        [("CC", "A"), ("CC", "B"), ("CC", "B"), ("BB", "a"), ("BB", "b")]
    )
    expected = DataFrame(cells)
    df = DataFrame(cells, index=row_index)

    rebuilt_rows = [[df.iloc[row, col] for col in range(2)] for row in range(5)]

    tm.assert_frame_equal(DataFrame(rebuilt_rows), expected)
@pytest.mark.parametrize(
    "data, indexes, values, expected_k",
    [
        # test without indexer value in first level of MultiIndex
        ([[2, 22, 5], [2, 33, 6]], [0, -1, 1], [2, 3, 1], [7, 10]),
        # test like code sample 1 in the issue
        ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100], [755, 1066]),
        # test like code sample 2 in the issue
        ([[1, 3, 7], [2, 4, 8]], [0, -1, 1], [10, 10, 1000], [17, 1018]),
        # test like code sample 3 in the issue
        ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10], [8, 15, 13]),
    ],
)
def test_iloc_setitem_int_multiindex_series(data, indexes, values, expected_k):
    """GH17148: in-place ``iloc`` increments on a Series with an integer MultiIndex.

    Each ``values`` entry is added at the matching position in ``indexes``;
    the result must equal column "k" replaced by ``expected_k``.
    """
    frame = DataFrame(data=data, columns=["i", "j", "k"]).set_index(["i", "j"])

    ser = frame.k.copy()
    for position, increment in zip(indexes, values):
        ser.iloc[position] += increment

    frame["k"] = expected_k
    tm.assert_series_equal(ser, frame.k)
def test_getitem_iloc(multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data
result = df.iloc[2]
expected = df.xs(df.index[2])
tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,118 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
@pytest.fixture
def m():
    """Return 5; used by ``keys`` as the number of random keys and as the stride divisor."""
    return 5
@pytest.fixture
def n():
    """Return 100; used by ``vals`` as the number of generated rows."""
    return 100
@pytest.fixture
def cols():
    """Return the five column names used by the frames in this module."""
    column_names = ["jim", "joe", "jolie", "joline", "jolia"]
    return column_names
@pytest.fixture
def vals(n):
    """Return ``n`` row tuples with five fields drawn from seeded generators.

    Each field uses its own ``default_rng(2)`` generator: integers in
    [0, 10), letters a-j, dates from a 10-day range, letters Z-Q, and
    standard-normal floats.
    """
    dates = pd.date_range("20141009", periods=10).tolist()
    columns = [
        np.random.default_rng(2).integers(0, 10, n),
        np.random.default_rng(2).choice(list("abcdefghij"), n),
        np.random.default_rng(2).choice(dates, n),
        np.random.default_rng(2).choice(list("ZYXWVUTSRQ"), n),
        np.random.default_rng(2).standard_normal(n),
    ]
    # Transpose the per-column arrays into per-row tuples.
    return [tuple(row) for row in zip(*columns)]
@pytest.fixture
def keys(n, m, vals):
    """Return a bunch of keys for testing.

    The first ``m`` keys are 4-tuples drawn from seeded generators over
    ranges one element wider than those in ``vals``. The remainder are
    rows of ``vals`` at stride ``n // m`` with the last field removed.
    """
    dates = pd.date_range("20141009", periods=11).tolist()
    columns = [
        np.random.default_rng(2).integers(0, 11, m),
        np.random.default_rng(2).choice(list("abcdefghijk"), m),
        np.random.default_rng(2).choice(dates, m),
        np.random.default_rng(2).choice(list("ZYXWVUTSRQP"), m),
    ]
    random_keys = [tuple(row) for row in zip(*columns)]
    sampled_keys = [row[:-1] for row in vals[:: n // m]]
    return random_keys + sampled_keys
# covers both unique index and non-unique index
@pytest.fixture
def df(vals, cols):
    """Return a DataFrame with the rows from ``vals`` under the column names in ``cols``."""
    frame = DataFrame(vals, columns=cols)
    return frame
@pytest.fixture
def a(df):
    """Return ``df`` concatenated with itself, so every row appears twice."""
    doubled = pd.concat([df, df])
    return doubled
@pytest.fixture
def b(df, cols):
    """Return ``df`` with rows duplicated on all but the last column removed."""
    key_columns = cols[:-1]
    return df.drop_duplicates(subset=key_columns)
@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
@pytest.mark.parametrize("lexsort_depth", list(range(5)))
@pytest.mark.parametrize("frame_fixture", ["a", "b"])
def test_multiindex_get_loc(request, lexsort_depth, keys, frame_fixture, cols):
    """GH7724, GH2646: ``.loc`` and ``in`` on a MultiIndex agree with boolean masking.

    For every key and every leading prefix of it, a row mask is built by
    comparing columns one at a time. Membership in the index and the result
    of ``mi.loc[prefix]`` are checked against that mask.
    """
    source = request.getfixturevalue(frame_fixture)
    if lexsort_depth == 0:
        df = source.copy(deep=False)
    else:
        df = source.sort_values(by=cols[:lexsort_depth])

    mi = df.set_index(cols[:-1])
    assert mi.index._lexsort_depth >= lexsort_depth

    for key in keys:
        mask = np.ones(len(df), dtype=bool)

        # test for all partials of this key
        for depth, label in enumerate(key, start=1):
            mask &= df.iloc[:, depth - 1] == label
            prefix = key[:depth]

            if not mask.any():
                assert prefix not in mi.index
                continue

            assert prefix in mi.index
            right = df[mask].copy(deep=False)

            if depth != len(key):  # partial key
                ret = right.drop(cols[:depth], axis=1, inplace=True)
                assert ret is None
                ret = right.set_index(cols[depth:-1], inplace=True)
                assert ret is None
                tm.assert_frame_equal(mi.loc[prefix], right)
                continue

            # full key
            ret = right.set_index(cols[:-1], inplace=True)
            assert ret is None
            if len(right) == 1:  # single hit
                right = Series(
                    right["jolia"].values, name=right.index[0], index=["jolia"]
                )
                tm.assert_series_equal(mi.loc[prefix], right)
            else:  # multi hit
                tm.assert_frame_equal(mi.loc[prefix], right)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,235 @@
import numpy as np
import pytest
import pandas._libs.index as libindex
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
CategoricalDtype,
DataFrame,
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
from pandas.core.arrays.boolean import BooleanDtype
class TestMultiIndexBasic:
    """Basic indexing, construction and alignment checks for MultiIndex-backed objects."""

    def test_multiindex_perf_warn(self):
        """Both ``.loc`` lookups below must emit ``PerformanceWarning``."""
        df = DataFrame(
            {
                "jim": [0, 0, 1, 1],
                "joe": ["x", "x", "z", "y"],
                "jolie": np.random.default_rng(2).random(4),
            }
        ).set_index(["jim", "joe"])

        with tm.assert_produces_warning(PerformanceWarning):
            df.loc[(1, "z")]

        # Reorder the rows, then look up a one-element key.
        df = df.iloc[[2, 1, 3, 0]]
        with tm.assert_produces_warning(PerformanceWarning):
            df.loc[(0,)]

    @pytest.mark.parametrize("offset", [-5, 5])
    def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset):
        """Tuple lookups work for index sizes just below and just above the patched cutoff."""
        size_cutoff = 20
        n = size_cutoff + offset

        # Patch through the MonkeyPatch yielded by ``context()`` so the
        # override is undone when the block exits. Patching through the outer
        # fixture would leave the context manager with nothing to undo.
        with monkeypatch.context() as patch:
            patch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
            s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))

            # hai it works!
            assert s[("a", 5)] == 5
            assert s[("a", 6)] == 6
            assert s[("a", 7)] == 7

    def test_multi_nan_indexing(self):
        """GH 3588: ``set_index`` with ``drop=False`` on columns, one of which contains NaN."""
        df = DataFrame(
            {
                "a": ["R1", "R2", np.nan, "R4"],
                "b": ["C1", "C2", "C3", "C4"],
                "c": [10, 15, np.nan, 20],
            }
        )
        result = df.set_index(["a", "b"], drop=False)
        expected = DataFrame(
            {
                "a": ["R1", "R2", np.nan, "R4"],
                "b": ["C1", "C2", "C3", "C4"],
                "c": [10, 15, np.nan, 20],
            },
            index=[
                Index(["R1", "R2", np.nan, "R4"], name="a"),
                Index(["C1", "C2", "C3", "C4"], name="b"),
            ],
        )
        tm.assert_frame_equal(result, expected)

    def test_exclusive_nat_column_indexing(self):
        """GH 38025: multi indexing when one column exclusively contains NaT values."""
        df = DataFrame(
            {
                "a": [pd.NaT, pd.NaT, pd.NaT, pd.NaT],
                "b": ["C1", "C2", "C3", "C4"],
                "c": [10, 15, np.nan, 20],
            }
        )
        df = df.set_index(["a", "b"])
        expected = DataFrame(
            {
                "c": [10, 15, np.nan, 20],
            },
            index=[
                Index([pd.NaT, pd.NaT, pd.NaT, pd.NaT], name="a"),
                Index(["C1", "C2", "C3", "C4"], name="b"),
            ],
        )
        tm.assert_frame_equal(df, expected)

    def test_nested_tuples_duplicates(self):
        """GH#30892: setting via a tuple key or a list of tuples hits every duplicate row."""
        dti = pd.to_datetime(["20190101", "20190101", "20190102"])
        idx = Index(["a", "a", "c"])
        mi = MultiIndex.from_arrays([dti, idx], names=["index1", "index2"])

        df = DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi)

        expected = DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi)

        # Key given as a bare tuple.
        df2 = df.copy(deep=True)
        df2.loc[(dti[0], "a"), "c2"] = 1.0
        tm.assert_frame_equal(df2, expected)

        # Key given as a one-element list of tuples.
        df3 = df.copy(deep=True)
        df3.loc[[(dti[0], "a")], "c2"] = 1.0
        tm.assert_frame_equal(df3, expected)

    def test_multiindex_with_datatime_level_preserves_freq(self):
        """Selecting one first-level label keeps the datetime level's ``freq``.

        https://github.com/pandas-dev/pandas/issues/35563
        """
        idx = Index(range(2), name="A")
        dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
        mi = MultiIndex.from_product([idx, dti])
        df = DataFrame(np.random.default_rng(2).standard_normal((14, 2)), index=mi)
        result = df.loc[0].index
        tm.assert_index_equal(result, dti)
        assert result.freq == dti.freq

    def test_multiindex_complex(self):
        """GH#42145: ``set_index`` on a complex-valued column and an integer column."""
        complex_data = [1 + 2j, 4 - 3j, 10 - 1j]
        non_complex_data = [3, 4, 5]
        result = DataFrame(
            {
                "x": complex_data,
                "y": non_complex_data,
                "z": non_complex_data,
            }
        )
        result.set_index(["x", "y"], inplace=True)
        expected = DataFrame(
            {"z": non_complex_data},
            index=MultiIndex.from_arrays(
                [complex_data, non_complex_data],
                names=("x", "y"),
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_rename_multiindex_with_duplicates(self):
        """GH 38015: renaming a level-0 label on a MultiIndex that has duplicate entries."""
        mi = MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")])
        df = DataFrame(index=mi)
        df = df.rename(index={"A": "Apple"}, level=0)

        mi2 = MultiIndex.from_tuples([("Apple", "cat"), ("B", "cat"), ("B", "cat")])
        expected = DataFrame(index=mi2)
        tm.assert_frame_equal(df, expected)

    def test_series_align_multiindex_with_nan_overlap_only(self):
        """GH 38439: align two Series whose indexes share only the all-NaN entry."""
        mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
        mi2 = MultiIndex.from_arrays([[np.nan, 82.0], [np.nan, np.nan]])
        ser1 = Series([1, 2], index=mi1)
        ser2 = Series([1, 2], index=mi2)
        result1, result2 = ser1.align(ser2)

        mi = MultiIndex.from_arrays([[81.0, 82.0, np.nan], [np.nan, np.nan, np.nan]])
        expected1 = Series([1.0, np.nan, 2.0], index=mi)
        expected2 = Series([np.nan, 2.0, 1.0], index=mi)

        tm.assert_series_equal(result1, expected1)
        tm.assert_series_equal(result2, expected2)

    def test_series_align_multiindex_with_nan(self):
        """GH 38439: align two Series whose indexes hold the same entries in a different order."""
        mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
        mi2 = MultiIndex.from_arrays([[np.nan, 81.0], [np.nan, np.nan]])
        ser1 = Series([1, 2], index=mi1)
        ser2 = Series([1, 2], index=mi2)
        result1, result2 = ser1.align(ser2)

        mi = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
        expected1 = Series([1, 2], index=mi)
        expected2 = Series([2, 1], index=mi)

        tm.assert_series_equal(result1, expected1)
        tm.assert_series_equal(result2, expected2)

    def test_nunique_smoke(self):
        """GH 34019: ``nunique`` on an index made of two identical rows is 1."""
        n = DataFrame([[1, 2], [1, 2]]).set_index([0, 1]).index.nunique()
        assert n == 1

    def test_multiindex_repeated_keys(self):
        """GH19414: ``.loc`` with repeated labels returns one row per requested label."""
        tm.assert_series_equal(
            Series([1, 2], MultiIndex.from_arrays([["a", "b"]])).loc[
                ["a", "a", "b", "b"]
            ],
            Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])),
        )

    def test_multiindex_with_na_missing_key(self):
        """GH46173: a missing key raises ``KeyError`` even when a column key is ``(None,)``."""
        df = DataFrame.from_dict(
            {
                ("foo",): [1, 2, 3],
                ("bar",): [5, 6, 7],
                (None,): [8, 9, 0],
            }
        )
        with pytest.raises(KeyError, match="missing_key"):
            df[[("missing_key",)]]

    def test_multiindex_dtype_preservation(self):
        """GH51261: extension dtypes survive selection from MultiIndex columns."""
        columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"])
        df = DataFrame(["value"], columns=columns).astype("category")
        df_no_multiindex = df["A"]
        assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype)

        # geopandas 1763 analogue
        df = DataFrame(
            [[1, 0], [0, 1]],
            columns=[
                ["foo", "foo"],
                ["location", "location"],
                ["x", "y"],
            ],
        ).assign(bools=Series([True, False], dtype="boolean"))
        assert isinstance(df["bools"].dtype, BooleanDtype)

    def test_multiindex_from_tuples_with_nan(self):
        """GH#23578: a bare NaN entry in ``from_tuples`` becomes an all-NaN tuple."""
        result = MultiIndex.from_tuples([("a", "b", "c"), np.nan, ("d", "", "")])
        expected = MultiIndex.from_tuples(
            [("a", "b", "c"), (np.nan, np.nan, np.nan), ("d", "", "")]
        )
        tm.assert_index_equal(result, expected)

View File

@@ -0,0 +1,269 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
DatetimeIndex,
MultiIndex,
date_range,
)
import pandas._testing as tm
class TestMultiIndexPartial:
def test_getitem_partial_int(self):
    """GH 12416: select on an integer first level of MultiIndex columns."""
    first_level = [10, 20]
    second_level = ["a", "b"]
    columns = MultiIndex.from_product([first_level, second_level])
    df = DataFrame(index=range(2), columns=columns)

    # with single item
    tm.assert_frame_equal(df[20], DataFrame(index=range(2), columns=second_level))

    # with list
    kept_columns = MultiIndex.from_product([first_level[1:], second_level])
    tm.assert_frame_equal(df[[20]], DataFrame(index=range(2), columns=kept_columns))

    # missing item:
    with pytest.raises(KeyError, match="1"):
        df[1]
    with pytest.raises(KeyError, match=r"'\[1\] not in index'"):
        df[[1]]
def test_series_slice_partial(self):
pass
def test_xs_partial(
self,
multiindex_dataframe_random_data,
multiindex_year_month_day_dataframe_random_data,
):
frame = multiindex_dataframe_random_data
ymd = multiindex_year_month_day_dataframe_random_data
result = frame.xs("foo")
result2 = frame.loc["foo"]
expected = frame.T["foo"].T
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, result2)
result = ymd.xs((2000, 4))
expected = ymd.loc[2000, 4]
tm.assert_frame_equal(result, expected)
# ex from #1796
index = MultiIndex(
levels=[["foo", "bar"], ["one", "two"], [-1, 1]],
codes=[
[0, 0, 0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 0, 0, 1, 1],
[0, 1, 0, 1, 0, 1, 0, 1],
],
)
df = DataFrame(
np.random.default_rng(2).standard_normal((8, 4)),
index=index,
columns=list("abcd"),
)
result = df.xs(("foo", "one"))
expected = df.loc["foo", "one"]
tm.assert_frame_equal(result, expected)
def test_getitem_partial(self, multiindex_year_month_day_dataframe_random_data):
ymd = multiindex_year_month_day_dataframe_random_data
ymd = ymd.T
result = ymd[2000, 2]
expected = ymd.reindex(columns=ymd.columns[ymd.columns.codes[1] == 1])
expected.columns = expected.columns.droplevel(0).droplevel(0)
tm.assert_frame_equal(result, expected)
def test_fancy_slice_partial(
self,
multiindex_dataframe_random_data,
multiindex_year_month_day_dataframe_random_data,
):
frame = multiindex_dataframe_random_data
result = frame.loc["bar":"baz"]
expected = frame[3:7]
tm.assert_frame_equal(result, expected)
ymd = multiindex_year_month_day_dataframe_random_data
result = ymd.loc[(2000, 2):(2000, 4)]
lev = ymd.index.codes[1]
expected = ymd[(lev >= 1) & (lev <= 3)]
tm.assert_frame_equal(result, expected)
def test_getitem_partial_column_select(self):
    """A partial row key with a column selector matches selecting rows first, then columns."""
    row_index = MultiIndex(
        codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]],
        levels=[["a", "b"], ["x", "y"], ["p", "q"]],
    )
    df = DataFrame(np.random.default_rng(2).random((3, 2)), index=row_index)

    rows_only = df.loc[("a", "y")]

    # Full column slice.
    tm.assert_frame_equal(df.loc[("a", "y"), :], rows_only)

    # Explicit, reordered column list.
    tm.assert_frame_equal(df.loc[("a", "y"), [1, 0]], df.loc[("a", "y")][[1, 0]])

    with pytest.raises(KeyError, match=r"\('a', 'foo'\)"):
        df.loc[("a", "foo"), :]
# TODO(ArrayManager) rewrite test to not use .values
# exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view
@td.skip_array_manager_invalid_test
def test_partial_set(
    self,
    multiindex_year_month_day_dataframe_random_data,
    using_copy_on_write,
    warn_copy_on_write,
):
    """GH #397: setting through partial MultiIndex keys, directly and via chained indexing.

    Label-based writes on ``df`` are mirrored by positional writes on
    ``exp``, and the two frames are compared after each step.
    """
    ymd = multiindex_year_month_day_dataframe_random_data
    df = ymd.copy()
    exp = ymd.copy()

    df.loc[2000, 4] = 0
    exp.iloc[65:85] = 0
    tm.assert_frame_equal(df, exp)

    # The chained assignment is attempted in both modes.
    with tm.raises_chained_assignment_error():
        df["A"].loc[2000, 4] = 1
    if using_copy_on_write:
        # Under copy-on-write the same value is then written directly.
        df.loc[(2000, 4), "A"] = 1
    exp.iloc[65:85, 0] = 1
    tm.assert_frame_equal(df, exp)

    df.loc[2000] = 5
    exp.iloc[:100] = 5
    tm.assert_frame_equal(df, exp)

    # this works...for now
    with tm.raises_chained_assignment_error():
        df["A"].iloc[14] = 5
    expected_value = exp["A"].iloc[14] if using_copy_on_write else 5
    assert df["A"].iloc[14] == expected_value
@pytest.mark.parametrize("dtype", [int, float])
def test_getitem_intkey_leading_level(
self, multiindex_year_month_day_dataframe_random_data, dtype
):
# GH#33355 dont fall-back to positional when leading level is int
ymd = multiindex_year_month_day_dataframe_random_data
levels = ymd.index.levels
ymd.index = ymd.index.set_levels([levels[0].astype(dtype)] + levels[1:])
ser = ymd["A"]
mi = ser.index
assert isinstance(mi, MultiIndex)
if dtype is int:
assert mi.levels[0].dtype == np.dtype(int)
else:
assert mi.levels[0].dtype == np.float64
assert 14 not in mi.levels[0]
assert not mi.levels[0]._should_fallback_to_positional
assert not mi._should_fallback_to_positional
with pytest.raises(KeyError, match="14"):
ser[14]
# ---------------------------------------------------------------------
def test_setitem_multiple_partial(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
expected = frame.copy()
result = frame.copy()
result.loc[["foo", "bar"]] = 0
expected.loc["foo"] = 0
expected.loc["bar"] = 0
tm.assert_frame_equal(result, expected)
expected = frame.copy()
result = frame.copy()
result.loc["foo":"bar"] = 0
expected.loc["foo"] = 0
expected.loc["bar"] = 0
tm.assert_frame_equal(result, expected)
expected = frame["A"].copy()
result = frame["A"].copy()
result.loc[["foo", "bar"]] = 0
expected.loc["foo"] = 0
expected.loc["bar"] = 0
tm.assert_series_equal(result, expected)
expected = frame["A"].copy()
result = frame["A"].copy()
result.loc["foo":"bar"] = 0
expected.loc["foo"] = 0
expected.loc["bar"] = 0
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"indexer, exp_idx, exp_values",
[
(
slice("2019-2", None),
DatetimeIndex(["2019-02-01"], dtype="M8[ns]"),
[2, 3],
),
(
slice(None, "2019-2"),
date_range("2019", periods=2, freq="MS"),
[0, 1, 2, 3],
),
],
)
def test_partial_getitem_loc_datetime(self, indexer, exp_idx, exp_values):
# GH: 25165
date_idx = date_range("2019", periods=2, freq="MS")
df = DataFrame(
list(range(4)),
index=MultiIndex.from_product([date_idx, [0, 1]], names=["x", "y"]),
)
expected = DataFrame(
exp_values,
index=MultiIndex.from_product([exp_idx, [0, 1]], names=["x", "y"]),
)
result = df[indexer]
tm.assert_frame_equal(result, expected)
result = df.loc[indexer]
tm.assert_frame_equal(result, expected)
result = df.loc(axis=0)[indexer]
tm.assert_frame_equal(result, expected)
result = df.loc[indexer, :]
tm.assert_frame_equal(result, expected)
df2 = df.swaplevel(0, 1).sort_index()
expected = expected.swaplevel(0, 1).sort_index()
result = df2.loc[:, indexer, :]
tm.assert_frame_equal(result, expected)
def test_loc_getitem_partial_both_axis():
    """A partial key on both axes drops the selected level from rows and columns."""
    # gh-12660
    level_values = [["a", "b"], [2, 1]]
    row_index = MultiIndex.from_product(level_values, names=["row1", "row2"])
    col_index = MultiIndex.from_product(level_values, names=["col1", "col2"])
    values = np.random.default_rng(2).standard_normal((4, 4))
    df = DataFrame(values, index=row_index, columns=col_index)

    positional = df.iloc[:2, 2:]
    expected = positional.droplevel("row1").droplevel("col1", axis=1)

    tm.assert_frame_equal(df.loc["a", "b"], expected)

View File

@@ -0,0 +1,589 @@
import numpy as np
import pytest
from pandas.errors import SettingWithCopyError
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
MultiIndex,
Series,
date_range,
isna,
notna,
)
import pandas._testing as tm
def assert_equal(a, b):
    """Assert that ``a == b``; used as the default comparison in ``check``."""
    assert a == b
class TestMultiIndexSetItem:
def check(self, target, indexers, value, compare_fn=assert_equal, expected=None):
    """
    Assign ``value`` at ``target.loc[indexers]`` and compare what is read back.

    The value read back is compared, via ``compare_fn``, against ``expected``
    or against ``value`` itself when ``expected`` is None.
    """
    target.loc[indexers] = value
    read_back = target.loc[indexers]
    reference = value if expected is None else expected
    compare_fn(read_back, reference)
def test_setitem_multiindex(self):
    """Scalar assignment at a (row tuple, column) key round-trips through ``.loc``."""
    # GH#7190
    cols = ["A", "w", "l", "a", "x", "X", "d", "profit"]
    index = MultiIndex.from_product(
        [np.arange(0, 100), np.arange(0, 80)], names=["time", "firm"]
    )
    t, n = 0, 2
    cell = ((t, n), "X")

    nan_filled = DataFrame(np.nan, columns=cols, index=index)
    self.check(target=nan_filled, indexers=cell, value=0)

    int_filled = DataFrame(-999, columns=cols, index=index)
    self.check(target=int_filled, indexers=cell, value=1)

    no_data = DataFrame(columns=cols, index=index)
    self.check(target=no_data, indexers=cell, value=2)

    # gh-7218: assigning with 0-dim arrays
    int_filled = DataFrame(-999, columns=cols, index=index)
    self.check(target=int_filled, indexers=cell, value=np.array(3), expected=3)
def test_setitem_multiindex2(self):
    """Assigning a Series to a boolean-row / list-column selection fills each column."""
    # GH#5206
    values = np.arange(25).reshape(5, 5)
    df = DataFrame(values, columns="A,B,C,D,E".split(","), dtype=float)
    df["F"] = 99

    even_rows = df["A"] % 2 == 0
    target_cols = ["B", "C"]
    df.loc[even_rows, target_cols] = df["F"]

    expected = DataFrame(99.0, index=[0, 2, 4], columns=["B", "C"])
    tm.assert_frame_equal(df.loc[even_rows, target_cols], expected)

    # Repeat the assignment through the shared helper.
    self.check(
        target=df,
        indexers=(even_rows, target_cols),
        value=df["F"],
        compare_fn=tm.assert_frame_equal,
        expected=expected,
    )
def test_setitem_multiindex3(self):
    """Frames can be assigned into MultiIndex row/column sub-selections."""
    # GH#11372
    row_dates = date_range("2015-01-01", "2015-04-01", freq="MS")
    col_dates = date_range("2016-01-01", "2016-02-01", freq="MS")
    idx = MultiIndex.from_product([["A", "B", "C"], row_dates])
    cols = MultiIndex.from_product([["foo", "bar"], col_dates])
    df = DataFrame(
        np.random.default_rng(2).random((12, 4)), index=idx, columns=cols
    )

    subidx = MultiIndex.from_arrays(
        [["A", "A"], date_range("2015-01-01", "2015-02-01", freq="MS")]
    )
    subcols = MultiIndex.from_arrays(
        [["foo", "foo"], date_range("2016-01-01", "2016-02-01", freq="MS")]
    )

    # set a sub-block of rows and columns
    block = DataFrame(
        np.random.default_rng(2).random((2, 2)), index=subidx, columns=subcols
    )
    self.check(
        target=df,
        indexers=(subidx, subcols),
        value=block,
        compare_fn=tm.assert_frame_equal,
    )

    # set all columns
    wide = DataFrame(
        np.random.default_rng(2).random((2, 4)), index=subidx, columns=cols
    )
    self.check(
        target=df,
        indexers=(subidx, slice(None)),
        value=wide,
        compare_fn=tm.assert_frame_equal,
    )

    # identity
    snapshot = df.copy()
    self.check(
        target=df,
        indexers=(df.index, df.columns),
        value=df,
        compare_fn=tm.assert_frame_equal,
        expected=snapshot,
    )
# TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in
# all NaNs -> doesn't work in the "split" path (also for BlockManager actually)
@td.skip_array_manager_not_yet_implemented
def test_multiindex_setitem(self):
# GH 3738
# setting with a multi-index right hand side
arrays = [
np.array(["bar", "bar", "baz", "qux", "qux", "bar"]),
np.array(["one", "two", "one", "one", "two", "one"]),
np.arange(0, 6, 1),
]
df_orig = DataFrame(
np.random.default_rng(2).standard_normal((6, 3)),
index=arrays,
columns=["A", "B", "C"],
).sort_index()
expected = df_orig.loc[["bar"]] * 2
df = df_orig.copy()
df.loc[["bar"]] *= 2
tm.assert_frame_equal(df.loc[["bar"]], expected)
# raise because these have differing levels
msg = "cannot align on a multi-index with out specifying the join levels"
with pytest.raises(TypeError, match=msg):
df.loc["bar"] *= 2
def test_multiindex_setitem2(self):
# from SO
# https://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation
df_orig = DataFrame.from_dict(
{
"price": {
("DE", "Coal", "Stock"): 2,
("DE", "Gas", "Stock"): 4,
("DE", "Elec", "Demand"): 1,
("FR", "Gas", "Stock"): 5,
("FR", "Solar", "SupIm"): 0,
("FR", "Wind", "SupIm"): 0,
}
}
)
df_orig.index = MultiIndex.from_tuples(
df_orig.index, names=["Sit", "Com", "Type"]
)
expected = df_orig.copy()
expected.iloc[[0, 1, 3]] *= 2
idx = pd.IndexSlice
df = df_orig.copy()
df.loc[idx[:, :, "Stock"], :] *= 2
tm.assert_frame_equal(df, expected)
df = df_orig.copy()
df.loc[idx[:, :, "Stock"], "price"] *= 2
tm.assert_frame_equal(df, expected)
def test_multiindex_assignment(self):
# GH3777 part 2
# mixed dtype
df = DataFrame(
np.random.default_rng(2).integers(5, 10, size=9).reshape(3, 3),
columns=list("abc"),
index=[[4, 4, 8], [8, 10, 12]],
)
df["d"] = np.nan
arr = np.array([0.0, 1.0])
df.loc[4, "d"] = arr
tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d"))
def test_multiindex_assignment_single_dtype(
    self, using_copy_on_write, warn_copy_on_write
):
    """
    Exercise ``df.loc[4, "c"] = ...`` on an all-int64 frame.

    Covers, in order: a lossless array assignment, an upcasting array
    assignment (expects a FutureWarning), a scalar assignment, two
    wrong-length list assignments (expect ValueError) and a length-1 list
    with a list-like column indexer.
    """
    # GH3777 part 2b
    # single dtype
    arr = np.array([0.0, 1.0])

    df = DataFrame(
        np.random.default_rng(2).integers(5, 10, size=9).reshape(3, 3),
        columns=list("abc"),
        index=[[4, 4, 8], [8, 10, 12]],
        dtype=np.int64,
    )
    # Taken before the assignment so it can be compared afterwards.
    view = df["c"].iloc[:2].values

    # arr can be losslessly cast to int, so this setitem is inplace
    # INFO(CoW-warn) this does not warn because we directly took .values
    # above, so no reference to a pandas object is alive for `view`
    df.loc[4, "c"] = arr
    exp = Series(arr, index=[8, 10], name="c", dtype="int64")
    result = df.loc[4, "c"]
    tm.assert_series_equal(result, exp)

    # extra check for inplace-ness
    if not using_copy_on_write:
        tm.assert_numpy_array_equal(view, exp.values)

    # arr + 0.5 cannot be cast losslessly to int, so we upcast
    with tm.assert_produces_warning(
        FutureWarning, match="item of incompatible dtype"
    ):
        df.loc[4, "c"] = arr + 0.5
    result = df.loc[4, "c"]
    exp = exp + 0.5
    tm.assert_series_equal(result, exp)

    # scalar ok
    with tm.assert_cow_warning(warn_copy_on_write):
        df.loc[4, "c"] = 10
    exp = Series(10, index=[8, 10], name="c", dtype="float64")
    tm.assert_series_equal(df.loc[4, "c"], exp)

    # invalid assignments
    msg = "Must have equal len keys and value when setting with an iterable"
    with pytest.raises(ValueError, match=msg):
        df.loc[4, "c"] = [0, 1, 2, 3]

    with pytest.raises(ValueError, match=msg):
        df.loc[4, "c"] = [0]

    # But with a length-1 listlike column indexer this behaves like
    # `df.loc[4, "c"] = 0`
    with tm.assert_cow_warning(warn_copy_on_write):
        df.loc[4, ["c"]] = [0]
    assert (df.loc[4, "c"] == 0).all()
def test_groupby_example(self):
# groupby example
NUM_ROWS = 100
NUM_COLS = 10
col_names = ["A" + num for num in map(str, np.arange(NUM_COLS).tolist())]
index_cols = col_names[:5]
df = DataFrame(
np.random.default_rng(2).integers(5, size=(NUM_ROWS, NUM_COLS)),
dtype=np.int64,
columns=col_names,
)
df = df.set_index(index_cols).sort_index()
grp = df.groupby(level=index_cols[:4])
df["new_col"] = np.nan
# we are actually operating on a copy here
# but in this case, that's ok
for name, df2 in grp:
new_vals = np.arange(df2.shape[0])
df.loc[name, "new_col"] = new_vals
def test_series_setitem(
self, multiindex_year_month_day_dataframe_random_data, warn_copy_on_write
):
ymd = multiindex_year_month_day_dataframe_random_data
s = ymd["A"]
with tm.assert_cow_warning(warn_copy_on_write):
s[2000, 3] = np.nan
assert isna(s.values[42:65]).all()
assert notna(s.values[:42]).all()
assert notna(s.values[65:]).all()
with tm.assert_cow_warning(warn_copy_on_write):
s[2000, 3, 10] = np.nan
assert isna(s.iloc[49])
with pytest.raises(KeyError, match="49"):
# GH#33355 dont fall-back to positional when leading level is int
s[49]
def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
df = frame.T.copy()
values = df.values.copy()
result = df[df > 0]
expected = df.where(df > 0)
tm.assert_frame_equal(result, expected)
df[df > 0] = 5
values[values > 0] = 5
tm.assert_almost_equal(df.values, values)
df[df == 5] = 0
values[values == 5] = 0
tm.assert_almost_equal(df.values, values)
# a df that needs alignment first
df[df[:-1] < 0] = 2
np.putmask(values[:-1], values[:-1] < 0, 2)
tm.assert_almost_equal(df.values, values)
with pytest.raises(TypeError, match="boolean values only"):
df[df * 0] = 2
def test_frame_getitem_setitem_multislice(self):
levels = [["t1", "t2"], ["a", "b", "c"]]
codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]]
midx = MultiIndex(codes=codes, levels=levels, names=[None, "id"])
df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx)
result = df.loc[:, "value"]
tm.assert_series_equal(df["value"], result)
result = df.loc[df.index[1:3], "value"]
tm.assert_series_equal(df["value"][1:3], result)
result = df.loc[:, :]
tm.assert_frame_equal(df, result)
result = df
df.loc[:, "value"] = 10
result["value"] = 10
tm.assert_frame_equal(df, result)
df.loc[:, :] = 10
tm.assert_frame_equal(df, result)
def test_frame_setitem_multi_column(self):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=[["a", "a", "b", "b"], [0, 1, 0, 1]],
)
cp = df.copy()
cp["a"] = cp["b"]
tm.assert_frame_equal(cp["a"], cp["b"])
# set with ndarray
cp = df.copy()
cp["a"] = cp["b"].values
tm.assert_frame_equal(cp["a"], cp["b"])
def test_frame_setitem_multi_column2(self):
# ---------------------------------------
# GH#1803
columns = MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")])
df = DataFrame(index=[1, 3, 5], columns=columns)
# Works, but adds a column instead of updating the two existing ones
df["A"] = 0.0 # Doesn't work
assert (df["A"].values == 0).all()
# it broadcasts
df["B", "1"] = [1, 2, 3]
df["A"] = df["B", "1"]
sliced_a1 = df["A", "1"]
sliced_a2 = df["A", "2"]
sliced_b1 = df["B", "1"]
tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False)
tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False)
assert sliced_a1.name == ("A", "1")
assert sliced_a2.name == ("A", "2")
assert sliced_b1.name == ("B", "1")
def test_loc_getitem_tuple_plus_columns(
self, multiindex_year_month_day_dataframe_random_data
):
# GH #1013
ymd = multiindex_year_month_day_dataframe_random_data
df = ymd[:5]
result = df.loc[(2000, 1, 6), ["A", "B", "C"]]
expected = df.loc[2000, 1, 6][["A", "B", "C"]]
tm.assert_series_equal(result, expected)
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
def test_loc_getitem_setitem_slice_integers(self, frame_or_series):
index = MultiIndex(
levels=[[0, 1, 2], [0, 2]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]
)
obj = DataFrame(
np.random.default_rng(2).standard_normal((len(index), 4)),
index=index,
columns=["a", "b", "c", "d"],
)
obj = tm.get_obj(obj, frame_or_series)
res = obj.loc[1:2]
exp = obj.reindex(obj.index[2:])
tm.assert_equal(res, exp)
obj.loc[1:2] = 7
assert (obj.loc[1:2] == 7).values.all()
def test_setitem_change_dtype(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
dft = frame.T
s = dft["foo", "two"]
dft["foo", "two"] = s > s.median()
tm.assert_series_equal(dft["foo", "two"], s > s.median())
# assert isinstance(dft._data.blocks[1].items, MultiIndex)
reindexed = dft.reindex(columns=[("foo", "two")])
tm.assert_series_equal(reindexed["foo", "two"], s > s.median())
def test_set_column_scalar_with_loc(
self, multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write
):
frame = multiindex_dataframe_random_data
subset = frame.index[[1, 4, 5]]
frame.loc[subset] = 99
assert (frame.loc[subset].values == 99).all()
frame_original = frame.copy()
col = frame["B"]
with tm.assert_cow_warning(warn_copy_on_write):
col[subset] = 97
if using_copy_on_write:
# chained setitem doesn't work with CoW
tm.assert_frame_equal(frame, frame_original)
else:
assert (frame.loc[subset, "B"] == 97).all()
def test_nonunique_assignment_1750(self):
df = DataFrame(
[[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], columns=list("ABCD")
)
df = df.set_index(["A", "B"])
mi = MultiIndex.from_tuples([(1, 1)])
df.loc[mi, "C"] = "_"
assert (df.xs((1, 1))["C"] == "_").all()
def test_astype_assignment_with_dups(self):
# GH 4686
# assignment with dups that has a dtype change
cols = MultiIndex.from_tuples([("A", "1"), ("B", "1"), ("A", "2")])
df = DataFrame(np.arange(3).reshape((1, 3)), columns=cols, dtype=object)
index = df.index.copy()
df["A"] = df["A"].astype(np.float64)
tm.assert_index_equal(df.index, index)
def test_setitem_nonmonotonic(self):
# https://github.com/pandas-dev/pandas/issues/31449
index = MultiIndex.from_tuples(
[("a", "c"), ("b", "x"), ("a", "d")], names=["l1", "l2"]
)
df = DataFrame(data=[0, 1, 2], index=index, columns=["e"])
df.loc["a", "e"] = np.arange(99, 101, dtype="int64")
expected = DataFrame({"e": [99, 1, 100]}, index=index)
tm.assert_frame_equal(df, expected)
class TestSetitemWithExpansionMultiIndex:
def test_setitem_new_column_mixed_depth(self):
arrays = [
["a", "top", "top", "routine1", "routine1", "routine2"],
["", "OD", "OD", "result1", "result2", "result1"],
["", "wx", "wy", "", "", ""],
]
tuples = sorted(zip(*arrays))
index = MultiIndex.from_tuples(tuples)
df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)
result = df.copy()
expected = df.copy()
result["b"] = [1, 2, 3, 4]
expected["b", "", ""] = [1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
def test_setitem_new_column_all_na(self):
# GH#1534
mix = MultiIndex.from_tuples([("1a", "2a"), ("1a", "2b"), ("1a", "2c")])
df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix)
s = Series({(1, 1): 1, (1, 2): 2})
df["new"] = s
assert df["new"].isna().all()
def test_setitem_enlargement_keep_index_names(self):
# GH#53053
mi = MultiIndex.from_tuples([(1, 2, 3)], names=["i1", "i2", "i3"])
df = DataFrame(data=[[10, 20, 30]], index=mi, columns=["A", "B", "C"])
df.loc[(0, 0, 0)] = df.loc[(1, 2, 3)]
mi_expected = MultiIndex.from_tuples(
[(1, 2, 3), (0, 0, 0)], names=["i1", "i2", "i3"]
)
expected = DataFrame(
data=[[10, 20, 30], [10, 20, 30]],
index=mi_expected,
columns=["A", "B", "C"],
)
tm.assert_frame_equal(df, expected)
@td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values
# is not a view
def test_frame_setitem_view_direct(
multiindex_dataframe_random_data, using_copy_on_write
):
# this works because we are modifying the underlying array
# really a no-no
df = multiindex_dataframe_random_data.T
if using_copy_on_write:
with pytest.raises(ValueError, match="read-only"):
df["foo"].values[:] = 0
assert (df["foo"].values != 0).all()
else:
df["foo"].values[:] = 0
assert (df["foo"].values == 0).all()
def test_frame_setitem_copy_raises(
multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write
):
# will raise/warn as its chained assignment
df = multiindex_dataframe_random_data.T
if using_copy_on_write or warn_copy_on_write:
with tm.raises_chained_assignment_error():
df["foo"]["one"] = 2
else:
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(SettingWithCopyError, match=msg):
with tm.raises_chained_assignment_error():
df["foo"]["one"] = 2
def test_frame_setitem_copy_no_write(
multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write
):
frame = multiindex_dataframe_random_data.T
expected = frame
df = frame.copy()
if using_copy_on_write or warn_copy_on_write:
with tm.raises_chained_assignment_error():
df["foo"]["one"] = 2
else:
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(SettingWithCopyError, match=msg):
with tm.raises_chained_assignment_error():
df["foo"]["one"] = 2
result = df
tm.assert_frame_equal(result, expected)
def test_frame_setitem_partial_multiindex():
    """A Series indexed by a subset of the frame's levels aligns on column assignment."""
    # GH 54875
    flat = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": 6, "d": 7})
    df = flat.set_index(["a", "b", "c"])
    # Same rows as df, but without the "c" level.
    ser = Series(8, index=df.index.droplevel("c"))

    result = df.copy()
    result["d"] = ser

    expected = df.copy()
    expected["d"] = 8

    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,796 @@
from datetime import (
datetime,
timedelta,
)
import numpy as np
import pytest
from pandas.errors import UnsortedIndexError
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
)
import pandas._testing as tm
from pandas.tests.indexing.common import _mklbl
class TestMultiIndexSlicers:
def test_per_axis_per_level_getitem(self):
# GH6134
# example test case
ix = MultiIndex.from_product(
[_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)]
)
df = DataFrame(np.arange(len(ix.to_numpy())), index=ix)
result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
expected = df.loc[
[
(
a,
b,
c,
d,
)
for a, b, c, d in df.index.values
if a in ("A1", "A2", "A3") and c in ("C1", "C3")
]
]
tm.assert_frame_equal(result, expected)
expected = df.loc[
[
(
a,
b,
c,
d,
)
for a, b, c, d in df.index.values
if a in ("A1", "A2", "A3") and c in ("C1", "C2", "C3")
]
]
result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :]
tm.assert_frame_equal(result, expected)
# test multi-index slicing with per axis and per index controls
index = MultiIndex.from_tuples(
[("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]
)
columns = MultiIndex.from_tuples(
[("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
names=["lvl0", "lvl1"],
)
df = DataFrame(
np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns
)
df = df.sort_index(axis=0).sort_index(axis=1)
# identity
result = df.loc[(slice(None), slice(None)), :]
tm.assert_frame_equal(result, df)
result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
tm.assert_frame_equal(result, df)
result = df.loc[:, (slice(None), slice(None))]
tm.assert_frame_equal(result, df)
# index
result = df.loc[(slice(None), [1]), :]
expected = df.iloc[[0, 3]]
tm.assert_frame_equal(result, expected)
result = df.loc[(slice(None), 1), :]
expected = df.iloc[[0, 3]]
tm.assert_frame_equal(result, expected)
# columns
result = df.loc[:, (slice(None), ["foo"])]
expected = df.iloc[:, [1, 3]]
tm.assert_frame_equal(result, expected)
# both
result = df.loc[(slice(None), 1), (slice(None), ["foo"])]
expected = df.iloc[[0, 3], [1, 3]]
tm.assert_frame_equal(result, expected)
result = df.loc["A", "a"]
expected = DataFrame(
{"bar": [1, 5, 9], "foo": [0, 4, 8]},
index=Index([1, 2, 3], name="two"),
columns=Index(["bar", "foo"], name="lvl1"),
)
tm.assert_frame_equal(result, expected)
result = df.loc[(slice(None), [1, 2]), :]
expected = df.iloc[[0, 1, 3]]
tm.assert_frame_equal(result, expected)
# multi-level series
s = Series(np.arange(len(ix.to_numpy())), index=ix)
result = s.loc["A1":"A3", :, ["C1", "C3"]]
expected = s.loc[
[
(
a,
b,
c,
d,
)
for a, b, c, d in s.index.values
if a in ("A1", "A2", "A3") and c in ("C1", "C3")
]
]
tm.assert_series_equal(result, expected)
# boolean indexers
result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
expected = df.iloc[[2, 3]]
tm.assert_frame_equal(result, expected)
msg = (
"cannot index with a boolean indexer "
"that is not the same length as the index"
)
with pytest.raises(ValueError, match=msg):
df.loc[(slice(None), np.array([True, False])), :]
with pytest.raises(KeyError, match=r"\[1\] not in index"):
# slice(None) is on the index, [1] is on the columns, but 1 is
# not in the columns, so we raise
# This used to treat [1] as positional GH#16396
df.loc[slice(None), [1]]
# not lexsorted
assert df.index._lexsort_depth == 2
df = df.sort_index(level=1, axis=0)
assert df.index._lexsort_depth == 0
msg = (
"MultiIndex slicing requires the index to be "
r"lexsorted: slicing on levels \[1\], lexsort depth 0"
)
with pytest.raises(UnsortedIndexError, match=msg):
df.loc[(slice(None), slice("bar")), :]
# GH 16734: not sorted, but no real slicing
result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
tm.assert_frame_equal(result, df.iloc[[1, 3], :])
def test_multiindex_slicers_non_unique(self):
# GH 7106
# non-unique mi index support
df = (
DataFrame(
{
"A": ["foo", "foo", "foo", "foo"],
"B": ["a", "a", "a", "a"],
"C": [1, 2, 1, 3],
"D": [1, 2, 3, 4],
}
)
.set_index(["A", "B", "C"])
.sort_index()
)
assert not df.index.is_unique
expected = (
DataFrame({"A": ["foo", "foo"], "B": ["a", "a"], "C": [1, 1], "D": [1, 3]})
.set_index(["A", "B", "C"])
.sort_index()
)
result = df.loc[(slice(None), slice(None), 1), :]
tm.assert_frame_equal(result, expected)
# this is equivalent of an xs expression
result = df.xs(1, level=2, drop_level=False)
tm.assert_frame_equal(result, expected)
df = (
DataFrame(
{
"A": ["foo", "foo", "foo", "foo"],
"B": ["a", "a", "a", "a"],
"C": [1, 2, 1, 2],
"D": [1, 2, 3, 4],
}
)
.set_index(["A", "B", "C"])
.sort_index()
)
assert not df.index.is_unique
expected = (
DataFrame({"A": ["foo", "foo"], "B": ["a", "a"], "C": [1, 1], "D": [1, 3]})
.set_index(["A", "B", "C"])
.sort_index()
)
result = df.loc[(slice(None), slice(None), 1), :]
assert not result.index.is_unique
tm.assert_frame_equal(result, expected)
# GH12896
# numpy-implementation dependent bug
ints = [
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
12,
13,
14,
14,
16,
17,
18,
19,
200000,
200000,
]
n = len(ints)
idx = MultiIndex.from_arrays([["a"] * n, ints])
result = Series([1] * n, index=idx)
result = result.sort_index()
result = result.loc[(slice(None), slice(100000))]
expected = Series([1] * (n - 2), index=idx[:-2]).sort_index()
tm.assert_series_equal(result, expected)
def test_multiindex_slicers_datetimelike(self):
# GH 7429
# buggy/inconsistent behavior when slicing with datetime-like
dates = [datetime(2012, 1, 1, 12, 12, 12) + timedelta(days=i) for i in range(6)]
freq = [1, 2]
index = MultiIndex.from_product([dates, freq], names=["date", "frequency"])
df = DataFrame(
np.arange(6 * 2 * 4, dtype="int64").reshape(-1, 4),
index=index,
columns=list("ABCD"),
)
# multi-axis slicing
idx = pd.IndexSlice
expected = df.iloc[[0, 2, 4], [0, 1]]
result = df.loc[
(
slice(
Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12")
),
slice(1, 1),
),
slice("A", "B"),
]
tm.assert_frame_equal(result, expected)
result = df.loc[
(
idx[
Timestamp("2012-01-01 12:12:12") : Timestamp("2012-01-03 12:12:12")
],
idx[1:1],
),
slice("A", "B"),
]
tm.assert_frame_equal(result, expected)
result = df.loc[
(
slice(
Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12")
),
1,
),
slice("A", "B"),
]
tm.assert_frame_equal(result, expected)
# with strings
result = df.loc[
(slice("2012-01-01 12:12:12", "2012-01-03 12:12:12"), slice(1, 1)),
slice("A", "B"),
]
tm.assert_frame_equal(result, expected)
result = df.loc[
(idx["2012-01-01 12:12:12":"2012-01-03 12:12:12"], 1), idx["A", "B"]
]
tm.assert_frame_equal(result, expected)
def test_multiindex_slicers_edges(self):
# GH 8132
# various edge cases
df = DataFrame(
{
"A": ["A0"] * 5 + ["A1"] * 5 + ["A2"] * 5,
"B": ["B0", "B0", "B1", "B1", "B2"] * 3,
"DATE": [
"2013-06-11",
"2013-07-02",
"2013-07-09",
"2013-07-30",
"2013-08-06",
"2013-06-11",
"2013-07-02",
"2013-07-09",
"2013-07-30",
"2013-08-06",
"2013-09-03",
"2013-10-01",
"2013-07-09",
"2013-08-06",
"2013-09-03",
],
"VALUES": [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2],
}
)
df["DATE"] = pd.to_datetime(df["DATE"])
df1 = df.set_index(["A", "B", "DATE"])
df1 = df1.sort_index()
# A1 - Get all values under "A0" and "A1"
result = df1.loc[(slice("A1")), :]
expected = df1.iloc[0:10]
tm.assert_frame_equal(result, expected)
# A2 - Get all values from the start to "A2"
result = df1.loc[(slice("A2")), :]
expected = df1
tm.assert_frame_equal(result, expected)
# A3 - Get all values under "B1" or "B2"
result = df1.loc[(slice(None), slice("B1", "B2")), :]
expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]]
tm.assert_frame_equal(result, expected)
# A4 - Get all values between 2013-07-02 and 2013-07-09
result = df1.loc[(slice(None), slice(None), slice("20130702", "20130709")), :]
expected = df1.iloc[[1, 2, 6, 7, 12]]
tm.assert_frame_equal(result, expected)
# B1 - Get all values in B0 that are also under A0, A1 and A2
result = df1.loc[(slice("A2"), slice("B0")), :]
expected = df1.iloc[[0, 1, 5, 6, 10, 11]]
tm.assert_frame_equal(result, expected)
# B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for
# the As)
result = df1.loc[(slice(None), slice("B2")), :]
expected = df1
tm.assert_frame_equal(result, expected)
# B3 - Get all values from B1 to B2 and up to 2013-08-06
result = df1.loc[(slice(None), slice("B1", "B2"), slice("2013-08-06")), :]
expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]]
tm.assert_frame_equal(result, expected)
# B4 - Same as A4 but the start of the date slice is not a key.
# shows indexing on a partial selection slice
result = df1.loc[(slice(None), slice(None), slice("20130701", "20130709")), :]
expected = df1.iloc[[1, 2, 6, 7, 12]]
tm.assert_frame_equal(result, expected)
def test_per_axis_per_level_doc_examples(self):
# test index maker
idx = pd.IndexSlice
# from indexing.rst / advanced
index = MultiIndex.from_product(
[_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)]
)
columns = MultiIndex.from_tuples(
[("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
names=["lvl0", "lvl1"],
)
df = DataFrame(
np.arange(len(index) * len(columns), dtype="int64").reshape(
(len(index), len(columns))
),
index=index,
columns=columns,
)
result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
expected = df.loc[
[
(
a,
b,
c,
d,
)
for a, b, c, d in df.index.values
if a in ("A1", "A2", "A3") and c in ("C1", "C3")
]
]
tm.assert_frame_equal(result, expected)
result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :]
tm.assert_frame_equal(result, expected)
result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :]
expected = df.loc[
[
(
a,
b,
c,
d,
)
for a, b, c, d in df.index.values
if c in ("C1", "C3")
]
]
tm.assert_frame_equal(result, expected)
result = df.loc[idx[:, :, ["C1", "C3"]], :]
tm.assert_frame_equal(result, expected)
# not sorted
msg = (
"MultiIndex slicing requires the index to be lexsorted: "
r"slicing on levels \[1\], lexsort depth 1"
)
with pytest.raises(UnsortedIndexError, match=msg):
df.loc["A1", ("a", slice("foo"))]
# GH 16734: not sorted, but no real slicing
tm.assert_frame_equal(
df.loc["A1", (slice(None), "foo")], df.loc["A1"].iloc[:, [0, 2]]
)
df = df.sort_index(axis=1)
# slicing
df.loc["A1", (slice(None), "foo")]
df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")]
# setitem
df.loc(axis=0)[:, :, ["C1", "C3"]] = -10
def test_loc_axis_arguments(self):
index = MultiIndex.from_product(
[_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)]
)
columns = MultiIndex.from_tuples(
[("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
names=["lvl0", "lvl1"],
)
df = (
DataFrame(
np.arange(len(index) * len(columns), dtype="int64").reshape(
(len(index), len(columns))
),
index=index,
columns=columns,
)
.sort_index()
.sort_index(axis=1)
)
# axis 0
result = df.loc(axis=0)["A1":"A3", :, ["C1", "C3"]]
expected = df.loc[
[
(
a,
b,
c,
d,
)
for a, b, c, d in df.index.values
if a in ("A1", "A2", "A3") and c in ("C1", "C3")
]
]
tm.assert_frame_equal(result, expected)
result = df.loc(axis="index")[:, :, ["C1", "C3"]]
expected = df.loc[
[
(
a,
b,
c,
d,
)
for a, b, c, d in df.index.values
if c in ("C1", "C3")
]
]
tm.assert_frame_equal(result, expected)
# axis 1
result = df.loc(axis=1)[:, "foo"]
expected = df.loc[:, (slice(None), "foo")]
tm.assert_frame_equal(result, expected)
result = df.loc(axis="columns")[:, "foo"]
expected = df.loc[:, (slice(None), "foo")]
tm.assert_frame_equal(result, expected)
# invalid axis
for i in [-1, 2, "foo"]:
msg = f"No axis named {i} for object type DataFrame"
with pytest.raises(ValueError, match=msg):
df.loc(axis=i)[:, :, ["C1", "C3"]]
def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self):
# GH29519
df = DataFrame(
np.arange(27).reshape(3, 9),
columns=MultiIndex.from_product([["a1", "a2", "a3"], ["b1", "b2", "b3"]]),
)
result = df.loc(axis=1)["a1":"a2"]
expected = df.iloc[:, :-3]
tm.assert_frame_equal(result, expected)
def test_loc_axis_single_level_single_col_indexing_multiindex_col_df(self):
# GH29519
df = DataFrame(
np.arange(27).reshape(3, 9),
columns=MultiIndex.from_product([["a1", "a2", "a3"], ["b1", "b2", "b3"]]),
)
result = df.loc(axis=1)["a1"]
expected = df.iloc[:, :3]
expected.columns = ["b1", "b2", "b3"]
tm.assert_frame_equal(result, expected)
def test_loc_ax_single_level_indexer_simple_df(self):
# GH29519
# test single level indexing on single index column data frame
df = DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"])
result = df.loc(axis=1)["a"]
expected = Series(np.array([0, 3, 6]), name="a")
tm.assert_series_equal(result, expected)
def test_per_axis_per_level_setitem(self):
    """Assign through ``.loc`` on a frame with MultiIndex rows and columns.

    Every case writes into a fresh copy of the same sorted 4x4 int64 frame
    using a per-axis / per-level key and compares the outcome against the
    equivalent positional ``.iloc`` assignment.
    """
    # test index maker
    idx = pd.IndexSlice
    everything = slice(None)
    # Keys reused below: rows whose second level is 1, columns whose second
    # level is "foo".
    rows_1 = (everything, 1)
    cols_foo = (everything, ["foo"])

    # test multi-index slicing with per axis and per index controls
    row_labels = MultiIndex.from_tuples(
        [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]
    )
    col_labels = MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )
    df_orig = DataFrame(
        np.arange(16, dtype="int64").reshape(4, 4),
        index=row_labels,
        columns=col_labels,
    )
    df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)

    # identity: four spellings of "every cell"
    all_set = df_orig.copy()
    all_set.iloc[:, :] = 100

    df = df_orig.copy()
    df.loc[(everything, everything), :] = 100
    tm.assert_frame_equal(df, all_set)

    df = df_orig.copy()
    df.loc(axis=0)[:, :] = 100
    tm.assert_frame_equal(df, all_set)

    df = df_orig.copy()
    df.loc[(everything, everything), (everything, everything)] = 100
    tm.assert_frame_equal(df, all_set)

    df = df_orig.copy()
    df.loc[:, (everything, everything)] = 100
    tm.assert_frame_equal(df, all_set)

    # index: the matching rows sit at positions 0 and 3
    rows_set = df_orig.copy()
    rows_set.iloc[[0, 3]] = 100

    df = df_orig.copy()
    df.loc[(everything, [1]), :] = 100
    tm.assert_frame_equal(df, rows_set)

    df = df_orig.copy()
    df.loc[rows_1, :] = 100
    tm.assert_frame_equal(df, rows_set)

    df = df_orig.copy()
    df.loc(axis=0)[:, 1] = 100
    tm.assert_frame_equal(df, rows_set)

    # columns: the "foo" columns sit at positions 1 and 3 after sorting
    df = df_orig.copy()
    df.loc[:, cols_foo] = 100
    expected = df_orig.copy()
    expected.iloc[:, [1, 3]] = 100
    tm.assert_frame_equal(df, expected)

    # both
    cells_set = df_orig.copy()
    cells_set.iloc[[0, 3], [1, 3]] = 100

    df = df_orig.copy()
    df.loc[rows_1, cols_foo] = 100
    tm.assert_frame_equal(df, cells_set)

    df = df_orig.copy()
    df.loc[idx[:, 1], idx[:, ["foo"]]] = 100
    tm.assert_frame_equal(df, cells_set)

    df = df_orig.copy()
    df.loc["A", "a"] = 100
    expected = df_orig.copy()
    expected.iloc[0:3, 0:2] = 100
    tm.assert_frame_equal(df, expected)

    # setting with a list-like
    df = df_orig.copy()
    df.loc[rows_1, cols_foo] = np.array([[100, 100], [100, 100]], dtype="int64")
    tm.assert_frame_equal(df, cells_set)

    # not enough values
    df = df_orig.copy()

    msg = "setting an array element with a sequence."
    with pytest.raises(ValueError, match=msg):
        df.loc[rows_1, cols_foo] = np.array([[100], [100, 100]], dtype="int64")

    msg = "Must have equal len keys and value when setting with an iterable"
    with pytest.raises(ValueError, match=msg):
        df.loc[rows_1, cols_foo] = np.array([100, 100, 100, 100], dtype="int64")

    # with an alignable rhs
    df = df_orig.copy()
    df.loc[rows_1, cols_foo] = df.loc[rows_1, cols_foo] * 5
    expected = df_orig.copy()
    expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5
    tm.assert_frame_equal(df, expected)

    # The two in-place multiplications below share one expected frame.
    squared = df_orig.copy()
    squared.iloc[[0, 3], [1, 3]] *= squared.iloc[[0, 3], [1, 3]]

    df = df_orig.copy()
    df.loc[rows_1, cols_foo] *= df.loc[rows_1, cols_foo]
    tm.assert_frame_equal(df, squared)

    # rhs carries an extra ("c", "bah") column that the target does not have
    rhs = df_orig.loc[rows_1, cols_foo].copy()
    rhs.loc[:, ("c", "bah")] = 10
    df = df_orig.copy()
    df.loc[rows_1, cols_foo] *= rhs
    tm.assert_frame_equal(df, squared)
def test_multiindex_label_slicing_with_negative_step(self):
ser = Series(
np.arange(20), MultiIndex.from_product([list("abcde"), np.arange(4)])
)
SLC = pd.IndexSlice
tm.assert_indexing_slices_equivalent(ser, SLC[::-1], SLC[::-1])
tm.assert_indexing_slices_equivalent(ser, SLC["d"::-1], SLC[15::-1])
tm.assert_indexing_slices_equivalent(ser, SLC[("d",)::-1], SLC[15::-1])
tm.assert_indexing_slices_equivalent(ser, SLC[:"d":-1], SLC[:11:-1])
tm.assert_indexing_slices_equivalent(ser, SLC[:("d",):-1], SLC[:11:-1])
tm.assert_indexing_slices_equivalent(ser, SLC["d":"b":-1], SLC[15:3:-1])
tm.assert_indexing_slices_equivalent(ser, SLC[("d",):"b":-1], SLC[15:3:-1])
tm.assert_indexing_slices_equivalent(ser, SLC["d":("b",):-1], SLC[15:3:-1])
tm.assert_indexing_slices_equivalent(ser, SLC[("d",):("b",):-1], SLC[15:3:-1])
tm.assert_indexing_slices_equivalent(ser, SLC["b":"d":-1], SLC[:0])
tm.assert_indexing_slices_equivalent(ser, SLC[("c", 2)::-1], SLC[10::-1])
tm.assert_indexing_slices_equivalent(ser, SLC[:("c", 2):-1], SLC[:9:-1])
tm.assert_indexing_slices_equivalent(
ser, SLC[("e", 0):("c", 2):-1], SLC[16:9:-1]
)
def test_multiindex_slice_first_level(self):
# GH 12697
freq = ["a", "b", "c", "d"]
idx = MultiIndex.from_product([freq, range(500)])
df = DataFrame(list(range(2000)), index=idx, columns=["Test"])
df_slice = df.loc[pd.IndexSlice[:, 30:70], :]
result = df_slice.loc["a"]
expected = DataFrame(list(range(30, 71)), columns=["Test"], index=range(30, 71))
tm.assert_frame_equal(result, expected)
result = df_slice.loc["d"]
expected = DataFrame(
list(range(1530, 1571)), columns=["Test"], index=range(30, 71)
)
tm.assert_frame_equal(result, expected)
def test_int_series_slicing(self, multiindex_year_month_day_dataframe_random_data):
ymd = multiindex_year_month_day_dataframe_random_data
s = ymd["A"]
result = s[5:]
expected = s.reindex(s.index[5:])
tm.assert_series_equal(result, expected)
s = ymd["A"].copy()
exp = ymd["A"].copy()
s[5:] = 0
exp.iloc[5:] = 0
tm.assert_numpy_array_equal(s.values, exp.values)
result = ymd[5:]
expected = ymd.reindex(s.index[5:])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dtype, loc, iloc",
    [
        # dtype = int, step = -1
        ("int", slice(None, None, -1), slice(None, None, -1)),
        ("int", slice(3, None, -1), slice(3, None, -1)),
        ("int", slice(None, 1, -1), slice(None, 0, -1)),
        ("int", slice(3, 1, -1), slice(3, 0, -1)),
        # dtype = int, step = -2
        ("int", slice(None, None, -2), slice(None, None, -2)),
        ("int", slice(3, None, -2), slice(3, None, -2)),
        ("int", slice(None, 1, -2), slice(None, 0, -2)),
        ("int", slice(3, 1, -2), slice(3, 0, -2)),
        # dtype = str, step = -1
        ("str", slice(None, None, -1), slice(None, None, -1)),
        ("str", slice("d", None, -1), slice(3, None, -1)),
        ("str", slice(None, "b", -1), slice(None, 0, -1)),
        ("str", slice("d", "b", -1), slice(3, 0, -1)),
        # dtype = str, step = -2
        ("str", slice(None, None, -2), slice(None, None, -2)),
        ("str", slice("d", None, -2), slice(3, None, -2)),
        ("str", slice(None, "b", -2), slice(None, 0, -2)),
        ("str", slice("d", "b", -2), slice(3, 0, -2)),
    ],
)
def test_loc_slice_negative_stepsize(self, dtype, loc, iloc):
    """Negative-step label slices via ``.loc`` match the given ``.iloc`` slice.

    ``dtype`` picks string or integer labels; ``loc`` is the label slice under
    test and ``iloc`` the positional slice that produces the expected rows.
    """
    # GH#38071
    labels_by_dtype = {
        "str": list("abcde"),
        "int": range(5),
    }
    labels = labels_by_dtype[dtype]

    # Both levels carry the same labels.
    mi = MultiIndex.from_arrays([labels, labels])
    frame = DataFrame(1.0, index=mi, columns=["A"])

    SLC = pd.IndexSlice
    expected = frame.iloc[iloc, :]

    # The same slice given bare, on level 0 only, and on level 1 only.
    row_keys = (SLC[loc], SLC[loc, :], SLC[:, loc])
    results = [frame.loc[row_key, :] for row_key in row_keys]
    for result in results:
        tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,153 @@
import numpy as np
import pytest
from pandas import (
NA,
DataFrame,
MultiIndex,
Series,
array,
)
import pandas._testing as tm
class TestMultiIndexSorted:
    """Indexing and sorting checks on MultiIndex-ed frames, series and indexes."""

    def test_getitem_multilevel_index_tuple_not_sorted(self):
        """``.loc`` with a MultiIndex key works on an unsorted three-level index."""
        level_names = list("abc")
        frame = DataFrame(
            [[0, 1, 0, "x"], [0, 0, 1, "y"]], columns=level_names + ["data"]
        ).set_index(level_names)

        # Look up the first row using a one-element slice of the index itself.
        first_row_key = frame.index[:1]
        result = frame.loc[first_row_key, "data"]

        expected = Series(
            ["x"],
            index=MultiIndex.from_tuples([(0, 1, 0)], names=["a", "b", "c"]),
            name="data",
        )
        tm.assert_series_equal(result, expected)

    def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data):
        """``.iloc`` column slicing accepts a ``np.int32`` stop value."""
        frame = multiindex_dataframe_random_data
        transposed = frame.sort_index(level=1).T

        # buglet with int typechecking
        stop = np.int32(3)
        result = transposed.iloc[:, :stop]

        expected = transposed.reindex(columns=transposed.columns[:3])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("key", [None, lambda x: x])
    def test_frame_getitem_not_sorted2(self, key):
        """``sort_index`` with and without ``level=0`` agree (13431)."""
        # 13431
        flat = DataFrame(
            {
                "col1": ["b", "d", "b", "a"],
                "col2": [3, 1, 1, 2],
                "data": ["one", "two", "three", "four"],
            }
        )
        df2 = flat.set_index(["col1", "col2"])
        df2_original = df2.copy()

        # Re-encode level "col1" with differently ordered level values; the
        # asserts below confirm the index still equals the original one but is
        # not monotonic.
        df2.index = df2.index.set_levels(["b", "d", "a"], level="col1")
        df2.index = df2.index.set_codes([0, 1, 0, 2], level="col1")
        assert not df2.index.is_monotonic_increasing
        assert df2_original.index.equals(df2.index)

        expected = df2.sort_index(key=key)
        assert expected.index.is_monotonic_increasing

        result = df2.sort_index(level=0, key=key)
        assert result.index.is_monotonic_increasing
        tm.assert_frame_equal(result, expected)

    def test_sort_values_key(self):
        """``MultiIndex.sort_values`` applies a ``key`` callable."""
        inner = ["one", "two", "one", "two", "one", "two", "one", "two"]

        unsorted_outer = ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"]
        index = MultiIndex.from_tuples(zip(unsorted_outer, inner))
        index = index.sort_values(  # sort by third letter
            key=lambda x: x.map(lambda entry: entry[2])
        )
        result = DataFrame(range(8), index=index)

        sorted_outer = ["foo", "foo", "bar", "bar", "qux", "qux", "baz", "baz"]
        expected = DataFrame(
            range(8), index=MultiIndex.from_tuples(zip(sorted_outer, inner))
        )
        tm.assert_frame_equal(result, expected)

    def test_argsort_with_na(self):
        """``argsort`` places the entry whose first level is NA last (GH48495)."""
        # GH48495
        index = MultiIndex.from_arrays(
            [
                array([2, NA, 1], dtype="Int64"),
                array([1, 2, 3], dtype="Int64"),
            ]
        )
        positions = index.argsort()
        tm.assert_numpy_array_equal(positions, np.array([2, 0, 1], dtype=np.intp))

    def test_sort_values_with_na(self):
        """``sort_values`` places the entry whose first level is NA last (GH48495)."""
        # GH48495
        unsorted_index = MultiIndex.from_arrays(
            [
                array([2, NA, 1], dtype="Int64"),
                array([1, 2, 3], dtype="Int64"),
            ]
        )
        result = DataFrame(range(3), index=unsorted_index.sort_values())

        sorted_index = MultiIndex.from_arrays(
            [
                array([1, 2, NA], dtype="Int64"),
                array([3, 1, 2], dtype="Int64"),
            ]
        )
        expected = DataFrame(range(3), index=sorted_index)
        tm.assert_frame_equal(result, expected)

    def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data):
        """Selecting outer label "foo" via ``[]``, ``.loc`` and ``.xs`` drops it."""
        frame = multiindex_dataframe_random_data
        df = frame.T
        df["foo", "four"] = "foo"

        # Boolean mask over the columns: True where the outer label is "foo".
        outer_labels = np.array([label[0] for label in df.columns.values])
        is_foo = outer_labels == "foo"

        # column axis
        result = df["foo"]
        result2 = df.loc[:, "foo"]
        expected = df.reindex(columns=df.columns[is_foo])
        expected.columns = expected.columns.droplevel(0)
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        # same check on the row axis after transposing
        df = df.T
        result = df.xs("foo")
        result2 = df.loc["foo"]
        expected = df.reindex(df.index[is_foo])
        expected.index = expected.index.droplevel(0)
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        """Selecting outer label "qux" from a Series drops that level."""
        outer = ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"]
        inner = ["one", "two", "one", "two", "one", "two", "one", "two"]
        index = MultiIndex.from_tuples(zip(outer, inner))
        ser = Series(np.random.default_rng(2).standard_normal(8), index=index)

        # Boolean mask over the rows: True where the outer label is "qux".
        outer_labels = np.array([label[0] for label in index.values])

        result = ser["qux"]
        result2 = ser.loc["qux"]

        expected = ser[outer_labels == "qux"]
        expected.index = expected.index.droplevel(0)
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result2, expected)