Skip to content

Commit 8bb54ce

Browse files
committed
WIP: ENH: CoW support for replace
1 parent c5b5061 commit 8bb54ce

File tree

4 files changed

+64
-4
lines changed

4 files changed

+64
-4
lines changed

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5522,7 +5522,7 @@ def _replace_columnwise(
55225522
DataFrame or None
55235523
"""
55245524
# Operate column-wise
5525-
res = self if inplace else self.copy()
5525+
res = self if inplace else self.copy(deep=None)
55265526
ax = self.columns
55275527

55285528
for i, ax_value in enumerate(ax):

pandas/core/generic.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7045,6 +7045,7 @@ def replace(
70457045
to_replace = [to_replace]
70467046

70477047
if isinstance(to_replace, (tuple, list)):
7048+
# TODO: Consider copy-on-write for non-replaced columns's here
70487049
if isinstance(self, ABCDataFrame):
70497050
from pandas import Series
70507051

@@ -7104,7 +7105,8 @@ def replace(
71047105
if not self.size:
71057106
if inplace:
71067107
return None
7107-
return self.copy()
7108+
# TODO: Is it even worth doing the CoW here?
7109+
return self.copy(deep=None)
71087110

71097111
if is_dict_like(to_replace):
71107112
if is_dict_like(value): # {'A' : NA} -> {'A' : 0}

pandas/core/internals/managers.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,8 +1211,12 @@ def value_getitem(placement):
12111211
return value[placement.indexer]
12121212

12131213
# Accessing public blknos ensures the public versions are initialized
1214+
# print(f"Loc is {loc}")
12141215
blknos = self.blknos[loc]
1216+
# print(f"Blknos is {blknos}")
12151217
blklocs = self.blklocs[loc].copy()
1218+
# print(f"Blklocs is {blklocs}")
1219+
# print(f"Blocks is {self.blocks}")
12161220

12171221
unfit_mgr_locs = []
12181222
unfit_val_locs = []
@@ -1223,8 +1227,32 @@ def value_getitem(placement):
12231227
if inplace and blk.should_store(value):
12241228
# Updating inplace -> check if we need to do Copy-on-Write
12251229
if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
1226-
blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
1227-
self._clear_reference_block(blkno_l)
1230+
1231+
# print("Hit here")
1232+
# print(f"Blk locs is {blk_locs}")
1233+
# print(f"Blkno_l is {blkno_l}")
1234+
prev_refs = self.refs[blkno_l]
1235+
leftover_blocks = blk.delete(blk_locs)
1236+
num_extra_blocks = len(leftover_blocks)
1237+
# Preserve refs for unchanged blocks
1238+
extra_refs = [prev_refs] * num_extra_blocks
1239+
self.refs = self.refs[:blkno_l] + extra_refs + self.refs[blkno_l:]
1240+
1241+
# TODO: Are we sure we want 2D?
1242+
nb = new_block_2d(value, placement=blk._mgr_locs)
1243+
old_blocks = self.blocks
1244+
new_blocks = (
1245+
old_blocks[:blkno_l]
1246+
+ tuple(leftover_blocks[:blkno_l])
1247+
+ (nb,)
1248+
+ tuple(leftover_blocks[blkno_l:])
1249+
+ old_blocks[blkno_l + num_extra_blocks :]
1250+
)
1251+
self.blocks = new_blocks
1252+
1253+
self._rebuild_blknos_and_blklocs()
1254+
# blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
1255+
# self._clear_reference_block(blkno_l)
12281256
else:
12291257
blk.set_inplace(blk_locs, value_getitem(val_locs))
12301258
else:

pandas/tests/copy_view/test_methods.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -700,3 +700,33 @@ def test_squeeze(using_copy_on_write):
700700
# Without CoW the original will be modified
701701
assert np.shares_memory(series.values, get_array(df, "a"))
702702
assert df.loc[0, "a"] == 0
703+
704+
705+
@pytest.mark.parametrize(
706+
"to_replace",
707+
[
708+
{"a": 1, "b": 4},
709+
# Test CoW splits blocks to avoid copying unchanged columns
710+
{"a": 1},
711+
# TODO: broken, fix by fixing Block.replace()
712+
# 1
713+
],
714+
)
715+
def test_replace(using_copy_on_write, to_replace):
716+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
717+
df_orig = df.copy()
718+
719+
df_replaced = df.replace(to_replace=to_replace, value=-1)
720+
721+
# Should share memory regardless of CoW since squeeze is just an iloc
722+
if using_copy_on_write:
723+
if (df_replaced["b"] == df["b"]).all():
724+
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
725+
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
726+
727+
# mutating squeezed df triggers a copy-on-write for that column/block
728+
df_replaced.loc[0, "c"] = -1
729+
if using_copy_on_write:
730+
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "a"))
731+
732+
tm.assert_frame_equal(df, df_orig)

0 commit comments

Comments
 (0)