#116 introduced a bug that I don't fully understand yet: matvec() on a split matrix doesn't work correctly.
This is fine (n_rows=5):
from tabmat import from_pandas
import numpy as np
import pandas as pd
n_rows = 5
n_cols = 2
np.random.seed(1234)
categories = [f"cat_{val}" for val in range(5)]
X = pd.DataFrame(np.random.choice(categories, size=(n_rows, n_cols))).astype(
"category"
)
X.columns = [str(col) for col in X.columns]
from_pandas(X).matvec(np.array(7 * [1]))
# array([2., 2., 2., 2., 2.])
This isn't (n_rows=10):
from tabmat import from_pandas
import numpy as np
import pandas as pd
n_rows = 10
n_cols = 2
np.random.seed(1234)
categories = [f"cat_{val}" for val in range(5)]
X = pd.DataFrame(np.random.choice(categories, size=(n_rows, n_cols))).astype(
"category"
)
X.columns = [str(col) for col in X.columns]
from_pandas(X).matvec(np.array(9 * [1]))
# array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
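This causes unpleasant issues in glum: fitting on the categorical DataFrame (column 0 below) and on its `pd.get_dummies` encoding (column 1 below) yields different coefficients: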
from glum import GeneralizedLinearRegressor
import numpy as np
import pandas as pd
n_rows = 10
n_cols = 2
categories = [f"cat_{val}" for val in range(5)]
np.random.seed(1234) # change the seed to 12345 and it will work
X = pd.DataFrame(np.random.choice(categories, size=(n_rows, n_cols))).astype(
"category"
)
X.columns = [str(col) for col in X.columns]
y = np.random.random(size=n_rows)
pd.concat([
pd.Series(GeneralizedLinearRegressor(alpha=10).fit(X, y).coef_),
pd.Series(GeneralizedLinearRegressor(alpha=10).fit(pd.get_dummies(X), y).coef_)
], axis=1)
# Out[1]:
# 0 1
# 0 -0.000985 -0.003941
# 1 -0.000929 -0.003715
# 2 0.001280 0.005120
# 3 0.000653 0.002611
# 4 -0.000019 -0.000075
# 5 0.000189 0.000754
# 6 -0.000549 -0.002195
# 7 0.000547 0.002189
# 8 -0.000187 -0.000748
When I revert #116, the issue disappears.
#116 introduced a bug that I don't fully understand yet:
`matvec()` on a split matrix doesn't work correctly. This is fine (
`n_rows=5`): This isn't (
`n_rows=10`): This causes unpleasant issues in
`glum`: When I revert #116, the issue disappears.