Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions .github/workflows/tests-win.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
name: CI
on:
push:
paths:
- '.github/**'
- '**/*.pyx'
- '**/*.cpp'
on: push
jobs:
windows-ci:
name: "Win - tests - Python ${{ matrix.PYTHON_VERSION }}"
Expand Down
32 changes: 28 additions & 4 deletions src/quantcore/matrix/categorical_matrix.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional, Tuple, Union
from typing import Any, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
Expand All @@ -7,6 +7,7 @@
from .ext.categorical import matvec, sandwich_categorical, transpose_matvec
from .ext.split import sandwich_cat_cat, sandwich_cat_dense
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
from .util import (
check_matvec_out_shape,
check_transpose_matvec_out_shape,
Expand All @@ -15,6 +16,23 @@
)


def _is_indexer_full_length(full_length: int, indexer: Any):
if isinstance(indexer, int):
return full_length == 1
elif isinstance(indexer, list):
if (np.asarray(indexer) > full_length - 1).any():
raise IndexError("Index out-of-range.")
return len(set(indexer)) == full_length
elif isinstance(indexer, np.ndarray):
if (indexer > full_length - 1).any():
raise IndexError("Index out-of-range.")
return len(np.unique(indexer)) == full_length
elif isinstance(indexer, slice):
return len(range(*indexer.indices(full_length))) == full_length
else:
raise ValueError(f"Indexing with {type(indexer)} is not allowed.")


def _none_to_slice(arr: Optional[np.ndarray], n: int) -> Union[slice, np.ndarray]:
if arr is None or len(arr) == n:
return slice(None, None, None)
Expand Down Expand Up @@ -62,7 +80,7 @@ def _matvec_setup(
other: Union[List, np.ndarray],
cols: np.ndarray = None,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
other = np.asarray(other)
other = np.squeeze(np.asarray(other))
if other.ndim > 1:
raise NotImplementedError(
"""CategoricalMatrix.matvec is only implemented for 1d arrays."""
Expand Down Expand Up @@ -262,8 +280,14 @@ def get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray
def __getitem__(self, item):
if isinstance(item, tuple):
row, col = item
if not (isinstance(col, slice) and col == slice(None, None, None)):
raise IndexError("Only column indexing is supported.")
if _is_indexer_full_length(self.shape[1], col):
if isinstance(row, int):
row = [row]
return CategoricalMatrix(self.cat[row])
else:
# return a SparseMatrix if we subset columns
# TODO: this is inefficient. See issue #101.
return SparseMatrix(self.tocsr()[row, col], dtype=self.dtype)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is quite inefficient because we construct the full sparse matrix with self.tocsr() before subsetting it. I'm fine leaving it like this for now, but I think it'd be good to at least leave a TODO comment or add an issue mentioning this performance bug. Fixing this for the single element case is quite easy. For the [:, cols] case, I guess we need to construct a sparse matrix element by element and I'm guessing that it'll be easiest to do that in Cython.

else:
row = item
if isinstance(row, int):
Expand Down
18 changes: 18 additions & 0 deletions src/quantcore/matrix/dense_matrix.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections.abc import Sequence
from typing import List, Optional, Union

import numpy as np
Expand Down Expand Up @@ -36,6 +37,13 @@ def __new__(cls, input_array): # noqa
obj = np.asarray(input_array).view(cls)
if not np.issubdtype(obj.dtype, np.floating):
raise NotImplementedError("DenseMatrix is only implemented for float data")
if obj.ndim == 1:
# when input is one-dimensional, convert to a N x 1 matrix
obj = obj[:, np.newaxis]
elif obj.ndim > 2:
raise ValueError(
"DenseMatrix only accepts inputs with at most two dimensions"
)
return obj

def __array_finalize__(self, obj):
Expand Down Expand Up @@ -151,3 +159,13 @@ def matvec(
"""Perform self[:, cols] @ other."""
check_matvec_out_shape(self, out)
return self._matvec_helper(vec, None, cols, out, False)

def __getitem__(self, item):
    """Index into the matrix.

    When both the row and column indexers are sequences/arrays, apply outer
    (``np.ix_``) indexing so that ``X[rows, cols]`` selects the sub-matrix
    rather than paired elements. All other indexing is delegated unchanged.
    """
    if not (isinstance(item, tuple) and len(item) == 2):
        return super().__getitem__(item)

    row, col = item
    array_like = (Sequence, np.ndarray)
    if isinstance(row, array_like) and isinstance(col, array_like):
        return super().__getitem__(np.ix_(row, col))
    return super().__getitem__(item)
11 changes: 11 additions & 0 deletions src/quantcore/matrix/sparse_matrix.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections.abc import Sequence
from typing import List, Optional, Union

import numpy as np
Expand Down Expand Up @@ -182,3 +183,13 @@ def get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray
def astype(self, dtype, order="K", casting="unsafe", copy=True):
"""Return SparseMatrix cast to new type."""
return super(SparseMatrix, self).astype(dtype, casting, copy)

def __getitem__(self, item):
    """Index into the matrix.

    ``X[rows, cols]`` with both indexers given as sequences/arrays is
    treated as outer indexing via ``np.ix_`` (select the sub-matrix).
    Everything else falls through to the parent implementation.
    """
    if isinstance(item, tuple) and len(item) == 2:
        row, col = item
        both_array_like = all(
            isinstance(ix, (Sequence, np.ndarray)) for ix in (row, col)
        )
        if both_array_like:
            return super().__getitem__(np.ix_(row, col))
    return super().__getitem__(item)
118 changes: 96 additions & 22 deletions src/quantcore/matrix/split_matrix.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import warnings
from typing import List, Optional, Tuple, Union
from typing import Any, List, Optional, Tuple, Union

import numpy as np
from scipy import sparse as sps
Expand All @@ -9,13 +9,32 @@
from .ext.split import is_sorted, split_col_subsets
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
from .standardized_mat import StandardizedMatrix
from .util import (
check_matvec_out_shape,
check_transpose_matvec_out_shape,
set_up_rows_or_cols,
)


def as_mx(a: Any):
    """Convert an array to a corresponding MatrixBase type.

    If the input is already a MatrixBase or StandardizedMatrix, return it
    untouched. If the input is a scipy sparse matrix, return a SparseMatrix.
    If the input is a numpy array, return a DenseMatrix. Raise an error if
    the input is of any other type.
    """
    if isinstance(a, (MatrixBase, StandardizedMatrix)):
        return a
    elif sps.issparse(a):
        return SparseMatrix(a)
    elif isinstance(a, np.ndarray):
        return DenseMatrix(a)
    else:
        raise ValueError(f"Cannot convert type {type(a)} to Matrix.")


def split_sparse_and_dense_parts(
arg1: sps.csc_matrix, threshold: float = 0.1
) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]:
Expand Down Expand Up @@ -126,38 +145,45 @@ def __init__(
matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]],
indices: Optional[List[np.ndarray]] = None,
):
flatten_matrices = []
# First check that all matrices are valid types
for _, mat in enumerate(matrices):
for mat in matrices:
if not isinstance(mat, MatrixBase):
raise ValueError(
"Expected all elements of matrices to be subclasses of MatrixBase."
)
if isinstance(mat, SplitMatrix):
raise ValueError("Elements of matrices cannot be SplitMatrix.")
# Flatten out the SplitMatrix
for imat in mat.matrices:
flatten_matrices.append(imat)
else:
flatten_matrices.append(mat)

# Now that we know these are all MatrixBase, we can check consistent
# shapes and dtypes.
self.dtype = matrices[0].dtype
n_row = matrices[0].shape[0]
for i, mat in enumerate(matrices):
self.dtype = flatten_matrices[0].dtype
n_row = flatten_matrices[0].shape[0]
for i, mat in enumerate(flatten_matrices):
if mat.dtype != self.dtype:
warnings.warn(
"Matrices do not all have the same dtype. Dtypes are "
f"{[elt.dtype for elt in matrices]}."
f"{[elt.dtype for elt in flatten_matrices]}."
)
if not mat.shape[0] == n_row:
raise ValueError(
"All matrices should have the same first dimension, "
f"but the first matrix has first dimension {n_row} and matrix {i} has "
f"first dimension {mat.shape[0]}."
)
if len(mat.shape) != 2:
raise ValueError("All matrices should be two dimensional.")
if mat.ndim == 1:
flatten_matrices[i] = mat[:, np.newaxis]
elif mat.ndim > 2:
raise ValueError("All matrices should be at most two dimensional.")

if indices is None:
indices = []
current_idx = 0
for mat in matrices:
for mat in flatten_matrices:
indices.append(
np.arange(current_idx, current_idx + mat.shape[1], dtype=np.int64)
)
Expand All @@ -183,14 +209,14 @@ def __init__(

assert isinstance(indices, list)

for i, (mat, idx) in enumerate(zip(matrices, indices)):
for i, (mat, idx) in enumerate(zip(flatten_matrices, indices)):
if not mat.shape[1] == len(idx):
raise ValueError(
f"Element {i} of indices should should have length {mat.shape[1]}, "
f"but it has shape {idx.shape}"
)

filtered_mats, filtered_idxs = _filter_out_empty(matrices, indices)
filtered_mats, filtered_idxs = _filter_out_empty(flatten_matrices, indices)
combined_matrices, combined_indices = _combine_matrices(
filtered_mats, filtered_idxs
)
Expand All @@ -201,6 +227,31 @@ def __init__(

assert self.shape[1] > 0

self.column_map = self._create_col_mapping()

def _create_col_mapping(self):
Comment thread
MarcAntoineSchmidtQC marked this conversation as resolved.
"""Map from the global index in a SplitMatrix to the local position.

For example, if the column in position 4 of a SplitMatrix is located at
position 1 of the second component of the SplitMatrix, the resulting
array will have [2, 1] located at position 4 of the mapping.
"""
colmap = np.empty(shape=(self.shape[1], 2), dtype=int)
for i, comp_idx in enumerate(self.indices):
for j, idx in enumerate(comp_idx):
colmap[idx] = [i, j]
return colmap

def _split_col_subsets_unordered(self, cols):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function seems occasionally to return empty lists when one indexes columns with a list, which causes is_sorted to throw an error later.

MRE:

import pandas as pd
import quantcore.matrix as mx

df = pd.DataFrame({"u": ["a", "b"], "v": ["a", "b"]})
X = mx.from_pandas(df, cat_threshold=False, object_as_cat=True)
X[:, [0, 1]]

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't replicate this on macOS. I assume you tried this on Windows. Is that correct? Can you give me some info on your setup?

Also, you said "occasionally". Does it always throw an error when you try your MRE or not?

Copy link
Copy Markdown
Member

@lbittarello Luca Bittarello (lbittarello) Jul 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it always throw an error when you try your MRE or not?

Yes. But it doesn't throw an error if we reduce the number of rows from two to one, for example.

I assume you tried this on Windows. Is that correct?

Yes. I'm not embarrassed.

Can you give me some info on your setup?

python           : 3.8.3.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19041
machine          : AMD64
processor        : AMD64 Family 23 Model 96 Stepping 1, AuthenticAMD
byteorder        : little
pandas           : 1.3.0
numpy            : 1.20.3

Is this helpful? Do you need something else?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we perhaps add unit tests for indexing? The Windows CI could come in handy.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, unit testing will be very valuable. I'll temporarily add the Windows CI to this PR for all the pushes. And yes, this is helpful.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 on indexing unit tests.

n_comp = len(self.indices)
new_idx = [[] for _ in range(n_comp)]
current_col_loc = [[] for _ in range(n_comp)]
for counter, i in enumerate(cols):
comp, idx = self.column_map[i]
new_idx[comp].append(counter)
current_col_loc[comp].append(idx)
return new_idx, current_col_loc

def _split_col_subsets(
self, cols: Optional[np.ndarray]
) -> Tuple[List[np.ndarray], List[Optional[np.ndarray]], int]:
Expand Down Expand Up @@ -250,11 +301,8 @@ def getcol(self, i: int) -> Union[np.ndarray, sps.csr_matrix]:
"""Return matrix column at specified index."""
# wrap-around indexing
i %= self.shape[1]
for mat, idx in zip(self.matrices, self.indices):
if i in idx:
loc = np.where(idx == i)[0][0]
return mat.getcol(loc)
raise RuntimeError(f"Column {i} was not found.")
comp, idx = self.column_map[i]
return self.matrices[comp].getcol(idx)

def sandwich(
self,
Expand Down Expand Up @@ -313,7 +361,7 @@ def matvec(
assert not isinstance(v, sps.spmatrix)
check_matvec_out_shape(self, out)

v = np.asarray(v)
v = np.squeeze(np.asarray(v))
if v.shape[0] != self.shape[1]:
raise ValueError(f"shapes {self.shape} and {v.shape} not aligned")

Expand Down Expand Up @@ -371,20 +419,46 @@ def __getitem__(self, key):
row = key
col = slice(None, None, None) # all columns

if col == slice(None, None, None):
if np.any(col == slice(None, None, None)):
if isinstance(row, int):
row = [row]

return SplitMatrix([mat[row, :] for mat in self.matrices], self.indices)
else:
raise NotImplementedError(
f"Only row indexing is supported. Index passed was {key}."
if isinstance(col, int):
return self.getcol(col)[row]
elif isinstance(col, list):
col_array = np.asarray(col)
elif isinstance(col, np.ndarray):
col_array = col
elif isinstance(col, slice):
col_array = np.arange(*col.indices(self.shape[1]))
else:
raise ValueError(f"Indexing with {type(col)} is not allowed.")
subset_cols_indices, subset_cols = self._split_col_subsets_unordered(
col_array
)
new_mat = []
new_ind = []
for (
nind,
oind,
mat,
) in zip(subset_cols_indices, subset_cols, self.matrices):
if len(oind) == 0:
continue
new_mat.append(as_mx(mat[row, oind].reshape(-1, len(nind))))
new_ind.append(nind)
filtered_mat, filtered_ind = _filter_out_empty(new_mat, new_ind)
return SplitMatrix(matrices=filtered_mat, indices=filtered_ind)

def __repr__(self):
out = "SplitMatrix:"
for i, mat in enumerate(self.matrices):
out += f"\n\nComponent {i} with type {mat.__class__.__name__}\n" + str(mat)
out += (
f"\n\nComponent {i} with type {mat.__class__.__name__}\n"
+ mat.__repr__()
)
return out

__array_priority__ = 13
13 changes: 11 additions & 2 deletions src/quantcore/matrix/standardized_mat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import numpy as np
from scipy import sparse as sps

from . import MatrixBase, SparseMatrix
from .util import set_up_rows_or_cols, setup_restrictions
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
from .util import _get_expected_shape, set_up_rows_or_cols, setup_restrictions


class StandardizedMatrix:
Expand Down Expand Up @@ -248,11 +249,19 @@ def __getitem__(self, item):
col = slice(None, None, None)

mat_part = self.mat.__getitem__(item)
if hasattr(mat_part, "reshape"):
expected_shape = _get_expected_shape(self.shape, item)
mat_part = mat_part.reshape(expected_shape)
shift_part = self.shift[col]
mult_part = self.mult
if mult_part is not None:
mult_part = np.atleast_1d(mult_part[col])

if not isinstance(mat_part, MatrixBase):
if mult_part is not None:
mat_part = mat_part * mult_part
return mat_part + shift_part

if isinstance(row, int):
out = mat_part.A
if mult_part is not None:
Expand Down
Loading