diff --git a/.github/workflows/tests-win.yml b/.github/workflows/tests-win.yml index fe03754a..0e06e5e6 100644 --- a/.github/workflows/tests-win.yml +++ b/.github/workflows/tests-win.yml @@ -1,10 +1,5 @@ name: CI -on: - push: - paths: - - '.github/**' - - '**/*.pyx' - - '**/*.cpp' +on: push jobs: windows-ci: name: "Win - tests - Python ${{ matrix.PYTHON_VERSION }}" diff --git a/src/quantcore/matrix/categorical_matrix.py b/src/quantcore/matrix/categorical_matrix.py index 05564a6f..9b099e77 100644 --- a/src/quantcore/matrix/categorical_matrix.py +++ b/src/quantcore/matrix/categorical_matrix.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -7,6 +7,7 @@ from .ext.categorical import matvec, sandwich_categorical, transpose_matvec from .ext.split import sandwich_cat_cat, sandwich_cat_dense from .matrix_base import MatrixBase +from .sparse_matrix import SparseMatrix from .util import ( check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -15,6 +16,23 @@ ) +def _is_indexer_full_length(full_length: int, indexer: Any): + if isinstance(indexer, int): + return full_length == 1 + elif isinstance(indexer, list): + if (np.asarray(indexer) > full_length - 1).any(): + raise IndexError("Index out-of-range.") + return len(set(indexer)) == full_length + elif isinstance(indexer, np.ndarray): + if (indexer > full_length - 1).any(): + raise IndexError("Index out-of-range.") + return len(np.unique(indexer)) == full_length + elif isinstance(indexer, slice): + return len(range(*indexer.indices(full_length))) == full_length + else: + raise ValueError(f"Indexing with {type(indexer)} is not allowed.") + + def _none_to_slice(arr: Optional[np.ndarray], n: int) -> Union[slice, np.ndarray]: if arr is None or len(arr) == n: return slice(None, None, None) @@ -62,7 +80,7 @@ def _matvec_setup( other: Union[List, np.ndarray], cols: np.ndarray = None, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: - other = np.asarray(other) + other = np.squeeze(np.asarray(other)) if other.ndim > 1: raise NotImplementedError( """CategoricalMatrix.matvec is only implemented for 1d arrays.""" @@ -262,8 +280,14 @@ def get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray def __getitem__(self, item): if isinstance(item, tuple): row, col = item - if not (isinstance(col, slice) and col == slice(None, None, None)): - raise IndexError("Only column indexing is supported.") + if _is_indexer_full_length(self.shape[1], col): + if isinstance(row, int): + row = [row] + return CategoricalMatrix(self.cat[row]) + else: + # return a SparseMatrix if we subset columns + # TODO: this is inefficient. See issue #101. + return SparseMatrix(self.tocsr()[row, col], dtype=self.dtype) else: row = item if isinstance(row, int): diff --git a/src/quantcore/matrix/dense_matrix.py b/src/quantcore/matrix/dense_matrix.py index 61e812ce..b753bd8a 100644 --- a/src/quantcore/matrix/dense_matrix.py +++ b/src/quantcore/matrix/dense_matrix.py @@ -1,3 +1,4 @@ +from collections.abc import Sequence from typing import List, Optional, Union import numpy as np @@ -36,6 +37,13 @@ def __new__(cls, input_array): # noqa obj = np.asarray(input_array).view(cls) if not np.issubdtype(obj.dtype, np.floating): raise NotImplementedError("DenseMatrix is only implemented for float data") + if obj.ndim == 1: + # when input is one-dimensional, convert to a N x 1 matrix + obj = obj[:, np.newaxis] + elif obj.ndim > 2: + raise ValueError( + "DenseMatrix only accepts inputs with at most two dimensions" + ) return obj def __array_finalize__(self, obj): @@ -151,3 +159,13 @@ def matvec( """Perform self[:, cols] @ other.""" check_matvec_out_shape(self, out) return self._matvec_helper(vec, None, cols, out, False) + + def __getitem__(self, item): + if isinstance(item, tuple) and len(item) == 2: + row, col = item + if isinstance(row, (Sequence, np.ndarray)) and isinstance( + col, (Sequence, np.ndarray) + ): + return super().__getitem__(np.ix_(row, col)) + + return super().__getitem__(item) diff --git a/src/quantcore/matrix/sparse_matrix.py b/src/quantcore/matrix/sparse_matrix.py index 261ce5bf..885f2a14 100644 --- a/src/quantcore/matrix/sparse_matrix.py +++ b/src/quantcore/matrix/sparse_matrix.py @@ -1,3 +1,4 @@ +from collections.abc import Sequence from typing import List, Optional, Union import numpy as np @@ -182,3 +183,13 @@ def get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return SparseMatrix cast to new type.""" return super(SparseMatrix, self).astype(dtype, casting, copy) + + def __getitem__(self, item): + if isinstance(item, tuple) and len(item) == 2: + row, col = item + if isinstance(row, (Sequence, np.ndarray)) and isinstance( + col, (Sequence, np.ndarray) + ): + return super().__getitem__(np.ix_(row, col)) + + return super().__getitem__(item) diff --git a/src/quantcore/matrix/split_matrix.py b/src/quantcore/matrix/split_matrix.py index 22554fa4..e5cf9763 100644 --- a/src/quantcore/matrix/split_matrix.py +++ b/src/quantcore/matrix/split_matrix.py @@ -1,5 +1,5 @@ import warnings -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import numpy as np from scipy import sparse as sps @@ -9,6 +9,7 @@ from .ext.split import is_sorted, split_col_subsets from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix +from .standardized_mat import StandardizedMatrix from .util import ( check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -16,6 +17,24 @@ ) +def as_mx(a: Any): + """Convert an array to a corresponding MatrixBase type. + + If the input is already a MatrixBase, return untouched. + If the input is sparse, return a SparseMatrix. + If the input is a numpy array, return a DenseMatrix. + Raise an error is input is another type. + """ + if isinstance(a, (MatrixBase, StandardizedMatrix)): + return a + elif sps.issparse(a): + return SparseMatrix(a) + elif isinstance(a, np.ndarray): + return DenseMatrix(a) + else: + raise ValueError(f"Cannot convert type {type(a)} to Matrix.") + + def split_sparse_and_dense_parts( arg1: sps.csc_matrix, threshold: float = 0.1 ) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: @@ -126,24 +145,29 @@ def __init__( matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]], indices: Optional[List[np.ndarray]] = None, ): + flatten_matrices = [] # First check that all matrices are valid types - for _, mat in enumerate(matrices): + for mat in matrices: if not isinstance(mat, MatrixBase): raise ValueError( "Expected all elements of matrices to be subclasses of MatrixBase." ) if isinstance(mat, SplitMatrix): - raise ValueError("Elements of matrices cannot be SplitMatrix.") + # Flatten out the SplitMatrix + for imat in mat.matrices: + flatten_matrices.append(imat) + else: + flatten_matrices.append(mat) # Now that we know these are all MatrixBase, we can check consistent # shapes and dtypes. - self.dtype = matrices[0].dtype - n_row = matrices[0].shape[0] - for i, mat in enumerate(matrices): + self.dtype = flatten_matrices[0].dtype + n_row = flatten_matrices[0].shape[0] + for i, mat in enumerate(flatten_matrices): if mat.dtype != self.dtype: warnings.warn( "Matrices do not all have the same dtype. Dtypes are " - f"{[elt.dtype for elt in matrices]}." + f"{[elt.dtype for elt in flatten_matrices]}." ) if not mat.shape[0] == n_row: raise ValueError( @@ -151,13 +175,15 @@ def __init__( f"but the first matrix has first dimension {n_row} and matrix {i} has " f"first dimension {mat.shape[0]}." ) - if len(mat.shape) != 2: - raise ValueError("All matrices should be two dimensional.") + if mat.ndim == 1: + flatten_matrices[i] = mat[:, np.newaxis] + elif mat.ndim > 2: + raise ValueError("All matrices should be at most two dimensional.") if indices is None: indices = [] current_idx = 0 - for mat in matrices: + for mat in flatten_matrices: indices.append( np.arange(current_idx, current_idx + mat.shape[1], dtype=np.int64) ) @@ -183,14 +209,14 @@ def __init__( assert isinstance(indices, list) - for i, (mat, idx) in enumerate(zip(matrices, indices)): + for i, (mat, idx) in enumerate(zip(flatten_matrices, indices)): if not mat.shape[1] == len(idx): raise ValueError( f"Element {i} of indices should should have length {mat.shape[1]}, " f"but it has shape {idx.shape}" ) - filtered_mats, filtered_idxs = _filter_out_empty(matrices, indices) + filtered_mats, filtered_idxs = _filter_out_empty(flatten_matrices, indices) combined_matrices, combined_indices = _combine_matrices( filtered_mats, filtered_idxs ) @@ -201,6 +227,31 @@ def __init__( assert self.shape[1] > 0 + self.column_map = self._create_col_mapping() + + def _create_col_mapping(self): + """Map from the global index in a SplitMatrix to the local position. + + For example, if the column in position 4 of a SplitMatrix is located at + position 1 of the second component of the SplitMatrix, the resulting + array will have [2, 1] located at position 4 of the mapping. + """ + colmap = np.empty(shape=(self.shape[1], 2), dtype=int) + for i, comp_idx in enumerate(self.indices): + for j, idx in enumerate(comp_idx): + colmap[idx] = [i, j] + return colmap + + def _split_col_subsets_unordered(self, cols): + n_comp = len(self.indices) + new_idx = [[] for _ in range(n_comp)] + current_col_loc = [[] for _ in range(n_comp)] + for counter, i in enumerate(cols): + comp, idx = self.column_map[i] + new_idx[comp].append(counter) + current_col_loc[comp].append(idx) + return new_idx, current_col_loc + def _split_col_subsets( self, cols: Optional[np.ndarray] ) -> Tuple[List[np.ndarray], List[Optional[np.ndarray]], int]: @@ -250,11 +301,8 @@ def getcol(self, i: int) -> Union[np.ndarray, sps.csr_matrix]: """Return matrix column at specified index.""" # wrap-around indexing i %= self.shape[1] - for mat, idx in zip(self.matrices, self.indices): - if i in idx: - loc = np.where(idx == i)[0][0] - return mat.getcol(loc) - raise RuntimeError(f"Column {i} was not found.") + comp, idx = self.column_map[i] + return self.matrices[comp].getcol(idx) def sandwich( self, @@ -313,7 +361,7 @@ def matvec( assert not isinstance(v, sps.spmatrix) check_matvec_out_shape(self, out) - v = np.asarray(v) + v = np.squeeze(np.asarray(v)) if v.shape[0] != self.shape[1]: raise ValueError(f"shapes {self.shape} and {v.shape} not aligned") @@ -371,20 +419,46 @@ def __getitem__(self, key): row = key col = slice(None, None, None) # all columns - if col == slice(None, None, None): + if np.any(col == slice(None, None, None)): if isinstance(row, int): row = [row] return SplitMatrix([mat[row, :] for mat in self.matrices], self.indices) else: - raise NotImplementedError( - f"Only row indexing is supported. Index passed was {key}." + if isinstance(col, int): + return self.getcol(col)[row] + elif isinstance(col, list): + col_array = np.asarray(col) + elif isinstance(col, np.ndarray): + col_array = col + elif isinstance(col, slice): + col_array = np.arange(*col.indices(self.shape[1])) + else: + raise ValueError(f"Indexing with {type(col)} is not allowed.") + subset_cols_indices, subset_cols = self._split_col_subsets_unordered( + col_array ) + new_mat = [] + new_ind = [] + for ( + nind, + oind, + mat, + ) in zip(subset_cols_indices, subset_cols, self.matrices): + if len(oind) == 0: + continue + new_mat.append(as_mx(mat[row, oind].reshape(-1, len(nind)))) + new_ind.append(nind) + filtered_mat, filtered_ind = _filter_out_empty(new_mat, new_ind) + return SplitMatrix(matrices=filtered_mat, indices=filtered_ind) def __repr__(self): out = "SplitMatrix:" for i, mat in enumerate(self.matrices): - out += f"\n\nComponent {i} with type {mat.__class__.__name__}\n" + str(mat) + out += ( + f"\n\nComponent {i} with type {mat.__class__.__name__}\n" + + mat.__repr__() + ) return out __array_priority__ = 13 diff --git a/src/quantcore/matrix/standardized_mat.py b/src/quantcore/matrix/standardized_mat.py index cbc6cbf8..7706a7ac 100644 --- a/src/quantcore/matrix/standardized_mat.py +++ b/src/quantcore/matrix/standardized_mat.py @@ -3,8 +3,9 @@ import numpy as np from scipy import sparse as sps -from . import MatrixBase, SparseMatrix -from .util import set_up_rows_or_cols, setup_restrictions +from .matrix_base import MatrixBase +from .sparse_matrix import SparseMatrix +from .util import _get_expected_shape, set_up_rows_or_cols, setup_restrictions class StandardizedMatrix: @@ -248,11 +249,19 @@ def __getitem__(self, item): col = slice(None, None, None) mat_part = self.mat.__getitem__(item) + if hasattr(mat_part, "reshape"): + expected_shape = _get_expected_shape(self.shape, item) + mat_part = mat_part.reshape(expected_shape) shift_part = self.shift[col] mult_part = self.mult if mult_part is not None: mult_part = np.atleast_1d(mult_part[col]) + if not isinstance(mat_part, MatrixBase): + if mult_part is not None: + mat_part = mat_part * mult_part + return mat_part + shift_part + if isinstance(row, int): out = mat_part.A if mult_part is not None: diff --git a/src/quantcore/matrix/util.py b/src/quantcore/matrix/util.py index 62945ba1..39bb1d1b 100644 --- a/src/quantcore/matrix/util.py +++ b/src/quantcore/matrix/util.py @@ -40,3 +40,27 @@ def check_transpose_matvec_out_shape(mat, out: Optional[np.ndarray]) -> None: def check_matvec_out_shape(mat, out: Optional[np.ndarray]) -> None: """Assert that the first dimension of the matvec output is correct.""" _check_out_shape(out, mat.shape[0]) + + +def _get_expected_axis_length(orig_length, indexer): + if isinstance(indexer, int): + return 1 + if isinstance(indexer, list): + return len(indexer) + elif isinstance(indexer, np.ndarray): + assert indexer.ndim < 2 + return len(indexer) + elif isinstance(indexer, slice): + return len(range(*indexer.indices(orig_length))) + else: + raise ValueError(f"Indexing with {type(indexer)} is not allowed.") + + +def _get_expected_shape(orig_shape, indexer): + if isinstance(indexer, tuple): + row, col = indexer + new_row_shape = _get_expected_axis_length(orig_shape[0], row) + new_col_shape = _get_expected_axis_length(orig_shape[1], col) + return (new_row_shape, new_col_shape) + else: + return (_get_expected_axis_length(orig_shape[0], indexer),) + orig_shape[1:] diff --git a/tests/test_matrices.py b/tests/test_matrices.py index be654dda..1ce76652 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -9,6 +9,13 @@ import quantcore.matrix as mx +def safe_to_array(a): + if hasattr(a, "A"): + return a.A + else: + return a + + def base_array(order="F") -> np.ndarray: return np.array([[0, 0], [0, -1.0], [0, 2.0]], order=order) @@ -100,6 +107,14 @@ def get_matrices(): ) +def get_basic_indices(): + return (1, [0], slice(0, 10, 2), slice(None)) + + +def get_advanced_indices(): + return ([0], [0, 1], np.arange(2)) + + @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize("cols", [None, [], [1], np.array([1])]) def test_matvec_out_parameter_wrong_shape(mat, cols): @@ -237,7 +252,7 @@ def is_split_with_cat_part(x): ) ) - if has_categorical_component and len(shape) > 1: + if has_categorical_component and len(np.squeeze(other).shape) > 1: with pytest.raises(NotImplementedError, match="only implemented for 1d"): mat.matvec(other, cols) else: @@ -246,12 +261,12 @@ def is_split_with_cat_part(x): mat_subset, vec_subset = process_mat_vec_subsets(mat, other, None, cols, cols) expected = mat_subset.dot(vec_subset) - np.testing.assert_allclose(res, expected) + np.testing.assert_allclose(np.squeeze(res), np.squeeze(expected)) assert isinstance(res, np.ndarray) if cols is None: res2 = mat @ other - np.testing.assert_allclose(res2, expected) + np.testing.assert_allclose(np.squeeze(res2), np.squeeze(expected)) def process_mat_vec_subsets(mat, vec, mat_rows, mat_cols, vec_idxs): @@ -513,6 +528,41 @@ def test_standardize( np.testing.assert_allclose(unstandardized.A, asarray) +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("rows", get_basic_indices()) +@pytest.mark.parametrize("cols", get_basic_indices()) +def test_basic_indexing(mat, rows, cols): + # row only + np.testing.assert_allclose( + np.squeeze(mat.A[rows]), np.squeeze(safe_to_array(mat[rows])) + ) + # col and row + np.testing.assert_allclose( + np.squeeze(mat.A[rows, cols]), np.squeeze(safe_to_array(mat[rows, cols])) + ) + + +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("rows", get_advanced_indices()) +@pytest.mark.parametrize("cols", get_advanced_indices()) +def test_advanced_indexing(mat, rows, cols): + """quantcore.matrix does not support advanced indexing. + + Here, we are testing our multi-sequences indexing (e.g. a[[0, 1], [0, 1]]). + This means that we select the rows 0 and 1 and columns 0 and 1, resulting + in a 2 x 2 matrix. For numpy, this would result is a 1d matrix with 2 elements. + """ + # row only + np.testing.assert_allclose( + np.squeeze(mat.A[rows]), np.squeeze(safe_to_array(mat[rows])) + ) + # col and row + np.testing.assert_allclose( + np.squeeze(mat.A[np.ix_(rows, cols)]), + np.squeeze(safe_to_array(mat[rows, cols])), + ) + + @pytest.mark.parametrize("mat", get_matrices()) def test_indexing_int_row(mat: Union[mx.MatrixBase, mx.StandardizedMatrix]): res = mat[0, :] @@ -577,3 +627,16 @@ def test_pandas_to_matrix(): # was being changed in place. assert df["cl_obj"].dtype == object assert df["ds"].dtype == np.float64 + + +@pytest.mark.parametrize("mat", get_all_matrix_base_subclass_mats()) +def test_split_matrix_creation(mat): + sm = mx.SplitMatrix(matrices=[mat, mat]) + assert sm.shape[0] == mat.shape[0] + assert sm.shape[1] == 2 * mat.shape[1] + + +def test_Luca(): + df = pd.DataFrame({"u": ["a", "b"], "v": ["a", "b"]}) + X = mx.from_pandas(df, cat_threshold=False, object_as_cat=True) + X[:, [0, 1]] diff --git a/tests/test_split_matrix.py b/tests/test_split_matrix.py index dadc75df..14ea76e3 100644 --- a/tests/test_split_matrix.py +++ b/tests/test_split_matrix.py @@ -227,10 +227,3 @@ def check(mat): np.testing.assert_almost_equal(res, expected) many_random_tests(check) - - -def test_oned_dense_mat(): - m1 = mx.CategoricalMatrix(np.random.randint(0, 10, 10)) - m2 = mx.DenseMatrix(np.random.rand(10)) - with pytest.raises(ValueError): - mx.SplitMatrix([m1, m2])