-
Notifications
You must be signed in to change notification settings - Fork 8
Improvements to SplitMatrix #91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4612909
2c75a4f
9f44f58
831ef6d
85f6dc6
dc7b2a9
f45a69c
8ddb627
42b78eb
cc7afec
0ce8c2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| import warnings | ||
| from typing import List, Optional, Tuple, Union | ||
| from typing import Any, List, Optional, Tuple, Union | ||
|
|
||
| import numpy as np | ||
| from scipy import sparse as sps | ||
|
|
@@ -9,13 +9,32 @@ | |
| from .ext.split import is_sorted, split_col_subsets | ||
| from .matrix_base import MatrixBase | ||
| from .sparse_matrix import SparseMatrix | ||
| from .standardized_mat import StandardizedMatrix | ||
| from .util import ( | ||
| check_matvec_out_shape, | ||
| check_transpose_matvec_out_shape, | ||
| set_up_rows_or_cols, | ||
| ) | ||
|
|
||
|
|
||
def as_mx(a: Any):
    """Convert an array to a corresponding MatrixBase type.

    If the input is already a MatrixBase (or StandardizedMatrix), return it
    untouched.
    If the input is a scipy sparse matrix, return a SparseMatrix.
    If the input is a numpy array, return a DenseMatrix.

    Parameters
    ----------
    a : Any
        Object to convert.

    Raises
    ------
    ValueError
        If the input is of any other type.
    """
    if isinstance(a, (MatrixBase, StandardizedMatrix)):
        return a
    elif sps.issparse(a):
        return SparseMatrix(a)
    elif isinstance(a, np.ndarray):
        return DenseMatrix(a)
    else:
        raise ValueError(f"Cannot convert type {type(a)} to Matrix.")
|
|
||
|
|
||
| def split_sparse_and_dense_parts( | ||
| arg1: sps.csc_matrix, threshold: float = 0.1 | ||
| ) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: | ||
|
|
@@ -126,38 +145,45 @@ def __init__( | |
| matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]], | ||
| indices: Optional[List[np.ndarray]] = None, | ||
| ): | ||
| flatten_matrices = [] | ||
| # First check that all matrices are valid types | ||
| for _, mat in enumerate(matrices): | ||
| for mat in matrices: | ||
| if not isinstance(mat, MatrixBase): | ||
| raise ValueError( | ||
| "Expected all elements of matrices to be subclasses of MatrixBase." | ||
| ) | ||
| if isinstance(mat, SplitMatrix): | ||
| raise ValueError("Elements of matrices cannot be SplitMatrix.") | ||
| # Flatten out the SplitMatrix | ||
| for imat in mat.matrices: | ||
| flatten_matrices.append(imat) | ||
| else: | ||
| flatten_matrices.append(mat) | ||
|
|
||
| # Now that we know these are all MatrixBase, we can check consistent | ||
| # shapes and dtypes. | ||
| self.dtype = matrices[0].dtype | ||
| n_row = matrices[0].shape[0] | ||
| for i, mat in enumerate(matrices): | ||
| self.dtype = flatten_matrices[0].dtype | ||
| n_row = flatten_matrices[0].shape[0] | ||
| for i, mat in enumerate(flatten_matrices): | ||
| if mat.dtype != self.dtype: | ||
| warnings.warn( | ||
| "Matrices do not all have the same dtype. Dtypes are " | ||
| f"{[elt.dtype for elt in matrices]}." | ||
| f"{[elt.dtype for elt in flatten_matrices]}." | ||
| ) | ||
| if not mat.shape[0] == n_row: | ||
| raise ValueError( | ||
| "All matrices should have the same first dimension, " | ||
| f"but the first matrix has first dimension {n_row} and matrix {i} has " | ||
| f"first dimension {mat.shape[0]}." | ||
| ) | ||
| if len(mat.shape) != 2: | ||
| raise ValueError("All matrices should be two dimensional.") | ||
| if mat.ndim == 1: | ||
| flatten_matrices[i] = mat[:, np.newaxis] | ||
| elif mat.ndim > 2: | ||
| raise ValueError("All matrices should be at most two dimensional.") | ||
|
|
||
| if indices is None: | ||
| indices = [] | ||
| current_idx = 0 | ||
| for mat in matrices: | ||
| for mat in flatten_matrices: | ||
| indices.append( | ||
| np.arange(current_idx, current_idx + mat.shape[1], dtype=np.int64) | ||
| ) | ||
|
|
@@ -183,14 +209,14 @@ def __init__( | |
|
|
||
| assert isinstance(indices, list) | ||
|
|
||
| for i, (mat, idx) in enumerate(zip(matrices, indices)): | ||
| for i, (mat, idx) in enumerate(zip(flatten_matrices, indices)): | ||
| if not mat.shape[1] == len(idx): | ||
| raise ValueError( | ||
| f"Element {i} of indices should have length {mat.shape[1]}, " | ||
| f"but it has shape {idx.shape}" | ||
| ) | ||
|
|
||
| filtered_mats, filtered_idxs = _filter_out_empty(matrices, indices) | ||
| filtered_mats, filtered_idxs = _filter_out_empty(flatten_matrices, indices) | ||
| combined_matrices, combined_indices = _combine_matrices( | ||
| filtered_mats, filtered_idxs | ||
| ) | ||
|
|
@@ -201,6 +227,31 @@ def __init__( | |
|
|
||
| assert self.shape[1] > 0 | ||
|
|
||
| self.column_map = self._create_col_mapping() | ||
|
|
||
def _create_col_mapping(self):
    """Map from the global column index in a SplitMatrix to the local position.

    Returns an integer array ``colmap`` of shape ``(n_cols, 2)`` where
    ``colmap[g] == [i, j]`` means that global column ``g`` of the
    SplitMatrix is column ``j`` of component matrix ``i`` (both 0-based).
    For example, if the column in position 4 of a SplitMatrix is located
    at position 1 of the second component (component index 1), the mapping
    will have ``[1, 1]`` at position 4.
    """
    # self.indices[i] holds the global column indices owned by component i;
    # inverting that relation yields the (component, local) pair per column.
    colmap = np.empty(shape=(self.shape[1], 2), dtype=int)
    for i, comp_idx in enumerate(self.indices):
        for j, idx in enumerate(comp_idx):
            colmap[idx] = [i, j]
    return colmap
|
|
||
| def _split_col_subsets_unordered(self, cols): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This function seems occasionally to return empty lists when one indexes columns with a list, which causes an error. MRE: import pandas as pd
import quantcore.matrix as mx
df = pd.DataFrame({"u": ["a", "b"], "v": ["a", "b"]})
X = mx.from_pandas(df, cat_threshold=False, object_as_cat=True)
X[:, [0, 1]]
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't replicate this on macOS. I assume you tried this on Windows. Is that correct? Can you give me some info on your setup? Also, you said "occasionally". Does it always throw an error when you try your MRE or not?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes. But it doesn't throw an error if we reduce the number of rows from two to one, for example.
Yes. I'm not embarrassed.
Is this helpful? Do you need something else?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we perhaps add unit tests for indexing? The Windows CI could come in handy.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, unit testing will be very valuable. I'll temporarily add the Windows CI to this PR for all the pushes. And yes, this is helpful.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 on indexing unit tests. |
||
| n_comp = len(self.indices) | ||
| new_idx = [[] for _ in range(n_comp)] | ||
| current_col_loc = [[] for _ in range(n_comp)] | ||
| for counter, i in enumerate(cols): | ||
| comp, idx = self.column_map[i] | ||
| new_idx[comp].append(counter) | ||
| current_col_loc[comp].append(idx) | ||
| return new_idx, current_col_loc | ||
|
|
||
| def _split_col_subsets( | ||
| self, cols: Optional[np.ndarray] | ||
| ) -> Tuple[List[np.ndarray], List[Optional[np.ndarray]], int]: | ||
|
|
@@ -250,11 +301,8 @@ def getcol(self, i: int) -> Union[np.ndarray, sps.csr_matrix]: | |
| """Return matrix column at specified index.""" | ||
| # wrap-around indexing | ||
| i %= self.shape[1] | ||
| for mat, idx in zip(self.matrices, self.indices): | ||
| if i in idx: | ||
| loc = np.where(idx == i)[0][0] | ||
| return mat.getcol(loc) | ||
| raise RuntimeError(f"Column {i} was not found.") | ||
| comp, idx = self.column_map[i] | ||
| return self.matrices[comp].getcol(idx) | ||
|
|
||
| def sandwich( | ||
| self, | ||
|
|
@@ -313,7 +361,7 @@ def matvec( | |
| assert not isinstance(v, sps.spmatrix) | ||
| check_matvec_out_shape(self, out) | ||
|
|
||
| v = np.asarray(v) | ||
| v = np.squeeze(np.asarray(v)) | ||
| if v.shape[0] != self.shape[1]: | ||
| raise ValueError(f"shapes {self.shape} and {v.shape} not aligned") | ||
|
|
||
|
|
@@ -371,20 +419,46 @@ def __getitem__(self, key): | |
| row = key | ||
| col = slice(None, None, None) # all columns | ||
|
|
||
| if col == slice(None, None, None): | ||
| if np.any(col == slice(None, None, None)): | ||
| if isinstance(row, int): | ||
| row = [row] | ||
|
|
||
| return SplitMatrix([mat[row, :] for mat in self.matrices], self.indices) | ||
| else: | ||
| raise NotImplementedError( | ||
| f"Only row indexing is supported. Index passed was {key}." | ||
| if isinstance(col, int): | ||
| return self.getcol(col)[row] | ||
| elif isinstance(col, list): | ||
| col_array = np.asarray(col) | ||
| elif isinstance(col, np.ndarray): | ||
| col_array = col | ||
| elif isinstance(col, slice): | ||
| col_array = np.arange(*col.indices(self.shape[1])) | ||
| else: | ||
| raise ValueError(f"Indexing with {type(col)} is not allowed.") | ||
| subset_cols_indices, subset_cols = self._split_col_subsets_unordered( | ||
| col_array | ||
| ) | ||
| new_mat = [] | ||
| new_ind = [] | ||
| for ( | ||
| nind, | ||
| oind, | ||
| mat, | ||
| ) in zip(subset_cols_indices, subset_cols, self.matrices): | ||
| if len(oind) == 0: | ||
| continue | ||
| new_mat.append(as_mx(mat[row, oind].reshape(-1, len(nind)))) | ||
| new_ind.append(nind) | ||
| filtered_mat, filtered_ind = _filter_out_empty(new_mat, new_ind) | ||
| return SplitMatrix(matrices=filtered_mat, indices=filtered_ind) | ||
|
|
||
| def __repr__(self): | ||
| out = "SplitMatrix:" | ||
| for i, mat in enumerate(self.matrices): | ||
| out += f"\n\nComponent {i} with type {mat.__class__.__name__}\n" + str(mat) | ||
| out += ( | ||
| f"\n\nComponent {i} with type {mat.__class__.__name__}\n" | ||
| + mat.__repr__() | ||
| ) | ||
| return out | ||
|
|
||
| __array_priority__ = 13 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is quite inefficient because we construct the full sparse matrix with
`self.tocsr()` before subsetting it. I'm fine leaving it like this for now, but I think it'd be good to at least leave a TODO comment or add an issue mentioning this performance bug. Fixing this for the single element case is quite easy. For the `[:, cols]` case, I guess we need to construct a sparse matrix element by element and I'm guessing that it'll be easiest to do that in Cython.