-
Notifications
You must be signed in to change notification settings - Fork 8
Improvements to SplitMatrix #91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4612909
2c75a4f
9f44f58
831ef6d
85f6dc6
dc7b2a9
f45a69c
8ddb627
42b78eb
cc7afec
0ce8c2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| import warnings | ||
| from typing import List, Optional, Tuple, Union | ||
| from typing import Any, List, Optional, Tuple, Union | ||
|
|
||
| import numpy as np | ||
| from scipy import sparse as sps | ||
|
|
@@ -9,13 +9,32 @@ | |
| from .ext.split import is_sorted, split_col_subsets | ||
| from .matrix_base import MatrixBase | ||
| from .sparse_matrix import SparseMatrix | ||
| from .standardized_mat import StandardizedMatrix | ||
| from .util import ( | ||
| check_matvec_out_shape, | ||
| check_transpose_matvec_out_shape, | ||
| set_up_rows_or_cols, | ||
| ) | ||
|
|
||
|
|
||
def as_mx(a: Any):
    """Convert an array to a corresponding MatrixBase type.

    If the input is already a MatrixBase (or StandardizedMatrix), return it
    untouched.
    If the input is a scipy sparse matrix, return a SparseMatrix.
    If the input is a numpy array, return a DenseMatrix.

    Parameters
    ----------
    a : Any
        Object to convert.

    Raises
    ------
    ValueError
        If the input is of any other type.
    """
    if isinstance(a, (MatrixBase, StandardizedMatrix)):
        return a
    elif sps.issparse(a):
        return SparseMatrix(a)
    elif isinstance(a, np.ndarray):
        return DenseMatrix(a)
    else:
        raise ValueError(f"Cannot convert type {type(a)} to Matrix.")
|
|
||
|
|
||
| def split_sparse_and_dense_parts( | ||
| arg1: sps.csc_matrix, threshold: float = 0.1 | ||
| ) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: | ||
|
|
@@ -126,38 +145,45 @@ def __init__( | |
| matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]], | ||
| indices: Optional[List[np.ndarray]] = None, | ||
| ): | ||
| flatten_matrices = [] | ||
| # First check that all matrices are valid types | ||
| for _, mat in enumerate(matrices): | ||
| for mat in matrices: | ||
| if not isinstance(mat, MatrixBase): | ||
| raise ValueError( | ||
| "Expected all elements of matrices to be subclasses of MatrixBase." | ||
| ) | ||
| if isinstance(mat, SplitMatrix): | ||
| raise ValueError("Elements of matrices cannot be SplitMatrix.") | ||
| # Flatten out the SplitMatrix | ||
| for imat in mat.matrices: | ||
| flatten_matrices.append(imat) | ||
| else: | ||
| flatten_matrices.append(mat) | ||
|
|
||
| # Now that we know these are all MatrixBase, we can check consistent | ||
| # shapes and dtypes. | ||
| self.dtype = matrices[0].dtype | ||
| n_row = matrices[0].shape[0] | ||
| for i, mat in enumerate(matrices): | ||
| self.dtype = flatten_matrices[0].dtype | ||
| n_row = flatten_matrices[0].shape[0] | ||
| for i, mat in enumerate(flatten_matrices): | ||
| if mat.dtype != self.dtype: | ||
| warnings.warn( | ||
| "Matrices do not all have the same dtype. Dtypes are " | ||
| f"{[elt.dtype for elt in matrices]}." | ||
| f"{[elt.dtype for elt in flatten_matrices]}." | ||
| ) | ||
| if not mat.shape[0] == n_row: | ||
| raise ValueError( | ||
| "All matrices should have the same first dimension, " | ||
| f"but the first matrix has first dimension {n_row} and matrix {i} has " | ||
| f"first dimension {mat.shape[0]}." | ||
| ) | ||
| if len(mat.shape) != 2: | ||
| raise ValueError("All matrices should be two dimensional.") | ||
| if mat.ndim == 1: | ||
| flatten_matrices[i] = mat[:, np.newaxis] | ||
| elif mat.ndim > 2: | ||
| raise ValueError("All matrices should be at most two dimensional.") | ||
|
|
||
| if indices is None: | ||
| indices = [] | ||
| current_idx = 0 | ||
| for mat in matrices: | ||
| for mat in flatten_matrices: | ||
| indices.append( | ||
| np.arange(current_idx, current_idx + mat.shape[1], dtype=np.int64) | ||
| ) | ||
|
|
@@ -183,14 +209,14 @@ def __init__( | |
|
|
||
| assert isinstance(indices, list) | ||
|
|
||
| for i, (mat, idx) in enumerate(zip(matrices, indices)): | ||
| for i, (mat, idx) in enumerate(zip(flatten_matrices, indices)): | ||
| if not mat.shape[1] == len(idx): | ||
| raise ValueError( | ||
| f"Element {i} of indices should have length {mat.shape[1]}, " | ||
| f"but it has shape {idx.shape}" | ||
| ) | ||
|
|
||
| filtered_mats, filtered_idxs = _filter_out_empty(matrices, indices) | ||
| filtered_mats, filtered_idxs = _filter_out_empty(flatten_matrices, indices) | ||
| combined_matrices, combined_indices = _combine_matrices( | ||
| filtered_mats, filtered_idxs | ||
| ) | ||
|
|
@@ -201,6 +227,31 @@ def __init__( | |
|
|
||
| assert self.shape[1] > 0 | ||
|
|
||
| self.column_map = self._create_col_mapping() | ||
|
|
||
def _create_col_mapping(self):
    """Map from the global column index in a SplitMatrix to the local position.

    Returns an integer array ``colmap`` of shape ``(n_cols, 2)`` where
    ``colmap[g] == [i, j]`` means that global column ``g`` of the
    SplitMatrix is column ``j`` of component matrix ``i`` (both 0-based).
    For example, if the column in position 4 of a SplitMatrix is located
    at position 1 of the second component (component index 1), the mapping
    will have ``[1, 1]`` at position 4.
    """
    # self.indices[i] holds the global column indices owned by component i;
    # inverting that relation yields the (component, local) pair per column.
    colmap = np.empty(shape=(self.shape[1], 2), dtype=int)
    for i, comp_idx in enumerate(self.indices):
        for j, idx in enumerate(comp_idx):
            colmap[idx] = [i, j]
    return colmap
|
|
||
| def _split_col_subsets_unordered(self, cols): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This function seems occasionally to return empty lists when one indexes columns with a list, which causes an error. MRE: import pandas as pd
import quantcore.matrix as mx
df = pd.DataFrame({"u": ["a", "b"], "v": ["a", "b"]})
X = mx.from_pandas(df, cat_threshold=False, object_as_cat=True)
X[:, [0, 1]]
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't replicate this on macOS. I assume you tried this on Windows. Is that correct? Can you give me some info on your setup? Also, you said "occasionally". Does it always throw an error when you try your MRE or not?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes. But it doesn't throw an error if we reduce the number of rows from two to one, for example.
Yes. I'm not embarrassed.
Is this helpful? Do you need something else?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we perhaps add unit tests for indexing? The Windows CI could come in handy.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, unit testing will be very valuable. I'll temporarily add the Windows CI to this PR for all the pushes. And yes, this is helpful.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 on indexing unit tests. |
||
| n_comp = len(self.indices) | ||
| new_idx = [[] for _ in range(n_comp)] | ||
| current_col_loc = [[] for _ in range(n_comp)] | ||
| for counter, i in enumerate(cols): | ||
| comp, idx = self.column_map[i] | ||
| new_idx[comp].append(counter) | ||
| current_col_loc[comp].append(idx) | ||
| return new_idx, current_col_loc | ||
|
|
||
| def _split_col_subsets( | ||
| self, cols: Optional[np.ndarray] | ||
| ) -> Tuple[List[np.ndarray], List[Optional[np.ndarray]], int]: | ||
|
|
@@ -250,11 +301,8 @@ def getcol(self, i: int) -> Union[np.ndarray, sps.csr_matrix]: | |
| """Return matrix column at specified index.""" | ||
| # wrap-around indexing | ||
| i %= self.shape[1] | ||
| for mat, idx in zip(self.matrices, self.indices): | ||
| if i in idx: | ||
| loc = np.where(idx == i)[0][0] | ||
| return mat.getcol(loc) | ||
| raise RuntimeError(f"Column {i} was not found.") | ||
| comp, idx = self.column_map[i] | ||
| return self.matrices[comp].getcol(idx) | ||
|
|
||
| def sandwich( | ||
| self, | ||
|
|
@@ -313,7 +361,7 @@ def matvec( | |
| assert not isinstance(v, sps.spmatrix) | ||
| check_matvec_out_shape(self, out) | ||
|
|
||
| v = np.asarray(v) | ||
| v = np.squeeze(np.asarray(v)) | ||
| if v.shape[0] != self.shape[1]: | ||
| raise ValueError(f"shapes {self.shape} and {v.shape} not aligned") | ||
|
|
||
|
|
@@ -371,20 +419,46 @@ def __getitem__(self, key): | |
| row = key | ||
| col = slice(None, None, None) # all columns | ||
|
|
||
| if col == slice(None, None, None): | ||
| if np.any(col == slice(None, None, None)): | ||
| if isinstance(row, int): | ||
| row = [row] | ||
|
|
||
| return SplitMatrix([mat[row, :] for mat in self.matrices], self.indices) | ||
| else: | ||
| raise NotImplementedError( | ||
| f"Only row indexing is supported. Index passed was {key}." | ||
| if isinstance(col, int): | ||
| return self.getcol(col)[row] | ||
| elif isinstance(col, list): | ||
| col_array = np.asarray(col) | ||
| elif isinstance(col, np.ndarray): | ||
| col_array = col | ||
| elif isinstance(col, slice): | ||
| col_array = np.arange(*col.indices(self.shape[1])) | ||
| else: | ||
| raise ValueError(f"Indexing with {type(col)} is not allowed.") | ||
| subset_cols_indices, subset_cols = self._split_col_subsets_unordered( | ||
| col_array | ||
| ) | ||
| new_mat = [] | ||
| new_ind = [] | ||
| for ( | ||
| nind, | ||
| oind, | ||
| mat, | ||
| ) in zip(subset_cols_indices, subset_cols, self.matrices): | ||
| if len(oind) == 0: | ||
| continue | ||
| new_mat.append(as_mx(mat[row, oind].reshape(-1, len(nind)))) | ||
| new_ind.append(nind) | ||
| filtered_mat, filtered_ind = _filter_out_empty(new_mat, new_ind) | ||
| return SplitMatrix(matrices=filtered_mat, indices=filtered_ind) | ||
|
|
||
| def __repr__(self): | ||
| out = "SplitMatrix:" | ||
| for i, mat in enumerate(self.matrices): | ||
| out += f"\n\nComponent {i} with type {mat.__class__.__name__}\n" + str(mat) | ||
| out += ( | ||
| f"\n\nComponent {i} with type {mat.__class__.__name__}\n" | ||
| + mat.__repr__() | ||
| ) | ||
| return out | ||
|
|
||
| __array_priority__ = 13 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is quite inefficient because we construct the full sparse matrix with
`self.tocsr()` before subsetting it. I'm fine leaving it like this for now, but I think it'd be good to at least leave a TODO comment or add an issue mentioning this performance bug. Fixing this for the single element case is quite easy. For the `[:, cols]` case, I guess we need to construct a sparse matrix element by element and I'm guessing that it'll be easiest to do that in Cython.