Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
10 changes: 10 additions & 0 deletions questions/189_mahalanobis-distance/description.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Write a Python function that calculates the Mahalanobis distance between a point and a probability distribution. The function should take a point, the mean vector of the distribution, and the covariance matrix as inputs and return the Mahalanobis distance as a scalar value. The Mahalanobis distance is a measure of how many standard deviations away a point is from the mean, considering the correlations in the data (represented by the covariance matrix).

The function inputs should be in NumPy format (numpy.ndarray):
- `point`: A 1D NumPy array representing the point in the feature space
- `mean`: A 1D NumPy array representing the mean of the distribution
- `cov_matrix`: A 2D NumPy array representing the covariance matrix of the distribution

The function should return a float representing the Mahalanobis distance computed as: $D_M = \sqrt{(x - \mu)^T \Sigma^{-1} (x - \mu)}$

**Note:** The return value should be rounded to 4 decimal places.
5 changes: 5 additions & 0 deletions questions/189_mahalanobis-distance/example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"input": "point = np.array([3.0, 4.0, 2.0]),\nmean = np.array([0.0, 0.0, 0.0]),\ncov_matrix = np.array([[1.0, 0.0, 0.0],\n [0.0, 1.0, 0.0],\n [0.0, 0.0, 1.0]])",
  "output": "5.7446",
"reasoning": "With an identity covariance matrix, the Mahalanobis distance reduces to the Euclidean distance. The point [3, 4, 2] is at distance sqrt(3^2 + 4^2 + 2^2) = sqrt(9 + 16 + 4) = sqrt(29) ≈ 5.745 from the origin [0, 0, 0]. The covariance matrix being identity means there is no correlation between dimensions and all dimensions have unit variance, so the standardized distance equals the Euclidean distance."
}
21 changes: 21 additions & 0 deletions questions/189_mahalanobis-distance/example.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Example: Mahalanobis Distance

## Example Input

```python
point = np.array([3.0, 4.0, 2.0])
mean = np.array([0.0, 0.0, 0.0])
cov_matrix = np.array([[1.0, 0.0, 0.0],
[0.0, 1.0, 0.0],
[0.0, 0.0, 1.0]])
```

## Example Output

```
5.7446
```

## Reasoning

With an identity covariance matrix, the Mahalanobis distance reduces to the Euclidean distance. The point [3, 4, 2] is at distance sqrt(3^2 + 4^2 + 2^2) = sqrt(9 + 16 + 4) = sqrt(29) ≈ 5.745 from the origin [0, 0, 0]. The covariance matrix being identity means there is no correlation between dimensions and all dimensions have unit variance, so the standardized distance equals the Euclidean distance.
125 changes: 125 additions & 0 deletions questions/189_mahalanobis-distance/generate_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Generate test cases for the Mahalanobis distance problem.
This script creates comprehensive test cases and writes them to tests.json.
"""
import json
import numpy as np
from pathlib import Path


def mahalanobis_distance_reference(point, mean, cov_matrix):
    """Reference implementation for generating expected outputs.

    Computes sqrt((point - mean)^T * cov_matrix^-1 * (point - mean)).
    """
    delta = point - mean
    try:
        precision = np.linalg.inv(cov_matrix)
    except np.linalg.LinAlgError:
        # Singular covariance matrix: fall back to the Moore-Penrose pseudo-inverse.
        precision = np.linalg.pinv(cov_matrix)
    # Quadratic form delta^T * precision * delta, then square root.
    return float(np.sqrt(delta @ precision @ delta))


def generate_tests():
    """Generate diverse test cases for Mahalanobis distance.

    Each case is stored exactly once as NumPy source text; the same text is
    both evaluated to compute the expected output and embedded verbatim in
    the generated test call, so the arrays and the printed call string can
    never drift apart (the original hand-duplicated both).

    Returns:
        list[dict]: Dicts with "test" (a print(...) call string) and
        "expected_output" (the distance rounded to 4 decimal places, as str).
    """
    # (point_src, mean_src, cov_src) snippets, one tuple per case:
    #   1. identity covariance (reduces to Euclidean distance)
    #   2. scaled diagonal covariance (2D)
    #   3. non-diagonal covariance (with correlation)
    #   4. 3D case with non-zero mean
    #   5. 3D with scaled covariance
    #   6. point at the mean (distance must be 0)
    #   7. 4D identity case
    #   8. negative coordinates with correlated covariance
    cases = [
        ("np.array([3.0, 4.0])", "np.array([0.0, 0.0])",
         "np.array([[1.0, 0.0], [0.0, 1.0]])"),
        ("np.array([2.0, 2.0])", "np.array([0.0, 0.0])",
         "np.array([[4.0, 0.0], [0.0, 1.0]])"),
        ("np.array([1.0, 1.0])", "np.array([0.0, 0.0])",
         "np.array([[2.0, 1.0], [1.0, 2.0]])"),
        ("np.array([5.0, 3.0, 2.0])", "np.array([1.0, 1.0, 1.0])",
         "np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])"),
        ("np.array([2.0, 2.0, 2.0])", "np.array([0.0, 0.0, 0.0])",
         "np.array([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 2.0]])"),
        ("np.array([1.0, 2.0])", "np.array([1.0, 2.0])",
         "np.array([[1.0, 0.0], [0.0, 1.0]])"),
        ("np.array([1.0, 2.0, 3.0, 4.0])", "np.array([0.0, 0.0, 0.0, 0.0])",
         "np.eye(4)"),
        ("np.array([-2.0, 3.0])", "np.array([1.0, 1.0])",
         "np.array([[1.0, 0.5], [0.5, 1.0]])"),
    ]

    tests = []
    for point_src, mean_src, cov_src in cases:
        # eval is safe here: the snippets are hard-coded literals above,
        # never user input.
        point = eval(point_src, {"np": np})
        mean = eval(mean_src, {"np": np})
        cov = eval(cov_src, {"np": np})
        expected = round(mahalanobis_distance_reference(point, mean, cov), 4)
        tests.append({
            "test": f"print(mahalanobis_distance({point_src}, {mean_src}, {cov_src}))",
            "expected_output": str(expected),
        })

    return tests


def main():
    """Generate the test suite and write it to tests.json next to this script."""
    cases = generate_tests()

    # tests.json lives in the same directory as this generator script.
    destination = Path(__file__).parent / "tests.json"
    destination.write_text(json.dumps(cases, indent=2))

    print(f"✓ Generated {len(cases)} test cases")
    print(f"✓ Written to {destination}")


# Script entry point: regenerate tests.json when run directly.
if __name__ == "__main__":
    main()
44 changes: 44 additions & 0 deletions questions/189_mahalanobis-distance/learn.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
## Mahalanobis Distance

The Mahalanobis distance is a fundamental metric in multivariate statistics and machine learning used to measure the distance between a point and a probability distribution. Unlike Euclidean distance, which treats all dimensions equally, the Mahalanobis distance accounts for the correlations and scales of different features via the covariance matrix.

### Formula

The Mahalanobis distance from a point $x$ to a distribution with mean $\mu$ and covariance matrix $\Sigma$ is defined as:

$$D_M = \sqrt{(x - \mu)^T \Sigma^{-1} (x - \mu)}$$

### Components

- **$(x - \mu)$**: The deviation vector from the point to the mean
- **$\Sigma^{-1}$**: The inverse of the covariance matrix, which accounts for variance and correlation
- **$(x - \mu)^T \Sigma^{-1} (x - \mu)$**: A quadratic form that measures standardized distance

### Properties

1. **Scale-Invariant**: Takes into account the variance along each dimension
2. **Correlation-Aware**: Accounts for correlations between features through the covariance matrix
3. **Special Case**: When the covariance matrix is the identity matrix, Mahalanobis distance reduces to Euclidean distance
4. **Outlier Detection**: Points with large Mahalanobis distance are statistical outliers relative to the distribution

### Applications

- **Outlier Detection**: Identifying observations that deviate significantly from the distribution
- **Clustering**: Measuring distances in multivariate Gaussian mixture models
- **Classification**: Computing distances in discriminant analysis and Mahalanobis distance classifiers
- **Data Quality**: Detecting anomalies and unusual patterns in multivariate datasets

### Computational Steps

1. Compute the deviation vector: $d = x - \mu$
2. Compute the inverse of the covariance matrix: $\Sigma^{-1}$
3. Compute the quadratic form: $d^T \Sigma^{-1} d$
4. Take the square root to obtain the distance: $\sqrt{d^T \Sigma^{-1} d}$

### Example

For a 2D distribution with mean $\mu = [0, 0]$ and covariance matrix $\Sigma = \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix}$ (identity matrix), a point $x = [3, 4]$ has Mahalanobis distance:

$$D_M = \sqrt{[3, 4] \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix} [3, 4]^T} = \sqrt{9 + 16} = 5$$

This is equivalent to the Euclidean distance since the covariance is identity.
12 changes: 12 additions & 0 deletions questions/189_mahalanobis-distance/meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"id": "189",
"title": "Calculating Mahalanobis Distance",
"difficulty": "medium",
"category": "Machine Learning",
"video": "",
"likes": "0",
"dislikes": "0",
"contributor": [],
"tinygrad_difficulty": "medium",
"pytorch_difficulty": "medium"
}
34 changes: 34 additions & 0 deletions questions/189_mahalanobis-distance/solution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import numpy as np

def mahalanobis_distance(point: np.ndarray, mean: np.ndarray, cov_matrix: np.ndarray) -> float:
    """
    Calculate the Mahalanobis distance between a point and a probability distribution.

    The distance is sqrt((x - mu)^T * Sigma^-1 * (x - mu)): the number of
    standard deviations the point lies from the mean, accounting for the
    variance of and correlation between features via the covariance matrix.

    Args:
        point (np.ndarray): 1D array for the point, shape (n_features,).
        mean (np.ndarray): 1D array for the distribution mean, shape (n_features,).
        cov_matrix (np.ndarray): 2D covariance matrix, shape (n_features, n_features).

    Returns:
        float: The Mahalanobis distance, rounded to 4 decimal places.
    """
    # Deviation of the point from the distribution mean.
    deviation = point - mean

    # Invert the covariance matrix; a singular matrix gets the
    # Moore-Penrose pseudo-inverse instead.
    try:
        precision = np.linalg.inv(cov_matrix)
    except np.linalg.LinAlgError:
        precision = np.linalg.pinv(cov_matrix)

    # Quadratic form (x - mu)^T * Sigma^-1 * (x - mu), then square root.
    squared_distance = deviation @ precision @ deviation

    return round(float(np.sqrt(squared_distance)), 4)
23 changes: 23 additions & 0 deletions questions/189_mahalanobis-distance/starter_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import numpy as np

def mahalanobis_distance(point: np.ndarray, mean: np.ndarray, cov_matrix: np.ndarray) -> float:
    """
    Calculate the Mahalanobis distance between a point and a probability distribution.

    Args:
        point (np.ndarray): A 1D numpy array representing the point in the feature space.
                           Shape: (n_features,)
        mean (np.ndarray): A 1D numpy array representing the mean of the distribution.
                          Shape: (n_features,)
        cov_matrix (np.ndarray): A 2D numpy array representing the covariance matrix.
                                Shape: (n_features, n_features)

    Returns:
        float: The Mahalanobis distance, rounded to 4 decimal places.
               Computed as sqrt((point - mean)^T * cov_matrix^(-1) * (point - mean)).
               A scalar representing how many standard deviations the point is from the mean,
               accounting for correlation between features.

    """
    # Your code here
    # Suggested steps:
    #   1. Compute the deviation vector: point - mean
    #   2. Invert cov_matrix (np.linalg.inv; fall back to np.linalg.pinv if singular)
    #   3. Return sqrt(deviation^T @ inverse @ deviation), rounded to 4 decimals
    pass
34 changes: 34 additions & 0 deletions questions/189_mahalanobis-distance/tests.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
[
{
"test": "print(mahalanobis_distance(np.array([3.0, 4.0]), np.array([0.0, 0.0]), np.array([[1.0, 0.0], [0.0, 1.0]])))",
"expected_output": "5.0"
},
{
"test": "print(mahalanobis_distance(np.array([2.0, 2.0]), np.array([0.0, 0.0]), np.array([[4.0, 0.0], [0.0, 1.0]])))",
"expected_output": "2.2361"
},
{
"test": "print(mahalanobis_distance(np.array([1.0, 1.0]), np.array([0.0, 0.0]), np.array([[2.0, 1.0], [1.0, 2.0]])))",
"expected_output": "0.8165"
},
{
"test": "print(mahalanobis_distance(np.array([5.0, 3.0, 2.0]), np.array([1.0, 1.0, 1.0]), np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])))",
"expected_output": "4.5826"
},
{
"test": "print(mahalanobis_distance(np.array([2.0, 2.0, 2.0]), np.array([0.0, 0.0, 0.0]), np.array([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 2.0]])))",
"expected_output": "2.4495"
},
{
"test": "print(mahalanobis_distance(np.array([1.0, 2.0]), np.array([1.0, 2.0]), np.array([[1.0, 0.0], [0.0, 1.0]])))",
"expected_output": "0.0"
},
{
"test": "print(mahalanobis_distance(np.array([1.0, 2.0, 3.0, 4.0]), np.array([0.0, 0.0, 0.0, 0.0]), np.eye(4)))",
"expected_output": "5.4772"
},
{
"test": "print(mahalanobis_distance(np.array([-2.0, 3.0]), np.array([1.0, 1.0]), np.array([[1.0, 0.5], [0.5, 1.0]])))",
"expected_output": "5.0332"
}
]