diff --git a/questions/189_mahalanobis-distance/__pycache__/solution.cpython-313.pyc b/questions/189_mahalanobis-distance/__pycache__/solution.cpython-313.pyc new file mode 100644 index 00000000..40da9aa9 Binary files /dev/null and b/questions/189_mahalanobis-distance/__pycache__/solution.cpython-313.pyc differ diff --git a/questions/189_mahalanobis-distance/description.md b/questions/189_mahalanobis-distance/description.md new file mode 100644 index 00000000..d8632f3a --- /dev/null +++ b/questions/189_mahalanobis-distance/description.md @@ -0,0 +1,10 @@ +Write a Python function that calculates the Mahalanobis distance between a point and a probability distribution. The function should take a point, the mean vector of the distribution, and the covariance matrix as inputs and return the Mahalanobis distance as a scalar value. The Mahalanobis distance is a measure of how many standard deviations away a point is from the mean, considering the correlations in the data (represented by the covariance matrix). + +The function inputs should be in NumPy format (numpy.ndarray): +- `point`: A 1D NumPy array representing the point in the feature space +- `mean`: A 1D NumPy array representing the mean of the distribution +- `cov_matrix`: A 2D NumPy array representing the covariance matrix of the distribution + +The function should return a float representing the Mahalanobis distance computed as: $D_M = \sqrt{(x - \mu)^T \Sigma^{-1} (x - \mu)}$ + +**Note:** The return value should be rounded to 4 decimal places. 
diff --git a/questions/189_mahalanobis-distance/example.json b/questions/189_mahalanobis-distance/example.json new file mode 100644 index 00000000..f1bec123 --- /dev/null +++ b/questions/189_mahalanobis-distance/example.json @@ -0,0 +1,5 @@ +{ + "input": "point = np.array([3.0, 4.0, 2.0]),\nmean = np.array([0.0, 0.0, 0.0]),\ncov_matrix = np.array([[1.0, 0.0, 0.0],\n [0.0, 1.0, 0.0],\n [0.0, 0.0, 1.0]])", + "output": "5.3852", + "reasoning": "With an identity covariance matrix, the Mahalanobis distance reduces to the Euclidean distance. The point [3, 4, 2] is at distance sqrt(3^2 + 4^2 + 2^2) = sqrt(9 + 16 + 4) = sqrt(29) ≈ 5.3852 from the origin [0, 0, 0]. The covariance matrix being identity means there is no correlation between dimensions and all dimensions have unit variance, so the standardized distance equals the Euclidean distance." +} diff --git a/questions/189_mahalanobis-distance/example.md b/questions/189_mahalanobis-distance/example.md new file mode 100644 index 00000000..8da07503 --- /dev/null +++ b/questions/189_mahalanobis-distance/example.md @@ -0,0 +1,21 @@ +# Example: Mahalanobis Distance + +## Example Input + +```python +point = np.array([3.0, 4.0, 2.0]) +mean = np.array([0.0, 0.0, 0.0]) +cov_matrix = np.array([[1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0]]) +``` + +## Example Output + +``` +5.3852 +``` + +## Reasoning + +With an identity covariance matrix, the Mahalanobis distance reduces to the Euclidean distance. The point [3, 4, 2] is at distance sqrt(3^2 + 4^2 + 2^2) = sqrt(9 + 16 + 4) = sqrt(29) ≈ 5.3852 from the origin [0, 0, 0]. The covariance matrix being identity means there is no correlation between dimensions and all dimensions have unit variance, so the standardized distance equals the Euclidean distance. 
"""
Generate test cases for the Mahalanobis distance problem.
This script creates comprehensive test cases and writes them to tests.json.
"""
import json
import numpy as np
from pathlib import Path


def mahalanobis_distance_reference(point, mean, cov_matrix):
    """Reference implementation used to compute each test's expected output.

    Args:
        point: 1D array, the query point.
        mean: 1D array, the distribution mean.
        cov_matrix: 2D array, the covariance matrix of the distribution.

    Returns:
        float: sqrt((x - mu)^T Sigma^-1 (x - mu)).
    """
    diff = point - mean
    try:
        inv_cov = np.linalg.inv(cov_matrix)
    except np.linalg.LinAlgError:
        # Singular covariance: fall back to the Moore-Penrose pseudo-inverse.
        inv_cov = np.linalg.pinv(cov_matrix)
    distance = np.sqrt(np.dot(np.dot(diff, inv_cov), diff))
    return float(distance)


def _array_literal(arr):
    """Render *arr* as the ``np.array([...])`` source text embedded in a test string."""
    return f"np.array({arr.tolist()})"


# One entry per test case: (point, mean, cov, cov_source).
# cov_source, when not None, overrides the rendered covariance literal so the
# generated test string can use a compact spelling such as "np.eye(4)".
# Keeping the arrays in ONE place means the printed call and the expected
# output can never drift apart (the old version duplicated every array in
# both the code and the test string).
_CASES = [
    # 1: identity covariance (reduces to Euclidean distance)
    (np.array([3.0, 4.0]), np.array([0.0, 0.0]),
     np.array([[1.0, 0.0], [0.0, 1.0]]), None),
    # 2: diagonal covariance with unequal variances
    (np.array([2.0, 2.0]), np.array([0.0, 0.0]),
     np.array([[4.0, 0.0], [0.0, 1.0]]), None),
    # 3: non-diagonal covariance (correlated features)
    (np.array([1.0, 1.0]), np.array([0.0, 0.0]),
     np.array([[2.0, 1.0], [1.0, 2.0]]), None),
    # 4: 3D case with non-zero mean
    (np.array([5.0, 3.0, 2.0]), np.array([1.0, 1.0, 1.0]),
     np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]), None),
    # 5: 3D with uniformly scaled covariance
    (np.array([2.0, 2.0, 2.0]), np.array([0.0, 0.0, 0.0]),
     np.array([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 2.0]]), None),
    # 6: point exactly at the mean (distance must be 0)
    (np.array([1.0, 2.0]), np.array([1.0, 2.0]),
     np.array([[1.0, 0.0], [0.0, 1.0]]), None),
    # 7: 4D identity covariance, written compactly as np.eye(4)
    (np.array([1.0, 2.0, 3.0, 4.0]), np.array([0.0, 0.0, 0.0, 0.0]),
     np.eye(4), "np.eye(4)"),
    # 8: negative coordinates with correlated covariance
    (np.array([-2.0, 3.0]), np.array([1.0, 1.0]),
     np.array([[1.0, 0.5], [0.5, 1.0]]), None),
]


def generate_tests():
    """Generate diverse test cases for Mahalanobis distance.

    Returns:
        list[dict]: entries with a "test" source string and its
        "expected_output", rounded to 4 decimal places.
    """
    tests = []
    for point, mean, cov, cov_source in _CASES:
        call = (
            f"mahalanobis_distance({_array_literal(point)}, "
            f"{_array_literal(mean)}, {cov_source or _array_literal(cov)})"
        )
        expected = round(mahalanobis_distance_reference(point, mean, cov), 4)
        tests.append({"test": f"print({call})", "expected_output": str(expected)})
    return tests


def main():
    """Generate tests and write them to tests.json next to this script."""
    tests = generate_tests()

    # Resolve the output path relative to this file, not the CWD.
    script_dir = Path(__file__).parent
    output_file = script_dir / "tests.json"

    with open(output_file, "w") as f:
        json.dump(tests, f, indent=2)

    print(f"✓ Generated {len(tests)} test cases")
    print(f"✓ Written to {output_file}")


if __name__ == "__main__":
    main()
**Correlation-Aware**: Accounts for correlations between features through the covariance matrix +3. **Special Case**: When the covariance matrix is the identity matrix, Mahalanobis distance reduces to Euclidean distance +4. **Outlier Detection**: Points with large Mahalanobis distance are statistical outliers relative to the distribution + +### Applications + +- **Outlier Detection**: Identifying observations that deviate significantly from the distribution +- **Clustering**: Measuring distances in multivariate Gaussian mixture models +- **Classification**: Computing distances in discriminant analysis and Mahalanobis distance classifiers +- **Data Quality**: Detecting anomalies and unusual patterns in multivariate datasets + +### Computational Steps + +1. Compute the deviation vector: $d = x - \mu$ +2. Compute the inverse of the covariance matrix: $\Sigma^{-1}$ +3. Compute the quadratic form: $d^T \Sigma^{-1} d$ +4. Take the square root to obtain the distance: $\sqrt{d^T \Sigma^{-1} d}$ + +### Example + +For a 2D distribution with mean $\mu = [0, 0]$ and covariance matrix $\Sigma = \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix}$ (identity matrix), a point $x = [3, 4]$ has Mahalanobis distance: + +$$D_M = \sqrt{[3, 4] \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix} [3, 4]^T} = \sqrt{9 + 16} = 5$$ + +This is equivalent to the Euclidean distance since the covariance is identity. 
import numpy as np

def mahalanobis_distance(point: np.ndarray, mean: np.ndarray, cov_matrix: np.ndarray) -> float:
    """
    Calculate the Mahalanobis distance between a point and a probability distribution.

    Args:
        point (np.ndarray): A 1D numpy array representing the point in the feature space.
            Shape: (n_features,)
        mean (np.ndarray): A 1D numpy array representing the mean of the distribution.
            Shape: (n_features,)
        cov_matrix (np.ndarray): A 2D numpy array representing the covariance matrix.
            Shape: (n_features, n_features)

    Returns:
        float: The Mahalanobis distance sqrt((point - mean)^T * cov_matrix^(-1) * (point - mean)),
            rounded to 4 decimal places. A scalar representing how many standard
            deviations the point is from the mean, accounting for correlation
            between features.
    """
    # Compute deviation vector
    diff = point - mean

    try:
        # Solve Sigma @ y = diff instead of forming the explicit inverse:
        # cheaper and numerically better conditioned than inv(Sigma) @ diff.
        transformed = np.linalg.solve(cov_matrix, diff)
    except np.linalg.LinAlgError:
        # Singular covariance matrix: fall back to the Moore-Penrose pseudo-inverse.
        transformed = np.linalg.pinv(cov_matrix) @ diff

    # Quadratic form (x - mu)^T Sigma^(-1) (x - mu); clamp tiny negative values
    # that floating-point rounding can produce on the pseudo-inverse path,
    # so sqrt never returns NaN.
    squared = float(np.dot(diff, transformed))
    distance = np.sqrt(max(squared, 0.0))

    return round(float(distance), 4)
+ + """ + # Your code here + pass diff --git a/questions/189_mahalanobis-distance/tests.json b/questions/189_mahalanobis-distance/tests.json new file mode 100644 index 00000000..3aa8bfc0 --- /dev/null +++ b/questions/189_mahalanobis-distance/tests.json @@ -0,0 +1,34 @@ +[ + { + "test": "print(mahalanobis_distance(np.array([3.0, 4.0]), np.array([0.0, 0.0]), np.array([[1.0, 0.0], [0.0, 1.0]])))", + "expected_output": "5.0" + }, + { + "test": "print(mahalanobis_distance(np.array([2.0, 2.0]), np.array([0.0, 0.0]), np.array([[4.0, 0.0], [0.0, 1.0]])))", + "expected_output": "2.2361" + }, + { + "test": "print(mahalanobis_distance(np.array([1.0, 1.0]), np.array([0.0, 0.0]), np.array([[2.0, 1.0], [1.0, 2.0]])))", + "expected_output": "0.8165" + }, + { + "test": "print(mahalanobis_distance(np.array([5.0, 3.0, 2.0]), np.array([1.0, 1.0, 1.0]), np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])))", + "expected_output": "4.5826" + }, + { + "test": "print(mahalanobis_distance(np.array([2.0, 2.0, 2.0]), np.array([0.0, 0.0, 0.0]), np.array([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 2.0]])))", + "expected_output": "2.4495" + }, + { + "test": "print(mahalanobis_distance(np.array([1.0, 2.0]), np.array([1.0, 2.0]), np.array([[1.0, 0.0], [0.0, 1.0]])))", + "expected_output": "0.0" + }, + { + "test": "print(mahalanobis_distance(np.array([1.0, 2.0, 3.0, 4.0]), np.array([0.0, 0.0, 0.0, 0.0]), np.eye(4)))", + "expected_output": "5.4772" + }, + { + "test": "print(mahalanobis_distance(np.array([-2.0, 3.0]), np.array([1.0, 1.0]), np.array([[1.0, 0.5], [0.5, 1.0]])))", + "expected_output": "5.0332" + } +] \ No newline at end of file