Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies = [
"mysqlclient",
"python_dotenv",
"xmltodict",
"liac-arff",
"pyarrow",
]

[project.optional-dependencies]
Expand Down
163 changes: 163 additions & 0 deletions src/core/feature_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from collections.abc import Iterable
from typing import IO

import arff
import pyarrow as pa
import pyarrow.parquet as pq

from schemas.datasets.openml import Feature, FeatureType


def analyze_arff(
    arff_stream: IO[str],
    target_features: Iterable[str] | None = None,
    ignore_features: Iterable[str] | None = None,
    row_id_features: Iterable[str] | None = None,
) -> list[Feature]:
    """Analyze an ARFF file and return a list of Feature objects.

    Parameters
    ----------
    arff_stream:
        Text stream with the ARFF document (dense or sparse format).
    target_features, ignore_features, row_id_features:
        Optional collections of feature names; matching features get the
        corresponding flag set on the returned ``Feature`` objects.
    """
    dataset = arff.load(arff_stream)
    attributes = dataset["attributes"]
    data = dataset["data"]

    target_features = set(target_features or [])
    ignore_features = set(ignore_features or [])
    row_id_features = set(row_id_features or [])

    # Count missing values for all attributes in a single pass over the rows
    # (instead of re-scanning the full dataset once per attribute).
    # liac-arff decodes an explicit '?' as None. Dense rows are lists; sparse
    # rows are {index: value} dicts where an *absent* index means the implicit
    # default value (0) — that is NOT a missing value, so it is not counted.
    missing_counts = [0] * len(attributes)
    for row in data:
        if isinstance(row, dict):
            for index, value in row.items():
                if value is None:
                    missing_counts[index] += 1
        else:
            for index, value in enumerate(row):
                if value is None:
                    missing_counts[index] += 1

    features = []
    for i, (name, type_info) in enumerate(attributes):
        if isinstance(type_info, list):
            # Nominal attributes arrive as the list of allowed values.
            data_type = FeatureType.NOMINAL
            nominal_values = type_info
        elif type_info.upper() in ("NUMERIC", "REAL", "INTEGER"):
            data_type = FeatureType.NUMERIC
            nominal_values = None
        else:
            # STRING, plus any other declared type (e.g. DATE) we do not
            # model separately, is reported as a string feature.
            data_type = FeatureType.STRING
            nominal_values = None

        features.append(
            Feature(
                index=i,
                name=name,
                data_type=data_type,
                is_target=name in target_features,
                is_ignore=name in ignore_features,
                is_row_identifier=name in row_id_features,
                number_of_missing_values=missing_counts[i],
                nominal_values=nominal_values,
            ),
        )
    return features


# String columns with more distinct values than this are classified as free
# text (FeatureType.STRING) instead of nominal, to keep nominal_values bounded
# for high-cardinality columns (UUIDs, free text, ...).
MAX_NOMINAL_CARDINALITY = 256


def analyze_parquet(
    source: str | IO[bytes],
    target_features: Iterable[str] | None = None,
    ignore_features: Iterable[str] | None = None,
    row_id_features: Iterable[str] | None = None,
) -> list[Feature]:
    """Analyze a Parquet file and return a list of Feature objects.

    Parameters
    ----------
    source:
        Path or binary stream accepted by ``pyarrow.parquet.read_table``.
    target_features, ignore_features, row_id_features:
        Optional collections of feature names; matching features get the
        corresponding flag set on the returned ``Feature`` objects.
    """
    table = pq.read_table(source)
    schema = table.schema

    target_features = set(target_features or [])
    ignore_features = set(ignore_features or [])
    row_id_features = set(row_id_features or [])

    features = []
    for i, field in enumerate(schema):
        name = field.name
        pa_type = field.type

        nominal_values = None
        if (
            pa.types.is_floating(pa_type)
            or pa.types.is_integer(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            # Decimals are numeric too; previously they fell through to STRING.
            data_type = FeatureType.NUMERIC
        elif pa.types.is_dictionary(pa_type):
            data_type = FeatureType.NOMINAL
            # Derive nominal values from the data itself. chunk.dictionary may
            # contain entries no longer referenced by any index (e.g. after
            # filtering), so casting away the dictionary encoding first makes
            # unique() reflect only values actually present in the column.
            unique_values = set()
            for chunk in table.column(i).chunks:
                for val in chunk.cast(pa_type.value_type).unique():
                    v = val.as_py()
                    if v is not None:
                        unique_values.add(v)
            nominal_values = sorted(unique_values)
        elif pa.types.is_boolean(pa_type):
            # Booleans are nominal by definition.
            data_type = FeatureType.NOMINAL
            nominal_values = ["false", "true"]
        elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
            # Plain string columns are commonly nominal in ML datasets, but
            # enumerating every distinct value of a high-cardinality column
            # would be slow and memory-hungry, so stop at a cap and fall back
            # to STRING once it is exceeded.
            unique_values: set[str] = set()
            exceeded = False
            for chunk in table.column(i).chunks:
                for val in chunk.unique():
                    v = val.as_py()
                    if v is not None:
                        unique_values.add(str(v))
                        if len(unique_values) > MAX_NOMINAL_CARDINALITY:
                            exceeded = True
                            break
                if exceeded:
                    break
            if exceeded:
                data_type = FeatureType.STRING
                nominal_values = None
            else:
                data_type = FeatureType.NOMINAL
                nominal_values = sorted(unique_values)
        else:
            # Fallback for unhandled types.
            # NOTE(review): temporal types (date/time/timestamp) currently end
            # up here as STRING — confirm whether OpenML semantics want them
            # reported as NUMERIC epoch values instead.
            data_type = FeatureType.STRING
            nominal_values = None

        # Arrow tracks null counts per column, so no data scan is needed.
        missing_count = table.column(i).null_count

        features.append(
            Feature(
                index=i,
                name=name,
                data_type=data_type,
                is_target=name in target_features,
                is_ignore=name in ignore_features,
                is_row_identifier=name in row_id_features,
                number_of_missing_values=missing_count,
                nominal_values=nominal_values,
            ),
        )
    return features
138 changes: 138 additions & 0 deletions tests/core/test_feature_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from io import BytesIO, StringIO

import pyarrow as pa
import pyarrow.parquet as pq

from core.feature_analysis import analyze_arff, analyze_parquet
from schemas.datasets.openml import FeatureType


def test_analyze_arff_dense():
    """Dense ARFF: types, nominal values and missing counts are detected."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b {x,y}
@ATTRIBUTE c STRING
@DATA
1,x,hello
2,y,world
?,? ,?
"""
    features = analyze_arff(StringIO(arff_data))
    assert len(features) == 3

    assert features[0].name == "a"
    assert features[0].data_type == FeatureType.NUMERIC
    assert features[0].number_of_missing_values == 1
    assert features[0].nominal_values is None

    assert features[1].name == "b"
    assert features[1].data_type == FeatureType.NOMINAL
    assert features[1].nominal_values == ["x", "y"]
    assert features[1].number_of_missing_values == 1

    assert features[2].name == "c"
    assert features[2].data_type == FeatureType.STRING
    assert features[2].number_of_missing_values == 1


def test_analyze_arff_no_data_rows():
    """A schema-only ARFF file (empty @DATA section) is handled safely."""
    arff_data = """@RELATION empty
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b {x,y}
@DATA
"""
    features = analyze_arff(StringIO(arff_data))
    assert len(features) == 2
    # With zero rows there is nothing missing anywhere.
    assert all(f.number_of_missing_values == 0 for f in features)

def test_analyze_arff_sparse():
    """Sparse ARFF: omitted indices mean the default (0), only '?' is missing."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b NUMERIC
@ATTRIBUTE c {X,Y}
@DATA
{0 1, 2 X}
{1 5}
{0 ?, 2 ?}
"""
    result = analyze_arff(StringIO(arff_data))
    assert len(result) == 3

    # Column "a": 1, implicit 0, explicit '?' -> exactly one missing value.
    assert result[0].name == "a"
    assert result[0].number_of_missing_values == 1

    # Column "b": implicit 0, 5, implicit 0 -> nothing missing.
    assert result[1].name == "b"
    assert result[1].number_of_missing_values == 0

    # Column "c": X, omitted in row {1 5}, explicit '?' -> one missing value,
    # coming only from the last row.
    assert result[2].name == "c"
    assert result[2].number_of_missing_values == 1


def test_analyze_arff_sparse_all_missing():
    """Explicit '?' counts as missing; an empty sparse row implies default 0."""
    arff_data = """@RELATION sparse
@ATTRIBUTE a NUMERIC
@DATA
{0 ?}
?
{}
"""
    # First two rows carry an explicit '?'; the third row omits index 0
    # entirely, which in sparse ARFF means the default value, not a gap.
    result = analyze_arff(StringIO(arff_data))
    assert result[0].number_of_missing_values == 2


def test_analyze_arff_metadata():
    """Target/ignore/row-id name collections set the matching feature flags."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b NUMERIC
@ATTRIBUTE c NUMERIC
@DATA
1,2,3
"""
    result = analyze_arff(
        StringIO(arff_data),
        target_features=["c"],
        ignore_features=["b"],
        row_id_features=["a"],
    )
    assert result[0].is_row_identifier is True
    assert result[1].is_ignore is True
    assert result[2].is_target is True
    assert result[0].is_target is False


def test_analyze_parquet():
    """Round-trip a small table through Parquet and check inferred features."""
    table = pa.table(
        {
            "f1": pa.array([1, 2, None]),
            "f2": pa.array(["cat", "dog", "cat"]),
            "f3": pa.array([True, False, None]),
            "f4": pa.array(["v1", "v2", "v3"], type=pa.dictionary(pa.int8(), pa.string())),
        }
    )

    buffer = BytesIO()
    pq.write_table(table, buffer)
    buffer.seek(0)

    result = analyze_parquet(buffer, target_features=["f3"])

    assert len(result) == 4
    f1, f2, f3, f4 = result

    assert f1.name == "f1"
    assert f1.data_type == FeatureType.NUMERIC
    assert f1.number_of_missing_values == 1

    assert f2.name == "f2"
    assert f2.data_type == FeatureType.NOMINAL
    assert sorted(f2.nominal_values) == ["cat", "dog"]

    assert f3.name == "f3"
    assert f3.data_type == FeatureType.NOMINAL
    assert f3.is_target is True
    assert f3.number_of_missing_values == 1
    assert f3.nominal_values == ["false", "true"]

    assert f4.name == "f4"
    assert f4.data_type == FeatureType.NOMINAL
    assert sorted(f4.nominal_values) == ["v1", "v2", "v3"]