Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies = [
"mysqlclient",
"python_dotenv",
"xmltodict",
"liac-arff",
"pyarrow",
]

[project.optional-dependencies]
Expand Down
163 changes: 163 additions & 0 deletions src/core/feature_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from collections.abc import Iterable
from typing import IO

import arff
import pyarrow as pa
import pyarrow.parquet as pq

from schemas.datasets.openml import Feature, FeatureType


def analyze_arff(
    arff_stream: IO[str],
    target_features: Iterable[str] | None = None,
    ignore_features: Iterable[str] | None = None,
    row_id_features: Iterable[str] | None = None,
) -> list[Feature]:
    """Analyze an ARFF file and return a list of Feature objects.

    Parameters
    ----------
    arff_stream:
        Text stream with the ARFF document (dense or sparse format).
    target_features, ignore_features, row_id_features:
        Optional collections of feature names; matching features get the
        corresponding flag set on the returned ``Feature`` objects.
    """
    dataset = arff.load(arff_stream)
    attributes = dataset["attributes"]
    data = dataset["data"]

    target_features = set(target_features or [])
    ignore_features = set(ignore_features or [])
    row_id_features = set(row_id_features or [])

    # Count missing values for all attributes in a single pass over the rows
    # (instead of re-scanning the full dataset once per attribute).
    # liac-arff decodes an explicit '?' as None. Dense rows are lists; sparse
    # rows are {index: value} dicts where an *absent* index means the implicit
    # default value (0) — that is NOT a missing value, so it is not counted.
    missing_counts = [0] * len(attributes)
    for row in data:
        if isinstance(row, dict):
            for index, value in row.items():
                if value is None:
                    missing_counts[index] += 1
        else:
            for index, value in enumerate(row):
                if value is None:
                    missing_counts[index] += 1

    features = []
    for i, (name, type_info) in enumerate(attributes):
        if isinstance(type_info, list):
            # Nominal attributes arrive as the list of allowed values.
            data_type = FeatureType.NOMINAL
            nominal_values = type_info
        elif type_info.upper() in ("NUMERIC", "REAL", "INTEGER"):
            data_type = FeatureType.NUMERIC
            nominal_values = None
        else:
            # STRING, plus any other declared type (e.g. DATE) we do not
            # model separately, is reported as a string feature.
            data_type = FeatureType.STRING
            nominal_values = None

        features.append(
            Feature(
                index=i,
                name=name,
                data_type=data_type,
                is_target=name in target_features,
                is_ignore=name in ignore_features,
                is_row_identifier=name in row_id_features,
                number_of_missing_values=missing_counts[i],
                nominal_values=nominal_values,
            ),
        )
    return features


# String columns with more distinct values than this are classified as free
# text (FeatureType.STRING) instead of nominal, to keep nominal_values bounded
# for high-cardinality columns (UUIDs, free text, ...).
MAX_NOMINAL_CARDINALITY = 256


def analyze_parquet(
    source: str | IO[bytes],
    target_features: Iterable[str] | None = None,
    ignore_features: Iterable[str] | None = None,
    row_id_features: Iterable[str] | None = None,
) -> list[Feature]:
    """Analyze a Parquet file and return a list of Feature objects.

    Parameters
    ----------
    source:
        Path or binary stream accepted by ``pyarrow.parquet.read_table``.
    target_features, ignore_features, row_id_features:
        Optional collections of feature names; matching features get the
        corresponding flag set on the returned ``Feature`` objects.
    """
    table = pq.read_table(source)
    schema = table.schema

    target_features = set(target_features or [])
    ignore_features = set(ignore_features or [])
    row_id_features = set(row_id_features or [])

    features = []
    for i, field in enumerate(schema):
        name = field.name
        pa_type = field.type

        nominal_values = None
        if (
            pa.types.is_floating(pa_type)
            or pa.types.is_integer(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            # Decimals are numeric too; previously they fell through to STRING.
            data_type = FeatureType.NUMERIC
        elif pa.types.is_dictionary(pa_type):
            data_type = FeatureType.NOMINAL
            # Derive nominal values from the data itself. chunk.dictionary may
            # contain entries no longer referenced by any index (e.g. after
            # filtering), so casting away the dictionary encoding first makes
            # unique() reflect only values actually present in the column.
            unique_values = set()
            for chunk in table.column(i).chunks:
                for val in chunk.cast(pa_type.value_type).unique():
                    v = val.as_py()
                    if v is not None:
                        unique_values.add(v)
            nominal_values = sorted(unique_values)
        elif pa.types.is_boolean(pa_type):
            # Booleans are nominal by definition.
            data_type = FeatureType.NOMINAL
            nominal_values = ["false", "true"]
        elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
            # Plain string columns are commonly nominal in ML datasets, but
            # enumerating every distinct value of a high-cardinality column
            # would be slow and memory-hungry, so stop at a cap and fall back
            # to STRING once it is exceeded.
            unique_values: set[str] = set()
            exceeded = False
            for chunk in table.column(i).chunks:
                for val in chunk.unique():
                    v = val.as_py()
                    if v is not None:
                        unique_values.add(str(v))
                        if len(unique_values) > MAX_NOMINAL_CARDINALITY:
                            exceeded = True
                            break
                if exceeded:
                    break
            if exceeded:
                data_type = FeatureType.STRING
                nominal_values = None
            else:
                data_type = FeatureType.NOMINAL
                nominal_values = sorted(unique_values)
        else:
            # Fallback for unhandled types.
            # NOTE(review): temporal types (date/time/timestamp) currently end
            # up here as STRING — confirm whether OpenML semantics want them
            # reported as NUMERIC epoch values instead.
            data_type = FeatureType.STRING
            nominal_values = None

        # Arrow tracks null counts per column, so no data scan is needed.
        missing_count = table.column(i).null_count

        features.append(
            Feature(
                index=i,
                name=name,
                data_type=data_type,
                is_target=name in target_features,
                is_ignore=name in ignore_features,
                is_row_identifier=name in row_id_features,
                number_of_missing_values=missing_count,
                nominal_values=nominal_values,
            ),
        )
    return features
138 changes: 138 additions & 0 deletions tests/core/test_feature_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from io import BytesIO, StringIO

import pyarrow as pa
import pyarrow.parquet as pq

from core.feature_analysis import analyze_arff, analyze_parquet
from schemas.datasets.openml import FeatureType


def test_analyze_arff_dense():
    """Dense ARFF: types, nominal values and missing counts are detected."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b {x,y}
@ATTRIBUTE c STRING
@DATA
1,x,hello
2,y,world
?,? ,?
"""
    features = analyze_arff(StringIO(arff_data))
    assert len(features) == 3

    assert features[0].name == "a"
    assert features[0].data_type == FeatureType.NUMERIC
    assert features[0].number_of_missing_values == 1
    assert features[0].nominal_values is None

    assert features[1].name == "b"
    assert features[1].data_type == FeatureType.NOMINAL
    assert features[1].nominal_values == ["x", "y"]
    assert features[1].number_of_missing_values == 1

    assert features[2].name == "c"
    assert features[2].data_type == FeatureType.STRING
    assert features[2].number_of_missing_values == 1


def test_analyze_arff_no_data_rows():
    """A schema-only ARFF file (empty @DATA section) is handled safely."""
    arff_data = """@RELATION empty
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b {x,y}
@DATA
"""
    features = analyze_arff(StringIO(arff_data))
    assert len(features) == 2
    # With zero rows there is nothing missing anywhere.
    assert all(f.number_of_missing_values == 0 for f in features)

def test_analyze_arff_sparse():
    """Sparse ARFF: omitted indices mean the default (0), only '?' is missing."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b NUMERIC
@ATTRIBUTE c {X,Y}
@DATA
{0 1, 2 X}
{1 5}
{0 ?, 2 ?}
"""
    result = analyze_arff(StringIO(arff_data))
    assert len(result) == 3

    # Column "a": 1, implicit 0, explicit '?' -> exactly one missing value.
    assert result[0].name == "a"
    assert result[0].number_of_missing_values == 1

    # Column "b": implicit 0, 5, implicit 0 -> nothing missing.
    assert result[1].name == "b"
    assert result[1].number_of_missing_values == 0

    # Column "c": X, omitted in row {1 5}, explicit '?' -> one missing value,
    # coming only from the last row.
    assert result[2].name == "c"
    assert result[2].number_of_missing_values == 1


def test_analyze_arff_sparse_all_missing():
    """Explicit '?' counts as missing; an empty sparse row implies default 0."""
    arff_data = """@RELATION sparse
@ATTRIBUTE a NUMERIC
@DATA
{0 ?}
?
{}
"""
    # First two rows carry an explicit '?'; the third row omits index 0
    # entirely, which in sparse ARFF means the default value, not a gap.
    result = analyze_arff(StringIO(arff_data))
    assert result[0].number_of_missing_values == 2


def test_analyze_arff_metadata():
    """Target/ignore/row-id name collections set the matching feature flags."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b NUMERIC
@ATTRIBUTE c NUMERIC
@DATA
1,2,3
"""
    result = analyze_arff(
        StringIO(arff_data),
        target_features=["c"],
        ignore_features=["b"],
        row_id_features=["a"],
    )
    assert result[0].is_row_identifier is True
    assert result[1].is_ignore is True
    assert result[2].is_target is True
    assert result[0].is_target is False


def test_analyze_parquet():
    """Round-trip a small table through Parquet and check inferred features."""
    table = pa.table(
        {
            "f1": pa.array([1, 2, None]),
            "f2": pa.array(["cat", "dog", "cat"]),
            "f3": pa.array([True, False, None]),
            "f4": pa.array(["v1", "v2", "v3"], type=pa.dictionary(pa.int8(), pa.string())),
        }
    )

    buffer = BytesIO()
    pq.write_table(table, buffer)
    buffer.seek(0)

    result = analyze_parquet(buffer, target_features=["f3"])

    assert len(result) == 4
    f1, f2, f3, f4 = result

    assert f1.name == "f1"
    assert f1.data_type == FeatureType.NUMERIC
    assert f1.number_of_missing_values == 1

    assert f2.name == "f2"
    assert f2.data_type == FeatureType.NOMINAL
    assert sorted(f2.nominal_values) == ["cat", "dog"]

    assert f3.name == "f3"
    assert f3.data_type == FeatureType.NOMINAL
    assert f3.is_target is True
    assert f3.number_of_missing_values == 1
    assert f3.nominal_values == ["false", "true"]

    assert f4.name == "f4"
    assert f4.data_type == FeatureType.NOMINAL
    assert sorted(f4.nominal_values) == ["v1", "v2", "v3"]