-
-
Notifications
You must be signed in to change notification settings - Fork 46
Build feature analyzer in Python #248
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,163 @@ | ||
| from collections.abc import Iterable | ||
| from typing import IO | ||
|
|
||
| import arff | ||
| import pyarrow as pa | ||
| import pyarrow.parquet as pq | ||
|
|
||
| from schemas.datasets.openml import Feature, FeatureType | ||
|
|
||
|
|
||
def analyze_arff(
    arff_stream: IO[str],
    target_features: Iterable[str] | None = None,
    ignore_features: Iterable[str] | None = None,
    row_id_features: Iterable[str] | None = None,
) -> list[Feature]:
    """Analyze an ARFF file and return a list of Feature objects.

    Parameters
    ----------
    arff_stream:
        Open text stream containing the ARFF document (dense or sparse,
        as parsed by liac-arff).
    target_features, ignore_features, row_id_features:
        Optional feature names used to set ``is_target``, ``is_ignore``
        and ``is_row_identifier`` on the returned features.

    Returns
    -------
    One ``Feature`` per ``@ATTRIBUTE``, in declaration order, with the
    per-column missing-value count and nominal values (if any).
    """
    dataset = arff.load(arff_stream)
    attributes = dataset["attributes"]
    data = dataset["data"]

    target_set = set(target_features or [])
    ignore_set = set(ignore_features or [])
    row_id_set = set(row_id_features or [])

    # Count missing values in one pass over the data instead of re-scanning
    # every row once per attribute (O(rows) instead of O(rows * attributes)).
    missing_counts = [0] * len(attributes)
    for row in data or []:
        if isinstance(row, dict):
            # Sparse row ({index: value}): indices absent from the dict hold
            # the implicit default (0) and are therefore NOT missing; an
            # explicit '?' is parsed by liac-arff as None and IS missing.
            for index, value in row.items():
                if value is None:
                    missing_counts[index] += 1
        else:
            # Dense row (list of values): liac-arff represents '?' as None.
            for index, value in enumerate(row):
                if value is None:
                    missing_counts[index] += 1

    features = []
    for i, (name, type_info) in enumerate(attributes):
        if isinstance(type_info, list):
            # Nominal attributes are given as the list of allowed values.
            data_type = FeatureType.NOMINAL
            nominal_values = type_info
        elif type_info.upper() in ("NUMERIC", "REAL", "INTEGER"):
            data_type = FeatureType.NUMERIC
            nominal_values = None
        else:
            # STRING and any other/unknown ARFF type fall back to STRING.
            data_type = FeatureType.STRING
            nominal_values = None

        features.append(
            Feature(
                index=i,
                name=name,
                data_type=data_type,
                is_target=name in target_set,
                is_ignore=name in ignore_set,
                is_row_identifier=name in row_id_set,
                number_of_missing_values=missing_counts[i],
                nominal_values=nominal_values,
            ),
        )
    return features
|
|
||
|
|
||
def analyze_parquet(
    source: str | IO[bytes],
    target_features: Iterable[str] | None = None,
    ignore_features: Iterable[str] | None = None,
    row_id_features: Iterable[str] | None = None,
    *,
    max_nominal_cardinality: int = 256,
) -> list[Feature]:
    """Analyze a Parquet file and return a list of Feature objects.

    Parameters
    ----------
    source:
        Path or binary stream readable by ``pyarrow.parquet.read_table``.
    target_features, ignore_features, row_id_features:
        Optional feature names used to set ``is_target``, ``is_ignore``
        and ``is_row_identifier`` on the returned features.
    max_nominal_cardinality:
        Plain string columns with at most this many distinct values are
        classified as NOMINAL; above the cap they are treated as STRING so
        that free-text / high-cardinality columns do not produce huge
        ``nominal_values`` lists or exhaust memory.

    Returns
    -------
    One ``Feature`` per schema field, in schema order.
    """
    table = pq.read_table(source)
    schema = table.schema

    target_set = set(target_features or [])
    ignore_set = set(ignore_features or [])
    row_id_set = set(row_id_features or [])

    features = []
    for i, field in enumerate(schema):
        name = field.name
        pa_type = field.type

        nominal_values = None
        if (
            pa.types.is_floating(pa_type)
            or pa.types.is_integer(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            data_type = FeatureType.NUMERIC
        elif pa.types.is_dictionary(pa_type):
            # Dictionary-encoded columns map directly to nominal features.
            # NOTE(review): the dictionary may contain entries not actually
            # present in the data; values are taken from the dictionary as-is.
            data_type = FeatureType.NOMINAL
            unique_values = set()
            for chunk in table.column(i).chunks:
                for value in chunk.dictionary:
                    unique_values.add(value.as_py())
            nominal_values = sorted(unique_values)
        elif pa.types.is_boolean(pa_type):
            # Booleans are nominal with a fixed two-value domain.
            data_type = FeatureType.NOMINAL
            nominal_values = ["false", "true"]
        elif pa.types.is_string(pa_type):
            # Treat low-cardinality string columns as nominal (common in ML
            # datasets); bail out early once the cap is exceeded to bound
            # both the scan and the size of nominal_values.
            unique_values = set()
            for chunk in table.column(i).chunks:
                for value in chunk.unique():
                    v = value.as_py()
                    if v is not None:
                        unique_values.add(str(v))
                if len(unique_values) > max_nominal_cardinality:
                    break
            if len(unique_values) > max_nominal_cardinality:
                data_type = FeatureType.STRING
                nominal_values = None
            else:
                data_type = FeatureType.NOMINAL
                nominal_values = sorted(unique_values)
        else:
            # Anything else (temporal types, binary, nested, ...) falls back
            # to STRING for now.
            data_type = FeatureType.STRING

        # Arrow tracks null counts per column, so this is cheap.
        missing_count = table.column(i).null_count

        features.append(
            Feature(
                index=i,
                name=name,
                data_type=data_type,
                is_target=name in target_set,
                is_ignore=name in ignore_set,
                is_row_identifier=name in row_id_set,
                number_of_missing_values=missing_count,
                nominal_values=nominal_values,
            ),
        )
    return features
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| from io import BytesIO, StringIO | ||
|
|
||
| import pyarrow as pa | ||
| import pyarrow.parquet as pq | ||
|
|
||
| from core.feature_analysis import analyze_arff, analyze_parquet | ||
| from schemas.datasets.openml import FeatureType | ||
|
|
||
|
|
||
def test_analyze_arff_dense():
    """Dense ARFF: feature types, nominal values and missing-value counts."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b {x,y}
@ATTRIBUTE c STRING
@DATA
1,x,hello
2,y,world
?,? ,?
"""

    features = analyze_arff(StringIO(arff_data))
    assert len(features) == 3
    feat_a, feat_b, feat_c = features

    # Numeric column with one '?' entry.
    assert feat_a.name == "a"
    assert feat_a.data_type == FeatureType.NUMERIC
    assert feat_a.nominal_values is None
    assert feat_a.number_of_missing_values == 1

    # Nominal column keeps its declared value domain.
    assert feat_b.name == "b"
    assert feat_b.data_type == FeatureType.NOMINAL
    assert feat_b.nominal_values == ["x", "y"]
    assert feat_b.number_of_missing_values == 1

    # String column.
    assert feat_c.name == "c"
    assert feat_c.data_type == FeatureType.STRING
    assert feat_c.number_of_missing_values == 1
|
|
||
|
|
||
def test_analyze_arff_sparse():
    """Sparse ARFF: omitted indices default to 0 and are not counted as missing."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b NUMERIC
@ATTRIBUTE c {X,Y}
@DATA
{0 1, 2 X}
{1 5}
{0 ?, 2 ?}
"""
    features = analyze_arff(StringIO(arff_data))
    assert len(features) == 3
    first, second, third = features

    # Column a: 1, (implicit 0), '?' -> exactly one missing value.
    assert first.name == "a"
    assert first.number_of_missing_values == 1

    # Column b: (implicit 0), 5, (implicit 0) -> nothing missing.
    assert second.name == "b"
    assert second.number_of_missing_values == 0

    # Column c: X, (implicit default), '?' -> only the last row is missing.
    assert third.name == "c"
    assert third.number_of_missing_values == 1
|
|
||
|
|
||
def test_analyze_arff_sparse_all_missing():
    """An empty sparse row ({}) yields the implicit default, not a missing value."""
    arff_data = """@RELATION sparse
@ATTRIBUTE a NUMERIC
@DATA
{0 ?}
?
{}
"""
    features = analyze_arff(StringIO(arff_data))
    # Rows one and two carry an explicit '?'; the empty row contributes nothing.
    assert features[0].number_of_missing_values == 2
|
|
||
|
|
||
def test_analyze_arff_metadata():
    """target/ignore/row-id feature names set the matching flags."""
    arff_data = """@RELATION test
@ATTRIBUTE a NUMERIC
@ATTRIBUTE b NUMERIC
@ATTRIBUTE c NUMERIC
@DATA
1,2,3
"""
    features = analyze_arff(
        StringIO(arff_data),
        target_features=["c"],
        ignore_features=["b"],
        row_id_features=["a"],
    )
    feat_a, feat_b, feat_c = features
    assert feat_a.is_row_identifier is True
    assert feat_a.is_target is False
    assert feat_b.is_ignore is True
    assert feat_c.is_target is True
|
|
||
|
|
||
def test_analyze_parquet():
    """Round-trip a small table through Parquet and check the inferred features."""
    schema = pa.schema(
        [
            ("f1", pa.int64()),
            ("f2", pa.string()),
            ("f3", pa.bool_()),
            ("f4", pa.dictionary(pa.int8(), pa.string())),
        ]
    )
    columns = [
        pa.array([1, 2, None]),
        pa.array(["cat", "dog", "cat"]),
        pa.array([True, False, None]),
        pa.array(["v1", "v2", "v3"], type=pa.dictionary(pa.int8(), pa.string())),
    ]
    table = pa.Table.from_arrays(columns, schema=schema)

    buffer = BytesIO()
    pq.write_table(table, buffer)
    buffer.seek(0)

    features = analyze_parquet(buffer, target_features=["f3"])
    assert len(features) == 4
    f1, f2, f3, f4 = features

    # Integer column -> numeric, one null.
    assert f1.name == "f1"
    assert f1.data_type == FeatureType.NUMERIC
    assert f1.number_of_missing_values == 1

    # Low-cardinality string column -> nominal.
    assert f2.name == "f2"
    assert f2.data_type == FeatureType.NOMINAL
    assert sorted(f2.nominal_values) == ["cat", "dog"]

    # Boolean column -> nominal with fixed domain; also the target here.
    assert f3.name == "f3"
    assert f3.data_type == FeatureType.NOMINAL
    assert f3.is_target is True
    assert f3.number_of_missing_values == 1
    assert f3.nominal_values == ["false", "true"]

    # Dictionary-encoded column -> nominal with the dictionary's values.
    assert f4.name == "f4"
    assert f4.data_type == FeatureType.NOMINAL
    assert sorted(f4.nominal_values) == ["v1", "v2", "v3"]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion (performance): Treating all string columns as nominal and enumerating all unique values may not scale for high-cardinality text.
This branch treats all string-typed Parquet columns as nominal and builds
nominal_valuesby callingchunk.unique()over all chunks. For high-cardinality or free-text columns this can be very slow and memory-intensive, producing huge and not very usefulnominal_valueslists. Consider introducing a cardinality/uniqueness threshold or sampling limit to fall back toSTRINGinstead ofNOMINAL, and/or capping the number of unique values collected to avoid pathological cases.Suggested implementation:
attribute_type/feature_typeor similar) interpretsnominal_values is Nonefor string-typed columns as a signal to treat the column asSTRING(or equivalent non-nominal type) rather thanNOMINAL.MAX_NOMINAL_CARDINALITYandMAX_NOMINAL_SAMPLE_ROWSto that configuration instead of hard-coding them inline.