Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 96 additions & 18 deletions hed/tools/remodeling/operations/number_groups_op.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,36 @@
""" Implementation in progress. """
""" Number groups of rows in a dataframe based on start and stop markers. """

import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp


# TODO: This class is under development


class NumberGroupsOp(BaseOp):
""" Implementation in progress. """
""" Number groups of rows in a dataframe based on start and stop
markers.

Required remodeling parameters:
- **number_column_name** (*str*): The name of the column to add
with the group numbers.
- **source_column** (*str*): The column to check for start and
stop markers.
- **start** (*dict*): Specification for start markers.
- **values** (*list*): List of values that mark the start of
a group.
- **inclusion** (*str*): Either "include" or "exclude" to
specify whether the start marker row should be included in
the group.
- **stop** (*dict*): Specification for stop markers.
- **values** (*list*): List of values that mark the end of
a group.
- **inclusion** (*str*): Either "include" or "exclude" to
specify whether the stop marker row should be included in
the group.

Optional remodeling parameters:
- **overwrite** (*bool*): If true, overwrite an existing column
with the same name.

"""
NAME = "number_groups"

PARAMS = {
Expand Down Expand Up @@ -83,50 +106,105 @@ def __init__(self, parameters):
self.overwrite = parameters.get('overwrite', False)

def do_op(self, dispatcher, df, name, sidecar=None):
    """ Add numbers to groups of rows in the events dataframe.

    Rows are scanned top to bottom. A row whose source-column value is a
    start marker opens a group, a row whose value is a stop marker closes
    the open group, and every row inside an open group receives that
    group's number. Groups are numbered 1, 2, ... in order of appearance;
    rows outside any group are left as NaN.

    Parameters:
        dispatcher (Dispatcher): Manages the operation I/O (not used here).
        df (DataFrame): The DataFrame to be remodeled.
        name (str): Unique identifier for the dataframe -- often the
            original file path. Only used in error messages.
        sidecar (Sidecar or file-like): Only needed for HED operations
            (not used here).

    Returns:
        DataFrame: A copy of df with the number column added or replaced.

    Raises:
        ValueError: If the number column exists and overwrite is False,
            if the source column is missing, or if any start or stop
            value does not occur in the source column.

    """
    if self.number_column_name in df.columns and self.overwrite is False:
        raise ValueError(
            "ExistingNumberColumn",
            f"Column {self.number_column_name} already exists "
            f"in event file.", "")

    if self.source_column not in df.columns:
        raise ValueError(
            "MissingSourceColumn",
            f"Column {self.source_column} does not exist in event "
            f"file {name}.", "")

    # Read the source values once, before anything is written, so the
    # scan is unaffected if the number column overwrites the source.
    source_values = df[self.source_column].tolist()
    start_values = self.start['values']
    stop_values = self.stop['values']

    missing = [element for element in start_values
               if element not in source_values]
    if missing:
        raise ValueError(
            "MissingValue",
            f"Start value(s) {missing} does not exist in "
            f"{self.source_column} of event file {name}")

    missing = [element for element in stop_values
               if element not in source_values]
    if missing:
        raise ValueError(
            "MissingValue",
            f"Stop value(s) {missing} does not exist in "
            f"{self.source_column} of event file {name}")

    include_start = self.start['inclusion'] == 'include'
    exclude_start = self.start['inclusion'] == 'exclude'
    include_stop = self.stop['inclusion'] == 'include'

    numbers = []
    current_group = 0
    in_group = False
    for value in source_values:
        number = np.nan
        if value in start_values:
            # NOTE(review): a value listed as both start and stop is
            # treated as a start marker -- confirm this precedence.
            if not in_group:
                current_group += 1
                in_group = True
            elif exclude_start:
                # An excluded start marker inside an open group ends that
                # group and begins the next one.
                current_group += 1
            # NOTE(review): an included start marker met inside an open
            # group stays a member of that group -- confirm intent.
            if include_start:
                number = current_group
        elif value in stop_values:
            if in_group:
                if include_stop:
                    number = current_group
                in_group = False
        elif in_group:
            number = current_group
        numbers.append(number)

    df_new = df.copy()
    # Assign by position so the result does not depend on the index labels.
    df_new[self.number_column_name] = np.array(numbers, dtype=float)
    return df_new

@staticmethod
def validate_input_data(parameters):
    """ Additional validation required of operation parameters not
    performed by JSON schema validator.

    Parameters:
        parameters (dict): The operation parameters (not inspected here).

    Returns:
        list: Always empty; no additional checks are performed.

    """
    return []
69 changes: 46 additions & 23 deletions hed/tools/remodeling/operations/number_rows_op.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
""" Implementation in progress. """
""" Number rows in a dataframe based on optional criteria. """

import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp

# TODO: This class is under development


class NumberRowsOp(BaseOp):
""" Implementation in progress. """
""" Number rows in a dataframe based on optional criteria.

Required remodeling parameters:
- **number_column_name** (*str*): The name of the column to add
with the row numbers.

Optional remodeling parameters:
- **overwrite** (*bool*): If true, overwrite an existing column
with the same name.
- **match_value** (*dict*): If provided, only number rows where
the specified column matches the specified value.
- **column** (*str*): The column name to match.
- **value** (*str* or *number*): The value to match.

"""
NAME = "number_rows"

PARAMS = {
Expand Down Expand Up @@ -51,44 +64,54 @@ def __init__(self, parameters):
self.match_value = parameters.get('match_value', False)

def do_op(self, dispatcher, df, name, sidecar=None):
    """ Add numbers to rows in the events dataframe.

    Without match_value every row is numbered 1..N in row order. With
    match_value only the rows whose match column equals the match value
    are numbered 1..K in row order; all other rows are left as NaN.

    Parameters:
        dispatcher (Dispatcher): Manages operation I/O (not used here).
        df (DataFrame): The DataFrame to be remodeled.
        name (str): Unique identifier for the dataframe -- often the
            original file path (not used here).
        sidecar (Sidecar or file-like): Only needed for HED operations
            (not used here).

    Returns:
        DataFrame: A copy of df with the number column added or replaced.

    Raises:
        ValueError: If the number column exists and overwrite is False,
            or if the match column or match value is not present.

    """
    if self.number_column_name in df.columns and self.overwrite is False:
        raise ValueError(
            "ExistingNumberColumn",
            f"Column {self.number_column_name} already exists "
            f"in event file.", "")

    if self.match_value:
        match_column = self.match_value['column']
        match_target = self.match_value['value']
        if match_column not in df.columns:
            raise ValueError(
                "MissingMatchColumn",
                f"Column {match_column} does not "
                f"exist in event file.", "")
        if match_target not in df[match_column].tolist():
            raise ValueError(
                "MissingMatchValue",
                f"Value {match_target} does not exist "
                f"in event file column "
                f"{match_column}.", "")

    df_new = df.copy()
    if self.match_value:
        # Build the mask from the input frame so overwriting the match
        # column itself cannot affect which rows are selected.
        filter_mask = (df[self.match_value['column']]
                       == self.match_value['value']).to_numpy()
        df_new[self.number_column_name] = np.nan
        df_new.loc[filter_mask, self.number_column_name] = \
            np.arange(1, filter_mask.sum() + 1)
    else:
        # Number by position rather than from the index, which need not
        # be the default 0..N-1 range.
        df_new[self.number_column_name] = np.arange(1, len(df_new) + 1)

    return df_new

@staticmethod
def validate_input_data(parameters):
    """ Additional validation required of operation parameters not
    performed by JSON schema validator.

    Parameters:
        parameters (dict): The operation parameters (not inspected here).

    Returns:
        list: Always empty; no additional checks are performed.

    """
    return []
106 changes: 56 additions & 50 deletions tests/tools/remodeling/operations/test_number_groups.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from copy import deepcopy
import json
import numpy as np
import pandas as pd
import unittest
from hed.tools.remodeling.operations.number_groups_op import NumberGroupsOp

Expand Down Expand Up @@ -151,54 +153,58 @@ def tearDownClass(cls):

# test working
def test_number_groups_new_column(self):
    """ Number groups into a new column with overwrite unspecified. """
    parms = json.loads(self.json_parms)
    op = NumberGroupsOp(parms)
    df = pd.DataFrame(self.sample_data, columns=self.sample_columns)
    df_check = pd.DataFrame(self.numbered_data,
                            columns=self.numbered_columns)
    df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns)
    df_new = op.do_op(self.dispatcher, df_test, self.file_name)

    self.assertEqual(list(df_new.columns), list(self.numbered_columns),
                     "numbered_events should have the expected columns")
    self.assertEqual(len(df_new), len(df_test),
                     "numbered_events should have same length as "
                     "original dataframe")
    self.assertEqual(np.nanmax(df_new["number"]), 5.0,
                     "max value in numbered_events should match the "
                     "number of groups")

    # fill na to match postprocessing dispatcher
    df_new = df_new.fillna('n/a')
    self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()),
                    "numbered_events should not differ from check")

    # The operation must leave its input dataframe untouched.
    self.assertEqual(list(df.columns), list(df_test.columns),
                     "number_groups should not change the input df "
                     "columns")
    self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()),
                    "number_groups should not change the input df values")

def test_existing_column_overwrite_true(self):
    """ Number groups into an existing column with overwrite True. """
    parms = json.loads(self.json_overwrite_true_parms)
    op = NumberGroupsOp(parms)
    df = pd.DataFrame(self.sample_data,
                      columns=self.existing_sample_columns)
    df_test = pd.DataFrame(self.sample_data,
                           columns=self.existing_sample_columns)
    df_check = pd.DataFrame(self.overwritten_data,
                            columns=self.existing_sample_columns)
    df_new = op.do_op(self.dispatcher, df_test, self.file_name)

    self.assertEqual(list(df_new.columns),
                     list(self.existing_sample_columns),
                     "numbered_events should have the same columns as "
                     "original dataframe in case of overwrite")
    self.assertEqual(len(df_new), len(df_test),
                     "numbered_events should have same length as "
                     "original dataframe")
    self.assertEqual(np.nanmax(df_new["number"]), 5.0,
                     "max value in numbered_events should match the "
                     "number of groups")

    # fill na to match postprocessing dispatcher
    df_new = df_new.fillna('n/a')
    self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()),
                    "numbered_events should not differ from check")

    # The operation must leave its input dataframe untouched.
    self.assertEqual(list(df.columns), list(df_test.columns),
                     "number_groups should not change the input df "
                     "columns")
    self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()),
                    "number_groups should not change the input df values")


if __name__ == '__main__':
unittest.main()

Loading