From 206023a22b385710c5ced1adb7c766b8ac0d1fe9 Mon Sep 17 00:00:00 2001 From: mikeAdamss Date: Wed, 28 Apr 2021 14:40:43 +0100 Subject: [PATCH 1/2] move this along a bit --- databaker/jupybakeutils.py | 8 +++--- databaker/lookupengines/closest.py | 8 +++--- databaker/lookupengines/directly.py | 8 +++--- databaker/lookupengines/generic.py | 38 ++++++++++++++++++++++++++--- databaker/lookupengines/within.py | 8 +++--- features/steps/load_xls.py | 22 +++++++++++++++++ 6 files changed, 75 insertions(+), 17 deletions(-) diff --git a/databaker/jupybakeutils.py b/databaker/jupybakeutils.py index 1c7ff8f..64fca04 100644 --- a/databaker/jupybakeutils.py +++ b/databaker/jupybakeutils.py @@ -33,7 +33,7 @@ def svalue(cell): class HDim: "Dimension object which defines the lookup between an observation cell and a bag of header cells" - def __init__(self, hbagset, label, engine=None, direction=None, cellvalueoverride=None): + def __init__(self, hbagset, label, engine=None, direction=None, cellvalueoverride=None, apply=None): self.label = label self.name = label self.hbagset = hbagset @@ -46,11 +46,11 @@ def __init__(self, hbagset, label, engine=None, direction=None, cellvalueoverrid # TODO: better, must be a cleaner way. 
if isinstance(engine, WITHIN): starting_offset, ending_offset, direction_of_travel = engine.unpack() - self.engine = WithinEngine(hbagset, direction, label, starting_offset, ending_offset, direction_of_travel, self.cellvalueoverride) + self.engine = WithinEngine(hbagset, direction, label, starting_offset, ending_offset, direction_of_travel, self.cellvalueoverride, apply) elif engine.__name__ is DirectlyEngine.__name__: - self.engine = DirectlyEngine(hbagset, direction, label, self.cellvalueoverride) + self.engine = DirectlyEngine(hbagset, direction, label, self.cellvalueoverride, apply) elif engine.__name__ is ClosestEngine.__name__: - self.engine = ClosestEngine(hbagset, direction, label, self.cellvalueoverride) + self.engine = ClosestEngine(hbagset, direction, label, self.cellvalueoverride, apply) elif engine.__name__ is ConstantEngine.__name__: self.engine = ConstantEngine(self.cellvalueoverride) else: diff --git a/databaker/lookupengines/closest.py b/databaker/lookupengines/closest.py index db81ed5..5405d3a 100644 --- a/databaker/lookupengines/closest.py +++ b/databaker/lookupengines/closest.py @@ -1,7 +1,7 @@ import json from databaker.constants import ABOVE, BELOW, LEFT, RIGHT, DIRECTION_DICT -from databaker.lookupengines.generic import override_looked_up_cell +from databaker.lookupengines.generic import override_looked_up_cell, unpack_callables class BoundaryError(Exception): """ Raised when attempting to lookup outside the bounds of where a lookup can exist""" @@ -10,7 +10,7 @@ def __init__(self, message): class ClosestEngine(object): - def __init__(self, cell_bag, direction, label, cellvalueoverride): + def __init__(self, cell_bag, direction, label, cellvalueoverride, apply): """ Creates a lookup engine for dimensions defined with the CLOSEST relationship. 
@@ -48,6 +48,7 @@ def __init__(self, cell_bag, direction, label, cellvalueoverride): self.direction = direction self.label = label self.cellvalueoverride = cellvalueoverride if cellvalueoverride is not None else {} + self.apply_functions = unpack_callables(apply) assert len(cell_bag) > 0, f'Aborting. The dimension {self.label} is defined as CLOSEST ' \ + f'{DIRECTION_DICT[self.direction]} but an empty selection of cells has been ' \ @@ -237,5 +238,6 @@ def lookup(self, cell, index=None, ceiling=None, floor=0): # Apply str level cell value override if applicable - cell, cell_value = override_looked_up_cell(r["dimension_cell"], self.cellvalueoverride) + cell, cell_value = override_looked_up_cell(r["dimension_cell"], self.cellvalueoverride, + self.apply_functions) return cell, cell_value \ No newline at end of file diff --git a/databaker/lookupengines/directly.py b/databaker/lookupengines/directly.py index 34bea57..f54ea9d 100644 --- a/databaker/lookupengines/directly.py +++ b/databaker/lookupengines/directly.py @@ -1,5 +1,5 @@ from databaker.constants import ABOVE, BELOW, LEFT, RIGHT, DIRECTION_DICT -from databaker.lookupengines.generic import override_looked_up_cell +from databaker.lookupengines.generic import override_looked_up_cell, unpack_callables class DirectLookupException(Exception): """Raised when a DIRECT lookup fails""" @@ -9,7 +9,7 @@ def __init__(self, message): class DirectlyEngine(object): - def __init__(self, cell_bag, direction, label, cellvalueoverride): + def __init__(self, cell_bag, direction, label, cellvalueoverride, apply): """ We're going to write the cell_bag into a tiered dictionary (you could use a flat dictionary, but this'll be quicker). 
@@ -38,6 +38,7 @@ def __init__(self, cell_bag, direction, label, cellvalueoverride): self.direction = direction self.label = label self.cellvalueoverride = cellvalueoverride if cellvalueoverride is not None else {} + self.apply_functions = unpack_callables(apply) self.tiered_dict = {} @@ -141,7 +142,8 @@ def lookup(self, cell): " x:{}, y{}.".format(self.direction,cell.x, cell.y)) if self.last_cell_found is not None: - cell, cell_value = override_looked_up_cell(self.last_cell_found, self.cellvalueoverride) + cell, cell_value = override_looked_up_cell(self.last_cell_found, self.cellvalueoverride, + self.apply_functions) return cell, cell_value # If we fall through to here the lookup has failed, raise an exception diff --git a/databaker/lookupengines/generic.py b/databaker/lookupengines/generic.py index 715b96b..ee49dd9 100644 --- a/databaker/lookupengines/generic.py +++ b/databaker/lookupengines/generic.py @@ -1,6 +1,6 @@ import logging -def override_looked_up_cell(found_cell, cellvalueoverride): +def override_looked_up_cell(found_cell, cellvalueoverride, apply_functions): str_override_applied = False cell_override_applied = False @@ -22,5 +22,37 @@ def override_looked_up_cell(found_cell, cellvalueoverride): logging.warning(f'''Both a str (cellvalueoverride) and cell level (AddCellValueOverride) override is being applied to cell {found_cell}. This should rarely or never be necessary and can lead to confusing and difficult to debug behaviour.''') - - return found_cell, value \ No newline at end of file + + for a_func in apply_functions: + if str_override_applied or cell_override_applied: + logging.warning(f'''You are applying (via apply=) a function to the dimension + header {found_cell} that has already had a cell value override applied to it. 
+ The cell.value at time this function was applied was {value}.''') + value = a_func(value) + + return found_cell, value + +def unpack_callables(callable_thing_or_tuple_of): + """ + Where we are passing callables to the HDim apply= keyword, if not none sanity check + what we've got and make sure we return a tuple of them. + """ + if not callable_thing_or_tuple_of: + return () + + msg = '{} is being passed in via apply= but does not appear to be callable or tuple of callables' + + # If it's a single callable, return it in a tuple + if not isinstance(callable_thing_or_tuple_of, tuple): + assert hasattr(callable_thing_or_tuple_of, "__call__"), msg.format(callable_thing_or_tuple_of) + return tuple([callable_thing_or_tuple_of]) + + # If it's already a tuple, make sure everything in it is callable + elif isinstance(callable_thing_or_tuple_of, tuple): + for a_hopefully_callable_thing in callable_thing_or_tuple_of: + assert hasattr(a_hopefully_callable_thing, "__call__"), msg.format(a_hopefully_callable_thing) + return callable_thing_or_tuple_of + + else: + raise ValueError('''The argument to HDim(apply=) should either be single callable (eg a function, + lambda, or a class with a .__call__ method or a tuple of these things.''') diff --git a/databaker/lookupengines/within.py b/databaker/lookupengines/within.py index c485ef9..567b8f9 100644 --- a/databaker/lookupengines/within.py +++ b/databaker/lookupengines/within.py @@ -1,6 +1,6 @@ from databaker.constants import ABOVE, BELOW, UP, DOWN, LEFT, RIGHT, DIRECTION_DICT -from databaker.lookupengines.generic import override_looked_up_cell +from databaker.lookupengines.generic import override_looked_up_cell, unpack_callables # Essentially a factory function for the actual WithinEngine # I don't particularly like breaking the python convention of UPPERCLASS == a constant @@ -88,8 +88,7 @@ def unpack(self): class WithinEngine(object): - def __init__(self, cell_bag, direction, label, starting_offset, ending_offset, 
direction_of_travel, cellvalueoverride): - self.cellvalueoverride = cellvalueoverride + def __init__(self, cell_bag, direction, label, starting_offset, ending_offset, direction_of_travel, cellvalueoverride, apply): """ Creates a lookup engine to resolve a WITHIN() lookup. @@ -131,6 +130,7 @@ def __init__(self, cell_bag, direction, label, starting_offset, ending_offset, d self.direction_of_travel = direction_of_travel self.cellvalueoverride = cellvalueoverride self.cell_bag = cell_bag + self.apply_functions = unpack_callables(apply) self.sequence = self._sequencer(cell_bag) # see docstring @@ -332,5 +332,5 @@ def lookup(self, cell): raise ValueError(f'Unsuccessful within lookup for cell {cell} in dimension "{self.label}". Direction was {DIRECTION_DICT[self.direction]}' f' and we were scanning {DIRECTION_DICT[self.direction_of_travel]} but no header cell was found in the specified range.') - cell, cell_value = override_looked_up_cell(found_cell, self.cellvalueoverride) + cell, cell_value = override_looked_up_cell(found_cell, self.cellvalueoverride, self.apply_functions) return cell, cell_value \ No newline at end of file diff --git a/features/steps/load_xls.py b/features/steps/load_xls.py index 9ba8382..6811549 100644 --- a/features/steps/load_xls.py +++ b/features/steps/load_xls.py @@ -327,11 +327,33 @@ def step_impl(context, ob_cell_excel_ref, dimension_name, expecting): looked_up_cell, _ = dimension.celllookup(ob_cell) assert str(looked_up_cell) == expecting, f'Got {str(looked_up_cell)}, expected {expecting}' +@then('the lookup from an observation in cell "{ob_cell_excel_ref}" to the dimension "{dimension_name}" returns the value "{expecting}"') +def step_impl(context, ob_cell_excel_ref, dimension_name, expecting): + dimension = [x for x in context.dimensions if x.name == dimension_name] + assert len(dimension) == 1, f'Could not find a dimension named {dimension_name}' + dimension = dimension[0] + + observation_selection = [context.selections[x] for x in 
context.selections if x == "observations"][0] + ob_cell = None + for cell in observation_selection: + if xypath.contrib.excel.excel_location(cell) == ob_cell_excel_ref: + ob_cell = cell + break + else: + raise ValueError(f'Could not find a selected observation cell for excel reference {ob_cell_excel_ref}') + + assert ob_cell is not None + + _, obs_value = dimension.celllookup(ob_cell) + assert obs_value == expecting, f'Got "{obs_value}", expected "{expecting}"' @then('it throws an error of type "{err_type}"') def step(context, err_type): assert type(context.exc) == eval(err_type), f'Unexpected error type. Expected: "{type(context.exc)}". Got: "{eval(err_type)}".' +@then('it throws an error containing the text "{err_text}"') +def step(context, err_text): + assert err_text in str(context.exc), f'Expected error text "{err_text}", not found in {str(context.exc)}".' @then(u'we are given the exception message') def step_impl(context): From a364707cadfc91ecaf052f0bf4963d097ecd83f5 Mon Sep 17 00:00:00 2001 From: mikeAdamss Date: Wed, 28 Apr 2021 16:33:44 +0100 Subject: [PATCH 2/2] forgot feature file --- features/applycallable.feature | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 features/applycallable.feature diff --git a/features/applycallable.feature b/features/applycallable.feature new file mode 100644 index 0000000..f1d7637 --- /dev/null +++ b/features/applycallable.feature @@ -0,0 +1,82 @@ +Feature: Apply callables passed in at runtime to a dimension constructor + As a data engineer. When I construct a databaker dimension, I want to be able to pass in a function, lambda or + other callable that acts upon any values returned by a lookup to a dimension. 
+ + Scenario Outline: Apply a callable to a dimension constructor for a CLOSEST engine + Given we load a file named <File Name> + And select the sheet "Sheet1" + And we define cell selections as + | key | value | + | month | tab.excel_ref("B6:B25").is_not_blank() | + | observations | tab.excel_ref("C6:I25") | + And we define the dimensions as + """ + HDim(month, "Month", CLOSEST, ABOVE, apply=lambda x: f"I am the month: {x}") + """ + Then the lookup from an observation in cell "C8" to the dimension "Month" returns the value "I am the month: Jan" + And the lookup from an observation in cell "C13" to the dimension "Month" returns the value "I am the month: Apr" + And the lookup from an observation in cell "C18" to the dimension "Month" returns the value "I am the month: Jul" + And the lookup from an observation in cell "C23" to the dimension "Month" returns the value "I am the month: Oct" + + Examples: File Types + | File Name | + | "bakingtestdataset.xls" | + | "bakingtestdataset.xlsx" | + + Scenario Outline: Apply a callable to a dimension constructor for a DIRECT engine + Given we load a file named <File Name> + And select the sheet "Sheet1" + And we define cell selections as + | key | value | + | top_dims | tab.excel_ref("D5:I5") | + | observations | tab.excel_ref("C6:I25") | + And we define the dimensions as + """ + HDim(top_dims, "Top Dims", DIRECTLY, ABOVE, apply=lambda x: f'{x} got this text added.') + """ + Then the lookup from an observation in cell "D8" to the dimension "Top Dims" returns the value "Dim 1 got this text added." + And the lookup from an observation in cell "E8" to the dimension "Top Dims" returns the value "Dim 2 got this text added." + And the lookup from an observation in cell "F8" to the dimension "Top Dims" returns the value "Dim 3 got this text added." + And the lookup from an observation in cell "G8" to the dimension "Top Dims" returns the value "Dim 4 got this text added." 
+ + Examples: File Types + | File Name | + | "bakingtestdataset.xls" | + | "bakingtestdataset.xlsx" | + + Given we load a file named "bakingtestdataset.xls" + And select the sheet "Sheet1" + And we define cell selections as + | key | value | + | cats_and_dogs | tab.excel_ref("3").is_not_blank() | + | observations | tab.excel_ref("C6:I25") | + And we define the dimensions as + """ + HDim(cats_and_dogs, "Cats And Dogs", WITHIN(right=2, left=1), ABOVE, apply=lambda x: f'I love {x} a whole bunch') + """ + Then the lookup from an observation in cell "C25" to the dimension "Cats And Dogs" returns the value "I love Cats a whole bunch" + And the lookup from an observation in cell "F25" to the dimension "Cats And Dogs" returns the value "I love Cats a whole bunch" + And the lookup from an observation in cell "G25" to the dimension "Cats And Dogs" returns the value "I love Dogs a whole bunch" + And the lookup from an observation in cell "I25" to the dimension "Cats And Dogs" returns the value "I love Dogs a whole bunch" + + + Scenario Outline: Apply multiple ordered callables to a dimension constructor for a DIRECT engine + Given we load a file named <File Name> + And select the sheet "Sheet1" + And we define cell selections as + | key | value | + | top_dims | tab.excel_ref("D5:I5") | + | observations | tab.excel_ref("C6:I25") | + And we define the dimensions as + """ + HDim(top_dims, "Top Dims", DIRECTLY, ABOVE, apply=(lambda x: f'{x} got this text added.', lambda x: x.replace(' got this', ''))) + """ + Then the lookup from an observation in cell "D8" to the dimension "Top Dims" returns the value "Dim 1 text added." + And the lookup from an observation in cell "E8" to the dimension "Top Dims" returns the value "Dim 2 text added." + And the lookup from an observation in cell "F8" to the dimension "Top Dims" returns the value "Dim 3 text added." + And the lookup from an observation in cell "G8" to the dimension "Top Dims" returns the value "Dim 4 text added." 
+ + Examples: File Types + | File Name | + | "bakingtestdataset.xls" | + | "bakingtestdataset.xlsx" |