From ea89362d927052de7667d55b6a080d9d98b7fe4f Mon Sep 17 00:00:00 2001 From: Mike Fuller Date: Thu, 14 May 2026 21:45:49 +1000 Subject: [PATCH 1/3] Added new checkfunctions from 1.4 to model Bumped version Signed-off-by: Mike Fuller --- .../focus_to_duckdb_converter.py | 607 +++++++++++++++++- focus_validator/rules/spec_rules.py | 3 + pyproject.toml | 3 +- .../test_focus_to_duckdb_generators.py | 345 +++++++++- 4 files changed, 939 insertions(+), 19 deletions(-) diff --git a/focus_validator/config_objects/focus_to_duckdb_converter.py b/focus_validator/config_objects/focus_to_duckdb_converter.py index 231f0c6..b693968 100644 --- a/focus_validator/config_objects/focus_to_duckdb_converter.py +++ b/focus_validator/config_objects/focus_to_duckdb_converter.py @@ -558,6 +558,40 @@ def getCheckType(self) -> str: return "type_string" +class TypeJSONCheckGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnName"} + + def generateSql(self) -> SQLQuery: + col = self.params.ColumnName + keyword = self._get_validation_keyword() + message = self.errorMessage or f"{col} {keyword} be of type JSON." + msg_sql = message.replace("'", "''") + + condition = f"{col} IS NOT NULL AND typeof({col}) != 'JSON'" + condition = self._apply_condition(condition) + + requirement_sql = f""" + WITH invalid AS ( + SELECT 1 + FROM {{table_name}} + WHERE {condition} + ) + SELECT + COUNT(*) AS violations, + CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message + FROM invalid + """ + + predicate_sql = f"{col} IS NOT NULL AND typeof({col}) = 'JSON'" + + return SQLQuery( + requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql + ) + + def getCheckType(self) -> str: + return "type_json" + + class TypeDecimalCheckGenerator(DuckDBCheckGenerator): REQUIRED_KEYS = {"ColumnName"} @@ -1081,16 +1115,8 @@ def generateSql(self) -> SQLQuery: message = self.errorMessage or f"{col} {keyword} be valid JSON format" msg_sql = message.replace("'", "''") - # Requirement SQL (finds violations) - # Check if column is not null and either: - # 1. Cannot be cast to JSON, or - # 2. Is not a valid JSON string when treated as text - condition = ( - f"{col} IS NOT NULL " - f"AND (TRY_CAST({col} AS JSON) IS NULL " - f"OR (typeof({col}) = 'VARCHAR' AND NOT json_valid({col}::TEXT)))" - ) - condition = self._apply_condition(condition) + invalid_predicate = f"{col} IS NOT NULL AND NOT json_valid(CAST({col} AS VARCHAR))" + condition = self._apply_condition(invalid_predicate) requirement_sql = f""" WITH invalid AS ( @@ -1104,11 +1130,8 @@ def generateSql(self) -> SQLQuery: FROM invalid """ - # Predicate SQL (for condition mode) - predicate_sql = ( - f"{col} IS NOT NULL " - f"AND (TRY_CAST({col} AS JSON) IS NOT NULL " - f"OR (typeof({col}) = 'VARCHAR' AND json_valid({col}::TEXT)))" + predicate_sql = self._apply_condition( + f"{col} IS NOT NULL AND json_valid(CAST({col} AS VARCHAR))" ) return SQLQuery( @@ -1119,6 +1142,165 @@ def getCheckType(self) -> str: return "format_json" +class CheckJSONSchemaGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnName", "SchemaId"} + DEFAULTS = {"Path": "$"} + + def getCheckType(self) -> str: + return "json_schema" + + def generateSql(self) -> SQLQuery: + col = self.params.ColumnName + schema_id = self.params.SchemaId + keyword = self._get_validation_keyword() + self.errorMessage = ( + self.errorMessage + or f"{col} {keyword} conform to JSON Schema '{schema_id}'" + ) + return SQLQuery(requirement_sql="SELECT 0 AS violations") + + def _extract_path_value(self, payload: Any, path: str) -> Any: + if path == "$": + return payload + + if not path.startswith("$."): + raise InvalidRuleException( + f"Unsupported JSON path '{path}' for CheckJSONSchema in rule {self.rule_id}" + ) + + current = payload + for segment in path[2:].split("."): + if current is None: + return None + + token = segment + while token: + array_match = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)(\[(\d+)\])?(.*)$", token) + if not array_match: + raise InvalidRuleException( + f"Unsupported JSON path segment '{segment}' for CheckJSONSchema in rule {self.rule_id}" + ) + + key_name, _, array_idx, remainder = array_match.groups() + if not isinstance(current, dict): + return None + current = current.get(key_name) + + if array_idx is not None: + if not isinstance(current, list): + return None + idx = int(array_idx) + if idx >= len(current): + return None + current = current[idx] + + token = remainder or "" + + return current + + def generateCheck(self) -> DuckDBColumnCheck: + chk = super().generateCheck() + + schema_map = getattr(self.params, "schemas", None) or {} + schema_id = self.params.SchemaId + schema_entry = schema_map.get(schema_id) + + if not isinstance(schema_entry, dict) or "Schema" not in schema_entry: + raise InvalidRuleException( + f"SchemaId '{schema_id}' referenced by rule {self.rule_id} was not found in model Schemas" + ) + + schema = schema_entry["Schema"] + path = getattr(self.params, "Path", "$") + col = self.params.ColumnName + where_clauses = [f"{col} IS NOT NULL"] + row_condition = (self.row_condition_sql or "").strip() + if row_condition: + where_clauses.append(f"({row_condition})") + + query = ( + f"SELECT {col} FROM {{table_name}} WHERE " + " AND ".join(where_clauses) + ) + + def _exec_json_schema(conn): + try: + from jsonschema import Draft202012Validator + except ModuleNotFoundError as exc: + raise RuntimeError( + "CheckJSONSchema requires the 'jsonschema' package to be installed" + ) from exc + + Draft202012Validator.check_schema(schema) + validator = Draft202012Validator(schema) + table_name = getattr(self.params, "table_name", "focus_data") + sql = query.replace("{{table_name}}", table_name) + sql = sql.replace("{table_name}", table_name) + try: + rows = conn.execute(sql).fetchall() + except (duckdb.BinderException, duckdb.CatalogException) as exc: + msg = str(exc) + missing = [] + patterns = [ + r'Column with name ([A-Za-z0-9_"]+) does not exist', + r'Referenced column "([A-Za-z0-9_]+)" not found', + r'Binder Error: .*? column ([A-Za-z0-9_"]+)', + r'"([A-Za-z0-9_]+)" not found', + ] + for pattern in patterns: + for match in re.finditer(pattern, msg): + col_name = match.group(1).strip('"') + if col_name and col_name not in missing: + missing.append(col_name) + + missing_msg = ( + f"Missing columns: {', '.join(missing)}" + if missing + else "Missing required column(s)" + ) + return False, { + "violations": 1, + "schema_id": schema_id, + "message": f"{self.errorMessage}. {missing_msg}", + "failure_reason": missing_msg, + "error_type": "missing_columns", + } + + failure_messages: list[str] = [] + violations = 0 + for row_num, row in enumerate(rows, start=1): + raw_value = row[0] if isinstance(row, (tuple, list)) else row + try: + payload = json.loads(raw_value) if isinstance(raw_value, str) else raw_value + except Exception as exc: + violations += 1 + failure_messages.append(f"row {row_num}: invalid JSON ({exc})") + continue + + instance = self._extract_path_value(payload, path) + errors = sorted(validator.iter_errors(instance), key=lambda err: list(err.path)) + if errors: + violations += 1 + failure_messages.append( + f"row {row_num}: {errors[0].message}" + ) + + ok = violations == 0 + details = { + "violations": violations, + "schema_id": schema_id, + "message": self.errorMessage + if ok + else f"{self.errorMessage}. First error: {failure_messages[0]}", + } + if failure_messages: + details["failure_messages"] = failure_messages[:5] + return ok, details + + chk.special_executor = _exec_json_schema + chk.meta["special_executor_kind"] = "json_schema" + return chk + + class CheckValueGenerator(DuckDBCheckGenerator): REQUIRED_KEYS = {"ColumnName", "Value"} @@ -1286,6 +1468,128 @@ def generatePredicate(self) -> str | None: return sql_query.get_predicate_sql() +class CheckRegexMatchGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnName", "Pattern"} + + def generateSql(self) -> SQLQuery: + col = self.params.ColumnName + pattern = self.params.Pattern + keyword = self._get_validation_keyword() + pattern_sql = str(pattern).replace("'", "''") + message = self.errorMessage or f"{col} {keyword} match regex '{pattern}'." + msg_sql = message.replace("'", "''") + + condition = ( + f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" + ) + condition = self._apply_condition(condition) + + requirement_sql = f""" + WITH invalid AS ( + SELECT 1 + FROM {{table_name}} + WHERE {condition} + ) + SELECT + COUNT(*) AS violations, + CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message + FROM invalid + """ + + predicate_sql = ( + f"{col} IS NOT NULL AND regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" + ) + + return SQLQuery( + requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql + ) + + def get_sample_sql(self) -> str: + col = self.params.ColumnName + pattern = self.params.Pattern + pattern_sql = str(pattern).replace("'", "''") + + condition = ( + f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" + ) + condition = self._apply_condition(condition) + + return f""" + SELECT {col} + FROM {{table_name}} + WHERE {condition} + """ + + @property + def sample_sql(self) -> str: + return self.get_sample_sql() + + def getCheckType(self) -> str: + return "check_regex_match" + + +class CheckStringEndsWithGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnName", "Value"} + + def generateSql(self) -> SQLQuery: + col = self.params.ColumnName + value = self.params.Value + keyword = self._get_validation_keyword() + value_sql = str(value).replace("'", "''") + value_len = len(str(value)) + message = self.errorMessage or f"{col} {keyword} end with '{value}'." + msg_sql = message.replace("'", "''") + + condition = ( + f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) != '{value_sql}'" + ) + condition = self._apply_condition(condition) + + requirement_sql = f""" + WITH invalid AS ( + SELECT 1 + FROM {{table_name}} + WHERE {condition} + ) + SELECT + COUNT(*) AS violations, + CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message + FROM invalid + """ + + predicate_sql = ( + f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) = '{value_sql}'" + ) + + return SQLQuery( + requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql + ) + + def get_sample_sql(self) -> str: + col = self.params.ColumnName + value = self.params.Value + value_sql = str(value).replace("'", "''") + value_len = len(str(value)) + + condition = ( + f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) != '{value_sql}'" + ) + condition = self._apply_condition(condition) + + return f""" + SELECT {col} + FROM {{table_name}} + WHERE {condition} + """ + + @property + def sample_sql(self) -> str: + return self.get_sample_sql() + + def getCheckType(self) -> str: + return "check_string_ends_with" + + class CheckSameValueGenerator(DuckDBCheckGenerator): REQUIRED_KEYS = {"ColumnAName", "ColumnBName"} @@ -1552,6 +1856,172 @@ def generatePredicate(self) -> str | None: return sql_query.get_predicate_sql() +class CheckGreaterThanGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnName", "Value"} + + def generateSql(self) -> SQLQuery: + col = self.params.ColumnName + val = self.params.Value + keyword = self._get_validation_keyword() + message = self.errorMessage or f"{col} {keyword} be greater than {val}." + msg_sql = message.replace("'", "''") + + condition = f"{col} IS NOT NULL AND {col} <= {self._lit(val)}" + condition = self._apply_condition(condition) + + requirement_sql = f""" + WITH invalid AS ( + SELECT 1 + FROM {{table_name}} + WHERE {condition} + ) + SELECT + COUNT(*) AS violations, + CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message + FROM invalid + """ + + predicate_sql = f"{col} IS NOT NULL AND {col} > {self._lit(val)}" + + return SQLQuery( + requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql + ) + + def get_sample_sql(self) -> str: + col = self.params.ColumnName + val = self.params.Value + condition = f"{col} IS NOT NULL AND {col} <= {self._lit(val)}" + condition = self._apply_condition(condition) + + return f""" + SELECT {col} + FROM {{table_name}} + WHERE {condition} + """ + + @property + def sample_sql(self) -> str: + return self.get_sample_sql() + + def getCheckType(self) -> str: + return "check_greater_than" + + +class CheckLessOrEqualGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnName", "Value"} + + def generateSql(self) -> SQLQuery: + col = self.params.ColumnName + val = self.params.Value + keyword = self._get_validation_keyword() + message = self.errorMessage or f"{col} {keyword} be less than or equal to {val}." + msg_sql = message.replace("'", "''") + + condition = f"{col} IS NOT NULL AND {col} > {self._lit(val)}" + condition = self._apply_condition(condition) + + requirement_sql = f""" + WITH invalid AS ( + SELECT 1 + FROM {{table_name}} + WHERE {condition} + ) + SELECT + COUNT(*) AS violations, + CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message + FROM invalid + """ + + predicate_sql = f"{col} IS NOT NULL AND {col} <= {self._lit(val)}" + + return SQLQuery( + requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql + ) + + def get_sample_sql(self) -> str: + col = self.params.ColumnName + val = self.params.Value + condition = f"{col} IS NOT NULL AND {col} > {self._lit(val)}" + condition = self._apply_condition(condition) + + return f""" + SELECT {col} + FROM {{table_name}} + WHERE {condition} + """ + + @property + def sample_sql(self) -> str: + return self.get_sample_sql() + + def getCheckType(self) -> str: + return "check_less_or_equal" + + +class CheckColumnComparisonGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnAName", "ColumnBName", "Comparator"} + + _VALID_COMPARATORS: ClassVar[Set[str]] = {"=", "!=", "<>", ">", ">=", "<", "<="} + + def generateSql(self) -> SQLQuery: + col_a = self.params.ColumnAName + col_b = self.params.ColumnBName + comparator = self.params.Comparator + keyword = self._get_validation_keyword() + + if comparator not in self._VALID_COMPARATORS: + raise InvalidRuleException( + f"Unsupported comparator for {self.rule_id}: {comparator}" + ) + + message = self.errorMessage or f"{col_a} {keyword} be {comparator} {col_b}." + msg_sql = message.replace("'", "''") + + pass_predicate = ( + f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b}" + ) + condition = f"NOT ({pass_predicate})" + condition = self._apply_condition(condition) + + requirement_sql = f""" + WITH invalid AS ( + SELECT 1 + FROM {{table_name}} + WHERE {condition} + ) + SELECT + COUNT(*) AS violations, + CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message + FROM invalid + """ + + return SQLQuery( + requirement_sql=requirement_sql.strip(), predicate_sql=pass_predicate + ) + + def get_sample_sql(self) -> str: + col_a = self.params.ColumnAName + col_b = self.params.ColumnBName + comparator = self.params.Comparator + condition = ( + f"NOT ({col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b})" + ) + condition = self._apply_condition(condition) + + return f""" + SELECT {col_a}, {col_b} + FROM {{table_name}} + WHERE {condition} + """ + + @property + def sample_sql(self) -> str: + return self.get_sample_sql() + + def getCheckType(self) -> str: + return "check_column_comparison" + + class CheckDistinctCountGenerator(DuckDBCheckGenerator): REQUIRED_KEYS = {"ColumnAName", "ColumnBName", "ExpectedCount"} @@ -1605,6 +2075,66 @@ def getCheckType(self) -> str: return "distinct_count" +class CheckNoDuplicatesGenerator(DuckDBCheckGenerator): + REQUIRED_KEYS = {"ColumnName"} + + def generateSql(self) -> SQLQuery: + col = self.params.ColumnName + keyword = self._get_validation_keyword() + message = self.errorMessage or f"{col} {keyword} contain no duplicate values." + msg_sql = message.replace("'", "''") + + where_clause = f"WHERE {col} IS NOT NULL" + if self.row_condition_sql and self.row_condition_sql.strip(): + where_clause = f"WHERE ({col} IS NOT NULL) AND ({self.row_condition_sql})" + + requirement_sql = f""" + WITH counts AS ( + SELECT {col} AS value, COUNT(*) AS occurrences + FROM {{table_name}} + {where_clause} + GROUP BY {col} + ), + invalid AS ( + SELECT value, occurrences + FROM counts + WHERE occurrences > 1 + ) + SELECT + COUNT(*) AS violations, + CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message + FROM invalid + """ + + return SQLQuery(requirement_sql=requirement_sql.strip(), predicate_sql=None) + + def get_sample_sql(self) -> str: + col = self.params.ColumnName + where_clause = f"WHERE {col} IS NOT NULL" + if self.row_condition_sql and self.row_condition_sql.strip(): + where_clause = f"WHERE ({col} IS NOT NULL) AND ({self.row_condition_sql})" + + return f""" + WITH dupes AS ( + SELECT {col} AS value + FROM {{table_name}} + {where_clause} + GROUP BY {col} + HAVING COUNT(*) > 1 + ) + SELECT t.{col} + FROM {{table_name}} t + JOIN dupes d ON t.{col} = d.value + """ + + @property + def sample_sql(self) -> str: + return self.get_sample_sql() + + def getCheckType(self) -> str: + return "check_no_duplicates" + + class CheckModelRuleGenerator(DuckDBCheckGenerator): REQUIRED_KEYS = {"ModelRuleId"} @@ -4343,6 +4873,10 @@ class FocusToDuckDBSchemaConverter: "generator": TypeStringCheckGenerator, "factory": lambda args: "ColumnName", }, + "TypeJSON": { + "generator": TypeJSONCheckGenerator, + "factory": lambda args: "ColumnName", + }, "TypeDecimal": { "generator": TypeDecimalCheckGenerator, "factory": lambda args: "ColumnName", @@ -4367,6 +4901,10 @@ class FocusToDuckDBSchemaConverter: "generator": FormatBillingCurrencyCodeGenerator, "factory": lambda args: "ColumnName", }, + "FormatJSON": { + "generator": FormatJSONGenerator, + "factory": lambda args: "ColumnName", + }, "FormatKeyValue": { "generator": FormatJSONGenerator, "factory": lambda args: "ColumnName", @@ -4375,10 +4913,18 @@ class FocusToDuckDBSchemaConverter: "generator": FormatCurrencyGenerator, "factory": lambda args: "ColumnName", }, + "CheckColumnComparison": { + "generator": CheckColumnComparisonGenerator, + "factory": lambda args: "ColumnAName", + }, "CheckNationalCurrency": { "generator": FormatBillingCurrencyCodeGenerator, "factory": lambda args: "ColumnName", }, + "CheckGreaterThanValue": { + "generator": CheckGreaterThanGenerator, + "factory": lambda args: "ColumnName", + }, "FormatUnit": { "generator": FormatUnitGenerator, "factory": lambda args: "ColumnName", @@ -4387,10 +4933,26 @@ class FocusToDuckDBSchemaConverter: "generator": CheckValueGenerator, "factory": lambda args: "ColumnName", }, + "CheckLessOrEqualThanValue": { + "generator": CheckLessOrEqualGenerator, + "factory": lambda args: "ColumnName", + }, "CheckNotValue": { "generator": CheckNotValueGenerator, "factory": lambda args: "ColumnName", }, + "CheckNoDuplicates": { + "generator": CheckNoDuplicatesGenerator, + "factory": lambda args: "ColumnName", + }, + "CheckRegexMatch": { + "generator": CheckRegexMatchGenerator, + "factory": lambda args: "ColumnName", + }, + "CheckStringEndsWith": { + "generator": CheckStringEndsWithGenerator, + "factory": lambda args: "ColumnName", + }, "CheckSameValue": { "generator": CheckSameValueGenerator, "factory": lambda args: "ColumnAName", @@ -4415,6 +4977,10 @@ class FocusToDuckDBSchemaConverter: "generator": CheckModelRuleGenerator, "factory": lambda args: "ModelRuleId", }, + "CheckJSONSchema": { + "generator": CheckJSONSchemaGenerator, + "factory": lambda args: "ColumnName", + }, "AND": { "generator": CompositeANDRuleGenerator, "factory": lambda args: "Items", @@ -4595,6 +5161,7 @@ def __init__( transpile_dialect: Optional[str] = None, show_violations: bool = False, rules_version: Optional[str] = None, + schemas: Optional[Dict[str, Any]] = None, ) -> None: self.log = logging.getLogger(f"{__name__}.{self.__class__.__qualname__}") self.conn: duckdb.DuckDBPyConnection | None = None @@ -4608,6 +5175,7 @@ def __init__( ) self.show_violations = show_violations self.rules_version = rules_version + self.schemas = schemas or {} # Build the effective CHECK_GENERATORS mapping for this version self.CHECK_GENERATORS = self._build_check_generators_for_version(rules_version) @@ -5374,6 +5942,8 @@ def __make_generator__( rule_id=rule_id, plan=self.plan, conn=self.conn, + schemas=self.schemas, + table_name=self.table_name, parent_results_by_idx=parent_results_by_idx or {}, parent_edges=parent_edges or (), row_condition_sql=row_condition_sql, @@ -5911,15 +6481,18 @@ def _explain_check_sql(self, check) -> dict: # Conformance reference / special executor (no SQL) special = getattr(check, "special_executor", None) if callable(special): + special_kind = meta.get("special_executor_kind") return { "rule_id": rid, - "type": "reference", + "type": "special", "check_type": ctype, "generator": meta.get("generator"), "row_condition_sql": meta.get("row_condition_sql"), "referenced": getattr(check, "referenced_rule_id", None), "sql": None, # executed by reference, not SQL - "note": "mirrors referenced rule outcome (no SQL)", + "note": "mirrors referenced rule outcome (no SQL)" + if special_kind == "reference" + else "executed via special executor (no SQL)", "must_satisfy": must_satisfy, } diff --git a/focus_validator/rules/spec_rules.py b/focus_validator/rules/spec_rules.py index 3822105..d44a347 100644 --- a/focus_validator/rules/spec_rules.py +++ b/focus_validator/rules/spec_rules.py @@ -442,6 +442,7 @@ def load_rules(self) -> ValidationPlan: self.plan = val_plan self.column_types = column_types + self.model_data = model_data self._meta = { "json_rule_file": self.json_rule_file, "focus_dataset": self.focus_dataset, @@ -482,6 +483,7 @@ def validate( transpile_dialect=self.transpile_dialect, show_violations=show_violations, rules_version=self.rules_version, + schemas=getattr(self, "model_data", {}).get("Schemas", {}), ) # 1) Let the converter prepare schemas, UDFs, temp views, etc. if connection is None: @@ -620,6 +622,7 @@ def explain(self) -> Dict[str, Dict[str, Any]]: transpile_dialect=self.transpile_dialect, show_violations=False, # Not relevant for explain mode rules_version=self.rules_version, + schemas=getattr(self, "model_data", {}).get("Schemas", {}), ) # Create a minimal connection for explain mode (converter needs it for initialization) diff --git a/pyproject.toml b/pyproject.toml index 2490811..03bf649 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "focus_validator" -version = "2.1.0" +version = "2.2.0" description = "FOCUS spec validator." authors = [] readme = "README.md" @@ -26,6 +26,7 @@ requests = "*" pandera = { version = "^0.26.1" } multimethod = ">=2.0,<2.1" sqlglot = "^27.28.1" +jsonschema = "^4.25.1" numpy = { version = "^1.26"} pytz = "^2025.2" pandasql = "^0.7.3" diff --git a/tests/config_objects/test_focus_to_duckdb_generators.py b/tests/config_objects/test_focus_to_duckdb_generators.py index 1526d89..0a1c3b8 100644 --- a/tests/config_objects/test_focus_to_duckdb_generators.py +++ b/tests/config_objects/test_focus_to_duckdb_generators.py @@ -22,6 +22,7 @@ # Type generators TypeDecimalCheckGenerator, TypeStringCheckGenerator, + TypeJSONCheckGenerator, TypeDateTimeGenerator, # Format generators @@ -31,13 +32,21 @@ FormatBillingCurrencyCodeGenerator, FormatJSONGenerator, FormatCurrencyGenerator, + CheckJSONSchemaGenerator, + FocusToDuckDBSchemaConverter, # Value check generators CheckValueGenerator, CheckNotValueGenerator, + CheckRegexMatchGenerator, + CheckStringEndsWithGenerator, CheckSameValueGenerator, CheckNotSameValueGenerator, CheckGreaterOrEqualGenerator, + CheckGreaterThanGenerator, + CheckLessOrEqualGenerator, + CheckColumnComparisonGenerator, + CheckNoDuplicatesGenerator, # Column comparison generators ColumnByColumnEqualsColumnValueGenerator, @@ -223,6 +232,36 @@ def test_type_string_check_type(self): self.assertEqual(check_type, "type_string") +class TestTypeJSONGenerator(unittest.TestCase): + """Test TypeJSON SQL generation.""" + + def setUp(self): + """Set up TypeJSON generator.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "Tags-TYPE-JSON-001" + + self.generator = TypeJSONCheckGenerator( + rule=mock_rule, + rule_id="Tags-TYPE-JSON-001", + ColumnName="Tags" + ) + + def test_type_json_sql_generation(self): + """Test SQL generation for TypeJSON check.""" + sql_result = self.generator.generateSql() + sql = _extract_sql(sql_result) + + self.assertIn("WITH invalid AS", sql) + self.assertIn("Tags IS NOT NULL", sql) + self.assertIn("typeof(Tags) != 'JSON'", sql) + self.assertIn("Tags MUST be of type JSON", sql) + + def test_type_json_check_type(self): + """Test check type identification.""" + check_type = self.generator.getCheckType() + self.assertEqual(check_type, "type_json") + + class TestCheckValueGenerator(unittest.TestCase): """Test CheckValue SQL generation for exact value matching.""" @@ -293,6 +332,86 @@ def test_check_value_sql_injection_prevention(self): self.assertIn("O''Reilly", sql) +class TestCheckRegexMatchGenerator(unittest.TestCase): + """Test CheckRegexMatch SQL generation.""" + + def setUp(self): + """Set up CheckRegexMatch generator.""" + self.mock_rule = Mock(spec=ModelRule) + self.mock_rule.rule_id = "TEST-CHECK-REGEX" + + def test_check_regex_match_sql_generation(self): + """Test CheckRegexMatch emits regex validation SQL.""" + generator = CheckRegexMatchGenerator( + rule=self.mock_rule, + rule_id="TEST-CHECK-REGEX", + ColumnName="ContractCommitmentDurationType", + Pattern="^[1-9][0-9]*\\s+(Day|Days)$" + ) + + sql_result = generator.generateSql() + sql = _extract_sql(sql_result) + + self.assertIn("WITH invalid AS", sql) + self.assertIn("ContractCommitmentDurationType IS NOT NULL", sql) + self.assertIn( + "NOT regexp_matches(CAST(ContractCommitmentDurationType AS VARCHAR), '^[1-9][0-9]*\\s+(Day|Days)$')", + sql, + ) + self.assertIn("ContractCommitmentDurationType MUST match regex", sql) + + def test_check_regex_match_check_type(self): + """Test CheckRegexMatch check type identification.""" + generator = CheckRegexMatchGenerator( + rule=self.mock_rule, + rule_id="TEST-CHECK-REGEX", + ColumnName="ContractCommitmentDurationType", + Pattern="^[1-9][0-9]*\\s+(Day|Days)$" + ) + + self.assertEqual(generator.getCheckType(), "check_regex_match") + + +class TestCheckStringEndsWithGenerator(unittest.TestCase): + """Test CheckStringEndsWith SQL generation.""" + + def setUp(self): + """Set up CheckStringEndsWith generator.""" + self.mock_rule = Mock(spec=ModelRule) + self.mock_rule.rule_id = "TEST-CHECK-ENDSWITH" + + def test_check_string_endswith_sql_generation(self): + """Test CheckStringEndsWith emits suffix validation SQL.""" + generator = CheckStringEndsWithGenerator( + rule=self.mock_rule, + rule_id="TEST-CHECK-ENDSWITH", + ColumnName="ContractCommitmentDurationType", + Value="Years" + ) + + sql_result = generator.generateSql() + sql = _extract_sql(sql_result) + + self.assertIn("WITH invalid AS", sql) + self.assertIn("ContractCommitmentDurationType IS NOT NULL", sql) + self.assertIn( + "RIGHT(CAST(ContractCommitmentDurationType AS VARCHAR), 5) != 'Years'", + sql, + ) + self.assertIn("ContractCommitmentDurationType MUST end with ''Years''", sql) + + def test_check_string_endswith_check_type(self): + """Test CheckStringEndsWith check type identification.""" + generator = CheckStringEndsWithGenerator( + rule=self.mock_rule, + rule_id="TEST-CHECK-ENDSWITH", + ColumnName="ContractCommitmentDurationType", + Value="Years" + ) + + self.assertEqual(generator.getCheckType(), "check_string_ends_with") + + class TestFormatGenerators(unittest.TestCase): """Test various format validation generators.""" @@ -350,6 +469,148 @@ def test_format_currency_code_generator(self): self.assertIn("BillingCurrency IS NOT NULL", sql) # Should validate against ISO 4217 currency codes + def test_format_json_generator_returns_sql_query(self): + """Test FormatJSON SQL generation and predicate support.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "FORMAT-JSON-001" + + generator = FormatJSONGenerator( + rule=mock_rule, + rule_id="FORMAT-JSON-001", + ColumnName="Tags", + ) + + sql_result = generator.generateSql() + + self.assertIsInstance(sql_result, SQLQuery) + self.assertIn("WITH invalid AS", sql_result.get_requirement_sql()) + self.assertIn("Tags IS NOT NULL", sql_result.get_requirement_sql()) + self.assertIn("json_valid(CAST(Tags AS VARCHAR))", sql_result.get_requirement_sql()) + self.assertIn("json_valid(CAST(Tags AS VARCHAR))", sql_result.get_predicate_sql()) + + def test_format_json_generator_applies_row_condition_to_predicate(self): + """Test FormatJSON predicate SQL includes row-level conditions.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "FORMAT-JSON-002" + + generator = FormatJSONGenerator( + rule=mock_rule, + rule_id="FORMAT-JSON-002", + ColumnName="Tags", + row_condition_sql="ProviderName = 'AWS'", + ) + + predicate = generator.generatePredicate() + + self.assertIsNotNone(predicate) + self.assertIn("ProviderName = 'AWS'", predicate) + + def test_check_json_schema_generator_validates_rows(self): + """Test CheckJSONSchema validates JSON values against model schemas.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "CHECK-JSON-SCHEMA-001" + + generator = CheckJSONSchemaGenerator( + rule=mock_rule, + rule_id="CHECK-JSON-SCHEMA-001", + ColumnName="Payload", + Path="$", + SchemaId="TEST-SCHEMA", + schemas={ + "TEST-SCHEMA": { + "Schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "required": ["foo"], + "properties": {"foo": {"type": "string"}}, + "additionalProperties": False, + } + } + }, + row_condition_sql="ProviderName = 'AWS'", + ) + + check = generator.generateCheck() + + class FakeResult: + def __init__(self, rows): + self._rows = rows + + def fetchall(self): + return self._rows + + class FakeConn: + def __init__(self, rows): + self.rows = rows + self.last_sql = None + + def execute(self, sql): + self.last_sql = sql + return FakeResult(self.rows) + + fake_conn = FakeConn([ + ('{"foo": "ok"}',), + ('{"foo": 1}',), + ('not json',), + ]) + + ok, details = check.special_executor(fake_conn) + + self.assertFalse(ok) + self.assertEqual(details["violations"], 2) + self.assertEqual(details["schema_id"], "TEST-SCHEMA") + self.assertIn("ProviderName = 'AWS'", fake_conn.last_sql) + self.assertIn("row 2", details["failure_messages"][0]) + + def test_converter_build_check_threads_schemas_to_check_json_schema(self): + """Test converter build path passes Schemas data into CheckJSONSchema.""" + validation_criteria = ValidationCriteria( + MustSatisfy="Payload MUST conform to schema.", + Keyword="MUST", + Requirement={ + "CheckFunction": "CheckJSONSchema", + "ColumnName": "Payload", + "Path": "$", + "SchemaId": "TEST-SCHEMA", + }, + Condition={}, + Dependencies=[], + ) + + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "CHECK-JSON-SCHEMA-002" + mock_rule.validation_criteria = validation_criteria + mock_rule.is_dynamic.return_value = False + mock_rule.is_optional.return_value = False + + converter = FocusToDuckDBSchemaConverter( + focus_data=None, + explain_mode=True, + schemas={ + "TEST-SCHEMA": { + "Schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "required": ["foo"], + "properties": {"foo": {"type": "string"}}, + } + } + }, + ) + converter.conn = MagicMock() + converter.plan = SimpleNamespace(nodes=[]) + + check = converter.build_check( + rule=mock_rule, + parent_results_by_idx={}, + parent_edges=(), + rule_id="CHECK-JSON-SCHEMA-002", + node_idx=0, + ) + + self.assertEqual(check.checkType, "json_schema") + self.assertTrue(callable(check.special_executor)) + class TestComparisonGenerators(unittest.TestCase): """Test comparison and relational check generators.""" @@ -372,6 +633,45 @@ def test_check_greater_or_equal_generator(self): self.assertIn("WITH invalid AS", sql) self.assertIn("UsageQuantity < 0", sql) self.assertIn("UsageQuantity MUST be greater than or equal to 0", sql) + + def test_check_greater_than_generator(self): + """Test CheckGreaterThan SQL generation.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "CHECK-GT-001" + + generator = CheckGreaterThanGenerator( + rule=mock_rule, + rule_id="CHECK-GT-001", + ColumnName="PaymentCurrencyBilledCost", + Value=0, + ) + + sql_result = generator.generateSql() + sql = _extract_sql(sql_result) + + self.assertIn("PaymentCurrencyBilledCost <= 0", sql) + self.assertIn("PaymentCurrencyBilledCost MUST be greater than 0", sql) + + def test_check_less_or_equal_generator(self): + """Test CheckLessOrEqual SQL generation.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "CHECK-LTE-001" + + generator = CheckLessOrEqualGenerator( + rule=mock_rule, + rule_id="CHECK-LTE-001", + ColumnName="ContractCommitmentDiscountPercentage", + Value=1.0, + ) + + sql_result = generator.generateSql() + sql = _extract_sql(sql_result) + + self.assertIn("ContractCommitmentDiscountPercentage > 1.0", sql) + self.assertIn( + "ContractCommitmentDiscountPercentage MUST be less than or equal to 1.0", + sql, + ) def test_check_not_value_generator(self): """Test CheckNotValue SQL generation.""" @@ -434,6 +734,25 @@ def test_column_comparison_generator(self): sql = _extract_sql(sql_result) self.assertIn("(EffectiveCost * BilledCost)", sql) + + def test_check_column_comparison_generator(self): + """Test CheckColumnComparison SQL generation.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "COLUMN-COMPARE-002" + + generator = CheckColumnComparisonGenerator( + rule=mock_rule, + rule_id="COLUMN-COMPARE-002", + ColumnAName="BillingPeriodLastUpdated", + ColumnBName="BillingPeriodCreated", + Comparator=">=", + ) + + sql_result = generator.generateSql() + sql = _extract_sql(sql_result) + + self.assertIn("BillingPeriodLastUpdated >= BillingPeriodCreated", sql) + self.assertIn("BillingPeriodLastUpdated MUST be >= BillingPeriodCreated", sql) class TestAdvancedGenerators(unittest.TestCase): @@ -459,6 +778,24 @@ def test_check_distinct_count_generator(self): self.assertIn("COUNT(DISTINCT BillingAccountName)", sql) self.assertIn("<> 1", sql) + def test_check_no_duplicates_generator(self): + """Test CheckNoDuplicates SQL generation.""" + mock_rule = Mock(spec=ModelRule) + mock_rule.rule_id = "NO-DUPES-001" + + generator = CheckNoDuplicatesGenerator( + rule=mock_rule, + rule_id="NO-DUPES-001", + ColumnName="ContractCommitmentId", + ) + + sql_result = generator.generateSql() + sql = _extract_sql(sql_result) + + self.assertIn("GROUP BY ContractCommitmentId", sql) + self.assertIn("WHERE occurrences > 1", sql) + self.assertIn("ContractCommitmentId MUST contain no duplicate values", sql) + class TestSQLGenerationPatterns(unittest.TestCase): """Test common SQL generation patterns and utilities.""" @@ -484,8 +821,14 @@ def test_sql_template_structure(self): generators = [ TypeDecimalCheckGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn"), - TypeStringCheckGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn"), + TypeStringCheckGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn"), + TypeJSONCheckGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn"), CheckValueGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn", Value="TestValue"), + CheckRegexMatchGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn", Pattern="^ok$"), + CheckStringEndsWithGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn", Value="ok"), + CheckGreaterThanGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn", Value=0), + CheckLessOrEqualGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn", Value=1), + CheckColumnComparisonGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnAName="TestColumn", ColumnBName="OtherColumn", Comparator=">="), FormatNumericGenerator(rule=mock_rule, rule_id="TEMPLATE-TEST", ColumnName="TestColumn") ] From a44e78965a549054ae8678273cb83bacb7424a83 Mon Sep 17 00:00:00 2001 From: Mike Fuller Date: Thu, 14 May 2026 21:56:01 +1000 Subject: [PATCH 2/3] fix linter checks Signed-off-by: Mike Fuller --- .../focus_to_duckdb_converter.py | 89 +++++++++---------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/focus_validator/config_objects/focus_to_duckdb_converter.py b/focus_validator/config_objects/focus_to_duckdb_converter.py index b693968..71765ec 100644 --- a/focus_validator/config_objects/focus_to_duckdb_converter.py +++ b/focus_validator/config_objects/focus_to_duckdb_converter.py @@ -1115,7 +1115,9 @@ def generateSql(self) -> SQLQuery: message = self.errorMessage or f"{col} {keyword} be valid JSON format" msg_sql = message.replace("'", "''") - invalid_predicate = f"{col} IS NOT NULL AND NOT json_valid(CAST({col} AS VARCHAR))" + invalid_predicate = ( + f"{col} IS NOT NULL AND NOT json_valid(CAST({col} AS VARCHAR))" + ) condition = self._apply_condition(invalid_predicate) requirement_sql = f""" @@ -1154,8 +1156,7 @@ def generateSql(self) -> SQLQuery: schema_id = self.params.SchemaId keyword = self._get_validation_keyword() self.errorMessage = ( - self.errorMessage - or f"{col} {keyword} conform to JSON Schema '{schema_id}'" + self.errorMessage or f"{col} {keyword} conform to JSON Schema '{schema_id}'" ) return SQLQuery(requirement_sql="SELECT 0 AS violations") @@ -1175,7 +1176,9 @@ def _extract_path_value(self, payload: Any, path: str) -> Any: token = segment while token: - array_match = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)(\[(\d+)\])?(.*)$", token) + array_match = re.match( + r"^([A-Za-z_][A-Za-z0-9_]*)(\[(\d+)\])?(.*)$", token + ) if not array_match: raise InvalidRuleException( f"Unsupported JSON path segment '{segment}' for CheckJSONSchema in rule {self.rule_id}" @@ -1218,13 +1221,13 @@ def generateCheck(self) -> DuckDBColumnCheck: if row_condition: where_clauses.append(f"({row_condition})") - query = ( - f"SELECT {col} FROM {{table_name}} WHERE " + " AND ".join(where_clauses) - ) + query = f"SELECT {col} FROM {{table_name}} WHERE " + " AND ".join(where_clauses) def _exec_json_schema(conn): try: - from jsonschema import Draft202012Validator + from jsonschema import ( # type: ignore[import-untyped] + Draft202012Validator, + ) except ModuleNotFoundError as exc: raise RuntimeError( "CheckJSONSchema requires the 'jsonschema' package to be installed" @@ -1270,27 +1273,33 @@ def _exec_json_schema(conn): for row_num, row in enumerate(rows, start=1): raw_value = row[0] if isinstance(row, (tuple, list)) else row try: - payload = json.loads(raw_value) if isinstance(raw_value, str) else raw_value + payload = ( + json.loads(raw_value) + if isinstance(raw_value, str) + else raw_value + ) except Exception as exc: violations += 1 failure_messages.append(f"row {row_num}: invalid JSON ({exc})") continue instance = self._extract_path_value(payload, path) - errors = sorted(validator.iter_errors(instance), key=lambda err: list(err.path)) + errors = sorted( + validator.iter_errors(instance), key=lambda err: list(err.path) + ) if errors: violations += 1 - failure_messages.append( - f"row {row_num}: {errors[0].message}" - ) + failure_messages.append(f"row {row_num}: {errors[0].message}") ok = violations == 0 details = { "violations": violations, "schema_id": schema_id, - "message": self.errorMessage - if ok - else f"{self.errorMessage}. First error: {failure_messages[0]}", + "message": ( + self.errorMessage + if ok + else f"{self.errorMessage}. First error: {failure_messages[0]}" + ), } if failure_messages: details["failure_messages"] = failure_messages[:5] @@ -1479,9 +1488,7 @@ def generateSql(self) -> SQLQuery: message = self.errorMessage or f"{col} {keyword} match regex '{pattern}'." msg_sql = message.replace("'", "''") - condition = ( - f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" - ) + condition = f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" condition = self._apply_condition(condition) requirement_sql = f""" @@ -1496,9 +1503,7 @@ def generateSql(self) -> SQLQuery: FROM invalid """ - predicate_sql = ( - f"{col} IS NOT NULL AND regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" - ) + predicate_sql = f"{col} IS NOT NULL AND regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" return SQLQuery( requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql @@ -1509,9 +1514,7 @@ def get_sample_sql(self) -> str: pattern = self.params.Pattern pattern_sql = str(pattern).replace("'", "''") - condition = ( - f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" - ) + condition = f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')" condition = self._apply_condition(condition) return f""" @@ -1540,9 +1543,7 @@ def generateSql(self) -> SQLQuery: message = self.errorMessage or f"{col} {keyword} end with '{value}'." msg_sql = message.replace("'", "''") - condition = ( - f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) != '{value_sql}'" - ) + condition = f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) != '{value_sql}'" condition = self._apply_condition(condition) requirement_sql = f""" @@ -1557,9 +1558,7 @@ def generateSql(self) -> SQLQuery: FROM invalid """ - predicate_sql = ( - f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) = '{value_sql}'" - ) + predicate_sql = f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) = '{value_sql}'" return SQLQuery( requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql @@ -1571,9 +1570,7 @@ def get_sample_sql(self) -> str: value_sql = str(value).replace("'", "''") value_len = len(str(value)) - condition = ( - f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) != '{value_sql}'" - ) + condition = f"{col} IS NOT NULL AND RIGHT(CAST({col} AS VARCHAR), {value_len}) != '{value_sql}'" condition = self._apply_condition(condition) return f""" @@ -1914,7 +1911,9 @@ def generateSql(self) -> SQLQuery: col = self.params.ColumnName val = self.params.Value keyword = self._get_validation_keyword() - message = self.errorMessage or f"{col} {keyword} be less than or equal to {val}." + message = ( + self.errorMessage or f"{col} {keyword} be less than or equal to {val}." + ) msg_sql = message.replace("'", "''") condition = f"{col} IS NOT NULL AND {col} > {self._lit(val)}" @@ -1977,9 +1976,7 @@ def generateSql(self) -> SQLQuery: message = self.errorMessage or f"{col_a} {keyword} be {comparator} {col_b}." msg_sql = message.replace("'", "''") - pass_predicate = ( - f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b}" - ) + pass_predicate = f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b}" condition = f"NOT ({pass_predicate})" condition = self._apply_condition(condition) @@ -2003,9 +2000,7 @@ def get_sample_sql(self) -> str: col_a = self.params.ColumnAName col_b = self.params.ColumnBName comparator = self.params.Comparator - condition = ( - f"NOT ({col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b})" - ) + condition = f"NOT ({col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b})" condition = self._apply_condition(condition) return f""" @@ -5872,7 +5867,7 @@ def __make_generator__( gen_cls = reg["generator"] # Strip reserved + 'CheckFunction' and pass as-is (no aliasing) - reserved = getattr(DuckDBCheckGenerator, "RESERVED", set()) or set() + reserved: set = getattr(DuckDBCheckGenerator, "RESERVED", set()) or set() params = { k: v for k, v in requirement.items() @@ -6253,7 +6248,7 @@ def _compile_condition_with_generators( gen_cls = reg["generator"] # Basic required-key validation (optional) - required = getattr(gen_cls, "REQUIRED_KEYS", set()) or set() + required: set = getattr(gen_cls, "REQUIRED_KEYS", set()) or set() missing = [k for k in required if k not in spec] if missing: # For conditions, you can choose to return None or raise @@ -6490,9 +6485,11 @@ def _explain_check_sql(self, check) -> dict: "row_condition_sql": meta.get("row_condition_sql"), "referenced": getattr(check, "referenced_rule_id", None), "sql": None, # executed by reference, not SQL - "note": "mirrors referenced rule outcome (no SQL)" - if special_kind == "reference" - else "executed via special executor (no SQL)", + "note": ( + "mirrors referenced rule outcome (no SQL)" + if special_kind == "reference" + else "executed via special executor (no SQL)" + ), "must_satisfy": must_satisfy, } From 3010874aac5f19975091b41487f7baea52c563df Mon Sep 17 00:00:00 2001 From: Mike Fuller Date: Thu, 14 May 2026 22:11:36 +1000 Subject: [PATCH 3/3] fix failing test Signed-off-by: Mike Fuller --- focus_validator/data_loaders/parquet_data_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/focus_validator/data_loaders/parquet_data_loader.py b/focus_validator/data_loaders/parquet_data_loader.py index 22e59cb..f90e864 100644 --- a/focus_validator/data_loaders/parquet_data_loader.py +++ b/focus_validator/data_loaders/parquet_data_loader.py @@ -163,8 +163,8 @@ def _smart_datetime_conversion( exact=False, # Allow partial matches cache=True, # Cache format inference ) - # For auto-inference, allow some nulls but require most values to convert - if candidate.null_count() < len(candidate): + # Require all values to convert successfully + if candidate.null_count() == 0: converted = candidate except Exception: pass