From 7aff4f5f9efa97ba2ce2167692126b583d03bda9 Mon Sep 17 00:00:00 2001 From: Bruce Guenter Date: Fri, 13 Jun 2025 16:13:14 -0600 Subject: [PATCH 1/2] chore(config): Fix `InlineSingleUseReferencesVisitor` failing tests The above visitor is intended to inline schemas in the references section for which there is only a single reference. This visitor is, however, unable to detect some cases where there are in fact multiple references, ending up inlining the schema multiple times. This has led to bloat in generated schemas. In the core Vector schema, this affects 121 structures, shaving 844 lines off of the generated schema. --- .../src/schema/visitors/inline_single.rs | 167 ++++++++++++++---- 1 file changed, 131 insertions(+), 36 deletions(-) diff --git a/lib/vector-config/src/schema/visitors/inline_single.rs b/lib/vector-config/src/schema/visitors/inline_single.rs index 4aee9499b2398..92e8576711faf 100644 --- a/lib/vector-config/src/schema/visitors/inline_single.rs +++ b/lib/vector-config/src/schema/visitors/inline_single.rs @@ -41,7 +41,7 @@ impl Visitor for InlineSingleUseReferencesVisitor { // entire schema, by visiting the root schema in a recursive fashion, using a helper visitor. let mut occurrence_visitor = OccurrenceVisitor::default(); occurrence_visitor.visit_root_schema(root); - let occurrence_map = occurrence_visitor.into_occurrences(); + let occurrence_map = occurrence_visitor.occurrence_map; self.eligible_to_inline = occurrence_map .into_iter() @@ -141,16 +141,7 @@ fn is_inlineable_schema(definition_name: &str, schema: &SchemaObject) -> bool { #[derive(Debug, Default)] struct OccurrenceVisitor { scope_stack: SchemaScopeStack, - occurrence_map: HashMap>, -} - -impl OccurrenceVisitor { - fn into_occurrences(self) -> HashMap { - self.occurrence_map - .into_iter() - .map(|(k, v)| (k, v.len())) - .collect() - } + occurrence_map: HashMap, } impl Visitor for OccurrenceVisitor { @@ -162,23 +153,11 @@ impl Visitor for OccurrenceVisitor { visit_schema_object_scoped(self, definitions, schema); if let Some(current_schema_ref) = schema.reference.as_ref() { - // Track the named "parent" schema for the schema we're currently visiting so that if we - // visit this schema again, we don't double count any schema references that it has. The - // "parent" schema is simply the closest ancestor schema that was itself a schema - // reference, or "Root", which represents the oldest schema ancestor in the document. - // - // This lets us couple with scenarios where schema A references schema B, and is the - // only actual direct schema reference to schema B, but due to multiple schemas - // referencing schema A, would otherwise lead to both A and B being visited multiple - // times. - let current_parent_schema_ref = self.get_current_schema_scope().clone(); let current_schema_ref = get_cleaned_schema_reference(current_schema_ref); - - let occurrences = self + *self .occurrence_map .entry(current_schema_ref.into()) - .or_default(); - occurrences.insert(current_parent_schema_ref); + .or_default() += 1; } } } @@ -257,17 +236,7 @@ mod tests { assert_schemas_eq(expected_schema, actual_schema); } - // TODO(tobz): These two tests are ignored because the inliner currently works off of schema - // reference scopes, so two object properties within the same schema don't count as two - // instances of a schema being referenced. - // - // We need to refactor schema scopes to be piecemeal extensible more in the way of how - // `jsonschema` does it rather than an actual stack.... the current approach is good enough for - // now, but not optimal in the way that it could be. - // - // These are here for when I improve the situation after getting this merged. #[test] - #[ignore] fn single_ref_multiple_usages() { let mut actual_schema = as_schema(json!({ "definitions": { @@ -294,7 +263,6 @@ mod tests { } #[test] - #[ignore] fn multiple_refs_mixed_usages() { let mut actual_schema = as_schema(json!({ "definitions": { @@ -346,4 +314,131 @@ mod tests { assert_schemas_eq(expected_schema, actual_schema); } + + #[test] + fn reference_in_multiple_arrays() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "item": { + "type": "object", + "properties": { + "x": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "arr1": { "type": "array", "items": { "$ref": "#/definitions/item" } }, + "arr2": { "type": "array", "items": { "$ref": "#/definitions/item" } } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + fn reference_in_oneof_anyof_allof() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "shared": { + "type": "object", + "properties": { + "y": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "choice": { + "oneOf": [ + { "$ref": "#/definitions/shared" }, + { "$ref": "#/definitions/shared" } + ], + "anyOf": [ + { "$ref": "#/definitions/shared" }, + { "type": "null" } + ], + "allOf": [ + { "$ref": "#/definitions/shared" }, + { "type": "object" } + ] + } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + fn reference_in_additional_properties() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "val": { + "type": "object", + "properties": { + "z": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "obj1": { + "type": "object", + "additionalProperties": { "$ref": "#/definitions/val" } + }, + "obj2": { + "type": "object", + "additionalProperties": { "$ref": "#/definitions/val" } + } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + fn reference_in_pattern_properties() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "pat": { + "type": "object", + "properties": { + "w": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "obj": { + "type": "object", + "patternProperties": { + "^foo$": { "$ref": "#/definitions/pat" }, + "^bar$": { "$ref": "#/definitions/pat" } + } + } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } } From e81f5d9458e156bcfc637b3ddbc0adb8c816795f Mon Sep 17 00:00:00 2001 From: Bruce Guenter Date: Mon, 16 Jun 2025 15:19:30 -0600 Subject: [PATCH 2/2] Regenerate component docs --- website/cue/reference/components/sinks/base/http.cue | 8 ++------ .../cue/reference/components/sinks/base/opentelemetry.cue | 8 ++------ .../reference/components/sources/base/static_metrics.cue | 8 ++------ 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/website/cue/reference/components/sinks/base/http.cue b/website/cue/reference/components/sinks/base/http.cue index 286cf7dd842fb..71f7bb5945545 100644 --- a/website/cue/reference/components/sinks/base/http.cue +++ b/website/cue/reference/components/sinks/base/http.cue @@ -683,12 +683,8 @@ base: components: sinks: http: configuration: { } } method: { - description: """ - HTTP method. - - The HTTP method to use when making the request. - """ - required: false + description: "The HTTP method to use when making the request." + required: false type: string: { default: "post" enum: { diff --git a/website/cue/reference/components/sinks/base/opentelemetry.cue b/website/cue/reference/components/sinks/base/opentelemetry.cue index 0f9d2a62585c1..f7bb74741c554 100644 --- a/website/cue/reference/components/sinks/base/opentelemetry.cue +++ b/website/cue/reference/components/sinks/base/opentelemetry.cue @@ -686,12 +686,8 @@ base: components: sinks: opentelemetry: configuration: protocol: { } } method: { - description: """ - HTTP method. - - The HTTP method to use when making the request. - """ - required: false + description: "The HTTP method to use when making the request." + required: false type: string: { default: "post" enum: { diff --git a/website/cue/reference/components/sources/base/static_metrics.cue b/website/cue/reference/components/sources/base/static_metrics.cue index 939f4b8471fbc..78f34bfb1a5d3 100644 --- a/website/cue/reference/components/sources/base/static_metrics.cue +++ b/website/cue/reference/components/sources/base/static_metrics.cue @@ -218,12 +218,8 @@ base: components: sources: static_metrics: configuration: { type: float: {} } bins: { - description: """ - A split representation of sketch bins. - - The bins within the sketch. - """ - required: true + description: "The bins within the sketch." + required: true type: object: options: { k: { description: "The bin keys."