-
Notifications
You must be signed in to change notification settings - Fork 4.5k
feat(yaml): add schema unification for Flatten transform #35672
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5e720bb
ab7962a
f11debf
cfdc3e3
e3e74c9
12e797c
2c46680
b2db997
c5cc4a6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -932,6 +932,129 @@ def __init__(self): | |
| # pylint: disable=useless-parent-delegation | ||
| super().__init__() | ||
|
|
||
| def _unify_field_types(self, existing_type, field_type): | ||
| """Unify two field types, handling Optional and List types.""" | ||
| # Extract inner types from Optional if needed | ||
| existing_inner = ( | ||
| existing_type.__args__[0] if hasattr(existing_type, '__args__') and | ||
| len(existing_type.__args__) == 1 else existing_type) | ||
| field_inner = ( | ||
| field_type.__args__[0] if hasattr(field_type, '__args__') and | ||
| len(field_type.__args__) == 1 else field_type) | ||
|
|
||
| # Handle type unification more carefully | ||
| if existing_inner == Any or field_inner == Any: | ||
| return Optional[Any] | ||
| elif existing_inner == field_inner: | ||
| return Optional[existing_inner] | ||
| else: | ||
| # Check for list types and prioritize them over other types | ||
| from apache_beam.typehints import typehints as th | ||
| existing_is_list = ( | ||
| hasattr(existing_inner, '__origin__') and | ||
| existing_inner.__origin__ in (list, th.List)) | ||
| field_is_list = ( | ||
| hasattr(field_inner, '__origin__') and | ||
| field_inner.__origin__ in (list, th.List)) | ||
|
|
||
| if existing_is_list and field_is_list: | ||
| # Both are list types, unify their element types | ||
| existing_elem = existing_inner.__args__[ | ||
| 0] if existing_inner.__args__ else Any | ||
| field_elem = field_inner.__args__[0] if field_inner.__args__ else Any | ||
| if existing_elem == field_elem: | ||
| return Optional[th.List[existing_elem]] | ||
| else: | ||
| return Optional[th.List[Any]] | ||
| elif existing_is_list: | ||
| # Existing is list, keep it as list type | ||
| return Optional[existing_inner] | ||
| elif field_is_list: | ||
| # New field is list, use list type | ||
| return Optional[field_inner] | ||
|
Comment on lines
+969
to
+974
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this not just
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see we're trying to avoid Union types. Probably this just needs to be encoded as
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is what line 978 does.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, but if I'm unifying |
||
| else: | ||
| # Neither is a list, use Any to avoid unsupported Union | ||
| # types in schema translation | ||
| return Optional[Any] | ||
|
Comment on lines
+935
to
+978
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current implementation of
This can lead to incorrect schema inference, for example, treating a list field as a primitive, or failing to correctly unify list types. I suggest replacing the method with a more robust implementation using from typing import get_origin, get_args
def _unify_field_types(self, existing_type, field_type):
"""Unify two field types, handling Optional and List types."""
existing_origin = get_origin(existing_type) or existing_type
field_origin = get_origin(field_type) or field_type
existing_inner = get_args(existing_type)[0] if existing_origin is Optional else existing_type
field_inner = get_args(field_type)[0] if field_origin is Optional else field_type
if existing_inner == Any or field_inner == Any:
return Optional[Any]
elif existing_inner == field_inner:
return Optional[existing_inner]
else:
existing_is_list = existing_origin in (list, th.List)
field_is_list = field_origin in (list, th.List)
if existing_is_list and field_is_list:
existing_elem = get_args(existing_inner)[0] if get_args(existing_inner) else Any
field_elem = get_args(field_inner)[0] if get_args(field_inner) else Any
if existing_elem == field_elem:
return Optional[th.List[existing_elem]]
else:
return Optional[th.List[Any]]
elif existing_is_list:
return Optional[existing_inner]
elif field_is_list:
return Optional[field_inner]
else:
return Optional[Any] |
||
|
|
||
| def _merge_schemas(self, pcolls): | ||
| """Merge schemas from multiple PCollections to create a unified schema. | ||
|
|
||
| This function creates a unified schema that contains all fields from all | ||
| input PCollections. Fields are made optional to handle missing values. | ||
| """ | ||
| from apache_beam.typehints.schemas import named_fields_from_element_type | ||
|
|
||
| # Collect all schemas | ||
| schemas = [] | ||
| for pcoll in pcolls: | ||
| if hasattr(pcoll, 'element_type') and pcoll.element_type: | ||
| try: | ||
| fields = named_fields_from_element_type(pcoll.element_type) | ||
| schemas.append(dict(fields)) | ||
| except (ValueError, TypeError): | ||
| # If we can't extract schema, skip this PCollection | ||
| continue | ||
|
|
||
| if not schemas: | ||
| return None | ||
|
|
||
| # Merge all field names and types, making them optional | ||
| all_fields = {} | ||
| for schema in schemas: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we check that there aren't conflicting types here (e.g. pcoll1 wants 'foo': int, pcoll2 wants 'foo': str)?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally this would yield a Union type.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have the tests below to validate this works with
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we actually need to handle nested structures? Could we just say given: and ignore the nested representations?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| for field_name, field_type in schema.items(): | ||
| if field_name in all_fields: | ||
| # If field exists with different type, use Union | ||
| existing_type = all_fields[field_name] | ||
| if existing_type != field_type: | ||
| all_fields[field_name] = self._unify_field_types( | ||
| existing_type, field_type) | ||
| else: | ||
| # Make field optional since not all PCollections may have it | ||
| all_fields[field_name] = Optional[field_type] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we keep track of when one of these schema difference conditions is hit and warn? |
||
|
|
||
| # Create unified schema | ||
| if all_fields: | ||
| from apache_beam.typehints.schemas import named_fields_to_schema | ||
| from apache_beam.typehints.schemas import named_tuple_from_schema | ||
| unified_schema = named_fields_to_schema(list(all_fields.items())) | ||
| return named_tuple_from_schema(unified_schema) | ||
|
|
||
| return None | ||
|
|
||
| def _unify_element_with_schema(self, element, target_schema): | ||
| """Convert an element to match the target schema, preserving existing | ||
| fields only.""" | ||
| if target_schema is None: | ||
| return element | ||
|
|
||
| # If element is already a named tuple, convert to dict first | ||
| if hasattr(element, '_asdict'): | ||
| element_dict = element._asdict() | ||
| elif isinstance(element, dict): | ||
| element_dict = element | ||
| else: | ||
| return element | ||
|
|
||
| # Create new element with only the fields that exist in the original | ||
| # element plus None for fields that are expected but missing | ||
| unified_dict = {} | ||
| for field_name in target_schema._fields: | ||
| if field_name in element_dict: | ||
| value = element_dict[field_name] | ||
| # Ensure the value matches the expected type | ||
| # This is particularly important for list fields | ||
| if value is not None and not isinstance(value, list) and hasattr( | ||
| value, '__iter__') and not isinstance(value, (str, bytes)): | ||
| # Convert iterables to lists if needed | ||
| unified_dict[field_name] = list(value) | ||
| else: | ||
damccorm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| unified_dict[field_name] = value | ||
| else: | ||
| unified_dict[field_name] = None | ||
|
|
||
| return target_schema(**unified_dict) | ||
|
|
||
| def expand(self, pcolls): | ||
| if isinstance(pcolls, beam.PCollection): | ||
| pipeline_arg = {} | ||
|
|
@@ -942,7 +1065,28 @@ def expand(self, pcolls): | |
| else: | ||
| pipeline_arg = {'pipeline': pcolls.pipeline} | ||
| pcolls = () | ||
| return pcolls | beam.Flatten(**pipeline_arg) | ||
|
|
||
| if not pcolls: | ||
| return pcolls | beam.Flatten(**pipeline_arg) | ||
|
|
||
| # Try to unify schemas | ||
| unified_schema = self._merge_schemas(pcolls) | ||
|
|
||
| if unified_schema is None: | ||
| # No schema unification needed, use standard flatten | ||
| return pcolls | beam.Flatten(**pipeline_arg) | ||
|
|
||
| # Apply schema unification to each PCollection | ||
| unified_pcolls = [] | ||
| for i, pcoll in enumerate(pcolls): | ||
| unified_pcoll = pcoll | f'UnifySchema{i}' >> beam.Map( | ||
| lambda element, schema=unified_schema: self. | ||
| _unify_element_with_schema(element, schema)).with_output_types( | ||
| unified_schema) | ||
| unified_pcolls.append(unified_pcoll) | ||
|
|
||
| # Flatten the unified PCollections | ||
| return unified_pcolls | beam.Flatten(**pipeline_arg) | ||
|
|
||
| class WindowInto(beam.PTransform): | ||
| # pylint: disable=line-too-long | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will this logic unify `Iterable[str], str` to `Optional[str]`, since `Dict` also has `__args__` of length 1? I think we want to actually check whether the outer type is `Optional`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The current way is to prioritize List, so it should be Optional[Iterable[str]].
`_unify_element_with_schema` does this conversion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see - could you add a comment here explaining that this function expects all iterables to already be coerced to lists? I agree we do that correctly below, but it is probably a little brittle to rely on this without being explicit about the API