diff --git a/.github/trigger_files/beam_PostCommit_Python.json b/.github/trigger_files/beam_PostCommit_Python.json index 06bd728be6d7..00e0c3c25433 100644 --- a/.github/trigger_files/beam_PostCommit_Python.json +++ b/.github/trigger_files/beam_PostCommit_Python.json @@ -1,5 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run.", - "modification": 33 + "modification": 27 } diff --git a/.github/workflows/beam_Infrastructure_SecurityLogging.yml b/.github/workflows/beam_Infrastructure_SecurityLogging.yml new file mode 100644 index 000000000000..c364056f5683 --- /dev/null +++ b/.github/workflows/beam_Infrastructure_SecurityLogging.yml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# This workflow works with the GCP security log analyzer to +# generate weekly security reports and initialize log sinks + +name: GCP Security Log Analyzer + +on: + workflow_dispatch: + schedule: + # Once a week at 9:00 AM on Monday + - cron: '0 9 * * 1' + push: + paths: + - 'infra/security/config.yml' + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + contents: read + +jobs: + beam_GCP_Security_LogAnalyzer: + name: GCP Security Log Analysis + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.13' + + - name: Install Python dependencies + working-directory: ./infra/security + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Setup gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Initialize Log Sinks + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' + working-directory: ./infra/security + run: python log_analyzer.py --config config.yml initialize + + - name: Generate Weekly Security Report + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + working-directory: ./infra/security + env: + SMTP_SERVER: smtp.gmail.com + SMTP_PORT: 465 + EMAIL_ADDRESS: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_ADDRESS }} + EMAIL_PASSWORD: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_PASSWORD }} + EMAIL_RECIPIENT: "dev@beam.apache.org" + run: python log_analyzer.py --config config.yml generate-report --dry-run diff --git a/.github/workflows/beam_PreCommit_Python_Coverage.yml 
b/.github/workflows/beam_PreCommit_Python_Coverage.yml index b21ad50e9da2..3da51a2eceda 100644 --- a/.github/workflows/beam_PreCommit_Python_Coverage.yml +++ b/.github/workflows/beam_PreCommit_Python_Coverage.yml @@ -58,35 +58,45 @@ env: jobs: beam_PreCommit_Python_Coverage: - name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - runs-on: [self-hosted, ubuntu-20.04, highmem] + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) (${{ join(matrix.os, ', ') }}) + runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: job_name: [beam_PreCommit_Python_Coverage] job_phrase: [Run Python_Coverage PreCommit] + python_version: ['3.9'] + # Run on both self-hosted and GitHub-hosted runners. + # Some tests (marked require_docker_in_docker) can't run on Beam's + # self-hosted runners due to Docker-in-Docker environment constraint. + # These tests will only execute on ubuntu-latest (GitHub-hosted). + # Context: https://github.com/apache/beam/pull/35585 + # Temporarily removed the ubuntu-latest env until the dependency issues are resolved. 
+ os: [[self-hosted, ubuntu-20.04, highmem]] timeout-minutes: 180 if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || - github.event.comment.body == 'Run Python_Coverage PreCommit' + startswith(github.event.comment.body, 'Run Python_Coverage PreCommit 3.') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{ matrix.job_phrase }} + comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) (${{ join(matrix.os, ', ') }}) - name: Setup environment uses: ./.github/actions/setup-environment-action with: java-version: default - python-version: default + python-version: ${{ matrix.python_version }} - name: Start DinD uses: ./.github/actions/dind-up-action id: dind + if: contains(matrix.os, 'self-hosted') with: # Enable all the new features cleanup-dind-on-start: "true" @@ -97,9 +107,9 @@ jobs: export-gh-env: "true" - name: Run preCommitPyCoverage env: - DOCKER_HOST: ${{ steps.dind.outputs.docker-host }} + DOCKER_HOST: ${{ contains(matrix.os, 'self-hosted') && steps.dind.outputs.docker-host || '' }} TOX_TESTENV_PASSENV: "DOCKER_*,TESTCONTAINERS_*,TC_*,BEAM_*,GRPC_*,OMP_*,OPENBLAS_*,PYTHONHASHSEED,PYTEST_*" - TESTCONTAINERS_HOST_OVERRIDE: ${{ env.DIND_IP }} + TESTCONTAINERS_HOST_OVERRIDE: ${{ contains(matrix.os, 'self-hosted') && env.DIND_IP || '' }} TESTCONTAINERS_DOCKER_SOCKET_OVERRIDE: "/var/run/docker.sock" TESTCONTAINERS_RYUK_DISABLED: "false" TESTCONTAINERS_RYUK_CONTAINER_PRIVILEGED: "true" @@ -110,6 +120,12 @@ jobs: uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: 
:sdks:python:test-suites:tox:py39:preCommitPyCoverage + arguments: | + -Pposargs="${{ + contains(matrix.os, 'self-hosted') && + '-m (not require_docker_in_docker)' || + '-m require_docker_in_docker' + }}" - uses: codecov/codecov-action@v3 with: flags: python @@ -118,7 +134,7 @@ jobs: uses: actions/upload-artifact@v4 if: failure() with: - name: Python Test Results + name: Python ${{ matrix.python_version }} Test Results (${{ join(matrix.os, ', ') }}) path: '**/pytest*.xml' - name: Publish Python Test Results env: @@ -129,4 +145,5 @@ jobs: commit: '${{ env.prsha || env.GITHUB_SHA }}' comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} files: '**/pytest*.xml' - large_files: true \ No newline at end of file + large_files: true + check_name: "Python ${{ matrix.python_version }} Test Results (${{ join(matrix.os, ', ') }})" diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 012c8d225714..95a33a8520d8 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -840,7 +840,9 @@ class BeamModulePlugin implements Plugin { log4j2_log4j12_api : "org.apache.logging.log4j:log4j-1.2-api:$log4j2_version", mockito_core : "org.mockito:mockito-core:4.11.0", mockito_inline : "org.mockito:mockito-inline:4.11.0", - mongo_java_driver : "org.mongodb:mongo-java-driver:3.12.11", + mongo_java_driver : "org.mongodb:mongodb-driver-sync:5.5.0", + mongo_bson : "org.mongodb:bson:5.5.0", + mongodb_driver_core : "org.mongodb:mongodb-driver-core:5.5.0", nemo_compiler_frontend_beam : "org.apache.nemo:nemo-compiler-frontend-beam:$nemo_version", netty_all : "io.netty:netty-all:$netty_version", netty_handler : "io.netty:netty-handler:$netty_version", diff --git a/infra/security/README.md b/infra/security/README.md new file mode 100644 index 
000000000000..0e60c4b33043 --- /dev/null +++ b/infra/security/README.md @@ -0,0 +1,84 @@ + + +# GCP Security Analyzer + +This document describes the implementation of a security analyzer for Google Cloud Platform (GCP) resources. The analyzer is designed to enhance security monitoring within our GCP environment by capturing critical events and generating alerts for specific security-sensitive actions. + +## How It Works + +1. **Log Sinks**: The system uses [GCP Log Sinks](https://cloud.google.com/logging/docs/export/configure_export_v2) to capture specific security-related log entries. These sinks are configured to filter for events like IAM policy changes or service account key creation. +2. **Log Storage**: The filtered logs are routed to a dedicated Google Cloud Storage (GCS) bucket for persistence and analysis. +3. **Report Generation**: A scheduled job runs weekly, executing the `log_analyzer.py` script. +4. **Email Alerts**: The script analyzes the logs from the past week, compiles a summary of security events, and sends a report to a configured email address. + +## Configuration + +The behavior of the log analyzer is controlled by a `config.yml` file. Here’s an overview of the configuration options: + +- `project_id`: The GCP project ID where the resources are located. +- `bucket_name`: The name of the GCS bucket where logs will be stored. +- `logging`: Configures the logging level and format for the script. +- `sinks`: A list of log sinks to be created. Each sink has the following properties: + - `name`: A unique name for the sink. + - `description`: A brief description of what the sink monitors. + - `filter_methods`: A list of GCP API methods to include in the filter (e.g., `SetIamPolicy`). + - `excluded_principals`: A list of service accounts or user emails to exclude from monitoring, such as CI/CD service accounts. 
+ +### Example Configuration (`config.yml`) + +```yaml +project_id: your-gcp-project-id +bucket_name: your-log-storage-bucket + +sinks: + - name: iam-policy-changes + description: Monitors changes to IAM policies. + filter_methods: + - "SetIamPolicy" + excluded_principals: + - "ci-cd-account@your-project.iam.gserviceaccount.com" +``` + +## Usage + +The `log_analyzer.py` script provides two main commands for managing the security analyzer. + +### Initializing Sinks + +To create or update the log sinks in GCP based on your `config.yml` file, run the following command: + +```bash +python log_analyzer.py --config config.yml initialize +``` + +This command ensures that the log sinks are correctly configured to capture the desired security events. + +### Generating Weekly Reports + +To generate and send the weekly security report, run this command: + +```bash +python log_analyzer.py --config config.yml generate-report +``` + +This is typically run as a scheduled job (GitHub Action) to automate the delivery of weekly security reports. + + + diff --git a/infra/security/config.yml b/infra/security/config.yml new file mode 100644 index 000000000000..9565623be16d --- /dev/null +++ b/infra/security/config.yml @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +project_id: testing-me-460223 + +# Logging +logging: + level: DEBUG + format: "[%(asctime)s] %(levelname)s: %(message)s" + +# gcloud storage bucket +bucket_name: "testing-me-460223-tfstate" + +# GCP Log sinks +sinks: + - name: iam-policy-changes + description: Monitors changes to IAM policies, excluding approved CI/CD service accounts. + filter_methods: + - "SetIamPolicy" + excluded_principals: + - beam-github-actions@apache-beam-testing.iam.gserviceaccount.com + - github-self-hosted-runners@apache-beam-testing.iam.gserviceaccount.com + + - name: sa-key-management + description: Monitors creation and deletion of service account keys. + filter_methods: + - "google.iam.admin.v1.IAM.CreateServiceAccountKey" + - "google.iam.admin.v1.IAM.DeleteServiceAccountKey" + excluded_principals: + - beam-github-actions@apache-beam-testing.iam.gserviceaccount.com + - github-self-hosted-runners@apache-beam-testing.iam.gserviceaccount.com diff --git a/infra/security/log_analyzer.py b/infra/security/log_analyzer.py new file mode 100644 index 000000000000..55ab4495e24f --- /dev/null +++ b/infra/security/log_analyzer.py @@ -0,0 +1,333 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import ssl +import yaml +import logging +import smtplib +import os +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from google.cloud import logging_v2 +from google.cloud import storage +from typing import List, Dict, Any +import argparse + +REPORT_SUBJECT = "Weekly IAM Security Events Report" +REPORT_BODY_TEMPLATE = """ +Hello Team, + +Please find below the summary of IAM security events for the past week: + +{event_summary} + +Best Regards, +Automated GitHub Action +""" + +@dataclass +class SinkCls: + name: str + description: str + filter_methods: List[str] + excluded_principals: List[str] + +class LogAnalyzer(): + def __init__(self, project_id: str, gcp_bucket: str, logger: logging.Logger, sinks: List[SinkCls]): + self.project_id = project_id + self.bucket = gcp_bucket + self.logger = logger + self.sinks = sinks + + def _construct_filter(self, sink: SinkCls) -> str: + """ + Constructs a filter string for a given sink. + + Args: + sink (Sink): The sink object containing filter information. + + Returns: + str: The constructed filter string. + """ + + method_filters = [] + for method in sink.filter_methods: + method_filters.append(f'protoPayload.methodName="{method}"') + + exclusion_filters = [] + for principal in sink.excluded_principals: + exclusion_filters.append(f'protoPayload.authenticationInfo.principalEmail != "{principal}"') + + if method_filters and exclusion_filters: + filter_ = f"({' OR '.join(method_filters)}) AND ({' AND '.join(exclusion_filters)})" + elif method_filters: + filter_ = f"({' OR '.join(method_filters)})" + elif exclusion_filters: + filter_ = f"({' AND '.join(exclusion_filters)})" + else: + filter_ = "" + + return filter_ + + def _create_log_sink(self, sink: SinkCls) -> None: + """ + Creates a log sink in GCP if it doesn't already exist. 
+ If it already exists, it updates the sink with the new filter in case the filter has changed. + + Args: + sink (Sink): The sink object to create. + """ + logging_client = logging_v2.Client(project=self.project_id) + filter_ = self._construct_filter(sink) + destination = "storage.googleapis.com/{bucket}".format(bucket=self.bucket) + + sink_client = logging_client.sink(sink.name, filter_=filter_, destination=destination) + + if sink_client.exists(): + self.logger.debug(f"Sink {sink.name} already exists.") + sink_client.reload() + if sink_client.filter_ != filter_: + sink_client.filter_ = filter_ + sink_client.update() + self.logger.info(f"Updated sink {sink.name}'s filter.") + else: + sink_client.create() + self.logger.info(f"Created sink {sink.name}.") + # Reload the sink to get the writer_identity, this may take a few moments + sink_client.reload() + + self._grant_bucket_permissions(sink_client) + + logging_client.close() + + def _grant_bucket_permissions(self, sink: logging_v2.Sink) -> None: + """ + Grants a log sink's writer identity permissions to write to the bucket. + """ + logging_client = logging_v2.Client(project=self.project_id) + storage_client = storage.Client(project=self.project_id) + + sink.reload() + writer_identity = sink.writer_identity + if not writer_identity: + self.logger.warning(f"Could not retrieve writer identity for sink {sink.name}. " + f"Manual permission granting might be required.") + return + + bucket = storage_client.get_bucket(self.bucket) + policy = bucket.get_iam_policy(requested_policy_version=3) + iam_role = "roles/storage.objectCreator" + + # Workaround for projects where the writer_identity is not a valid service account. 
+ if writer_identity == "serviceAccount:cloud-logs@system.gserviceaccount.com": + member = "group:cloud-logs@google.com" + else: + member = writer_identity if ":" in writer_identity else f"serviceAccount:{writer_identity}"  # keep an existing member-type prefix instead of doubling it + + # Check if the policy is already configured + if any(member in b.get("members", []) and b.get("role") == iam_role for b in policy.bindings): + self.logger.debug(f"Sink {sink.name} already has the necessary permissions.") + return + + policy.bindings.append({ + "role": iam_role, + "members": {member} + }) + + bucket.set_iam_policy(policy) + self.logger.info(f"Granted {iam_role} to {member} on bucket {self.bucket} for sink {sink.name}.") + + def initialize_sinks(self) -> None: + for sink in self.sinks: + self._create_log_sink(sink) + self.logger.info(f"Initialized sink: {sink.name}") + + def get_event_logs(self, days: int = 7) -> List[Dict[str, Any]]: + """ + Reads and retrieves log events from the specified time range from the GCP Cloud Storage bucket. + + Args: + days (int): The number of days to look back for log analysis. + + Returns: + List[Dict[str, Any]]: A list of log entries that match the specified time range. 
+ """ + found_events = [] + storage_client = storage.Client(project=self.project_id) + + now = datetime.now(timezone.utc) + end_time = now.replace(minute=0, second=0, microsecond=0) - timedelta(minutes=30) + start_time = end_time - timedelta(days=days) + + blobs = storage_client.list_blobs(self.bucket) + for blob in blobs: + if not (start_time <= blob.time_created < end_time): + continue + + self.logger.debug(f"Processing blob: {blob.name}") + content = blob.download_as_bytes().decode("utf-8") + + for num, line in enumerate(content.splitlines(), 1): + try: + log_entry = json.loads(line) + payload = log_entry.get("protoPayload") if isinstance(log_entry, dict) else None  # valid JSON that is not an object is not a log entry + if not payload: + self.logger.warning(f"Skipping log in blob {blob.name}, line {num}: no protoPayload found.") + continue + + event_details = { + "timestamp": log_entry.get("timestamp", "N/A"), + "principal": payload.get("authenticationInfo", {}).get("principalEmail", "N/A"), + "method": payload.get("methodName", "N/A"), + "resource": payload.get("resourceName", "N/A"), + "project_id": log_entry.get("resource", {}).get("labels", {}).get("project_id", "N/A"), + "file_name": blob.name + } + found_events.append(event_details) + except json.JSONDecodeError: + self.logger.warning(f"Skipping invalid JSON log in blob {blob.name}, line {num}.") + continue + + storage_client.close() + return found_events + + def create_weekly_email_report(self, dry_run: bool = False) -> None: + """ + Creates an email report based on the events found this week. + If `dry_run` is True, it will print the report to the console instead of sending it. 
+ """ + events = self.get_event_logs(days=7) + if not events: + self.logger.info("No events found for the weekly report.") + return + + events.sort(key=lambda x: x['timestamp'], reverse=True) + event_summary = "\n".join( + f"Timestamp: {event['timestamp']}, Principal: {event['principal']}, Method: {event['method']}, Resource: {event['resource']}, Project ID: {event['project_id']}, File: {event['file_name']}" + for event in events + ) + + report_subject = REPORT_SUBJECT + report_body = REPORT_BODY_TEMPLATE.format(event_summary=event_summary) + + if dry_run: + self.logger.info("Dry run: printing email report to console.") + print(f"Subject: {report_subject}\n") + print(f"Body:\n{report_body}") + return + + self.send_email(report_subject, report_body) + + def send_email(self, subject: str, body: str) -> None: + """ + Sends an email with the specified subject and body. + If email configuration is not fully set, it prints the email instead. + + Args: + subject (str): The subject of the email. + body (str): The body of the email. + """ + smtp_server = os.getenv("SMTP_SERVER") + smtp_port_str = os.getenv("SMTP_PORT") + recipient = os.getenv("EMAIL_RECIPIENT") + email = os.getenv("EMAIL_ADDRESS") + password = os.getenv("EMAIL_PASSWORD") + + if not all([smtp_server, smtp_port_str, recipient, email, password]): + self.logger.warning("Email configuration is not fully set. 
Printing email instead.") + print(f"Subject: {subject}\n") + print(f"Body:\n{body}") + return + + assert smtp_server is not None + assert smtp_port_str is not None + assert recipient is not None + assert email is not None + assert password is not None + + message = f"Subject: {subject}\n\n{body}" + context = ssl.create_default_context() + + try: + smtp_port = int(smtp_port_str) + with smtplib.SMTP_SSL(smtp_server, smtp_port, context=context) as server: + server.login(email, password) + server.sendmail(email, recipient, message) + self.logger.info(f"Successfully sent email report to {recipient}") + except Exception as e: + self.logger.error(f"Failed to send email report: {e}") + +def load_config_from_yaml(config_path: str) -> Dict[str, Any]: + with open(config_path, 'r') as file: + config = yaml.safe_load(file) + + c = { + "project_id": config.get("project_id"), + "gcp_bucket": config.get("bucket_name"), + "sinks": [], + "logger": logging.getLogger(__name__) + } + + for sink_config in config.get("sinks", []): + sink = SinkCls( + name=sink_config["name"], + description=sink_config["description"], + filter_methods=sink_config.get("filter_methods", []), + excluded_principals=sink_config.get("excluded_principals", []) + ) + c["sinks"].append(sink) + + logging_config = config.get("logging", {}) + log_level = logging_config.get("level", "INFO") + log_format = logging_config.get("format", "[%(asctime)s] %(levelname)s: %(message)s") + + c["logger"].setLevel(log_level) + logging.basicConfig(level=log_level, format=log_format) + + return c + +def main(): + """ + Main entry point for the script. 
+ """ + parser = argparse.ArgumentParser(description="GCP IAM Log Analyzer") + parser.add_argument("--config", required=True, help="Path to the configuration YAML file.") + + subparsers = parser.add_subparsers(dest="command", required=True) + + subparsers.add_parser("initialize", help="Initialize/update log sinks in GCP.") + report_parser = subparsers.add_parser("generate-report", help="Generate and send the weekly IAM security report.") + report_parser.add_argument("--dry-run", action="store_true", help="Do not send email, print report to console.") + + args = parser.parse_args() + + config = load_config_from_yaml(args.config) + log_analyzer = LogAnalyzer( + project_id=config["project_id"], + gcp_bucket=config["gcp_bucket"], + logger=config["logger"], + sinks=config["sinks"] + ) + + if args.command == "initialize": + log_analyzer.initialize_sinks() + log_analyzer.logger.info("Sinks initialized successfully.") + elif args.command == "generate-report": + log_analyzer.create_weekly_email_report(dry_run=args.dry_run) + log_analyzer.logger.info("Weekly report generation process completed.") + +if __name__ == "__main__": + main() diff --git a/infra/security/requirements.txt b/infra/security/requirements.txt new file mode 100644 index 000000000000..a4abb8bc5acf --- /dev/null +++ b/infra/security/requirements.txt @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PyYAML==6.0.2 +google-cloud-storage==3.3.0 +google-cloud-logging==3.12.1 diff --git a/it/mongodb/build.gradle b/it/mongodb/build.gradle index 6be9b91f5b34..960e15af8394 100644 --- a/it/mongodb/build.gradle +++ b/it/mongodb/build.gradle @@ -35,6 +35,7 @@ dependencies { implementation library.java.testcontainers_mongodb implementation library.java.google_code_gson implementation library.java.mongo_java_driver + implementation library.java.mongo_bson implementation library.java.vendored_guava_32_1_2_jre testImplementation library.java.mockito_core diff --git a/sdks/go.mod b/sdks/go.mod index df83d0f31e7c..7aef3b6ca0c1 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -31,14 +31,14 @@ require ( cloud.google.com/go/profiler v0.4.3 cloud.google.com/go/pubsub v1.50.0 cloud.google.com/go/spanner v1.83.0 - cloud.google.com/go/storage v1.56.0 + cloud.google.com/go/storage v1.56.1 github.com/aws/aws-sdk-go-v2 v1.38.1 - github.com/aws/aws-sdk-go-v2/config v1.31.2 - github.com/aws/aws-sdk-go-v2/credentials v1.18.6 + github.com/aws/aws-sdk-go-v2/config v1.31.3 + github.com/aws/aws-sdk-go-v2/credentials v1.18.7 github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.18.5 github.com/aws/aws-sdk-go-v2/service/s3 v1.87.0 github.com/aws/smithy-go v1.22.5 - github.com/docker/go-connections v0.5.0 + github.com/docker/go-connections v0.6.0 github.com/dustin/go-humanize v1.0.1 github.com/go-sql-driver/mysql v1.9.3 github.com/google/go-cmp v0.7.0 @@ -60,9 +60,9 @@ require ( golang.org/x/sync v0.16.0 golang.org/x/sys v0.35.0 golang.org/x/text v0.28.0 - google.golang.org/api v0.247.0 + 
google.golang.org/api v0.248.0 google.golang.org/genproto v0.0.0-20250603155806-513f23925822 - google.golang.org/grpc v1.74.2 + google.golang.org/grpc v1.75.0 google.golang.org/protobuf v1.36.8 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 @@ -77,14 +77,14 @@ require ( require ( cel.dev/expr v0.24.0 // indirect - cloud.google.com/go/auth v0.16.4 // indirect + cloud.google.com/go/auth v0.16.5 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/monitoring v1.24.2 // indirect cloud.google.com/go/pubsub/v2 v2.0.0 // indirect dario.cat/mergo v1.0.1 // indirect filippo.io/edwards25519 v1.1.0 // indirect github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.3 // indirect - github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.27.0 // indirect + github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.53.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.53.0 // indirect github.com/apache/arrow/go/v15 v15.0.2 // indirect @@ -96,7 +96,7 @@ require ( github.com/distribution/reference v0.6.0 // indirect github.com/ebitengine/purego v0.8.4 // indirect github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect - github.com/go-jose/go-jose/v4 v4.0.5 // indirect + github.com/go-jose/go-jose/v4 v4.1.1 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.3.0 // indirect @@ -125,18 +125,18 @@ require ( go.opentelemetry.io/contrib/detectors/gcp v1.36.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect - go.opentelemetry.io/otel v1.36.0 // indirect + go.opentelemetry.io/otel v1.37.0 // indirect 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.36.0 // indirect - go.opentelemetry.io/otel/sdk v1.36.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.36.0 // indirect - go.opentelemetry.io/otel/trace v1.36.0 // indirect + go.opentelemetry.io/otel/metric v1.37.0 // indirect + go.opentelemetry.io/otel/sdk v1.37.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.37.0 // indirect + go.opentelemetry.io/otel/trace v1.37.0 // indirect go.shabbyrobe.org/gocovmerge v0.0.0-20230507111327-fa4f82cfbf4d // indirect golang.org/x/time v0.12.0 // indirect ) require ( - cloud.google.com/go v0.121.4 // indirect + cloud.google.com/go v0.121.6 // indirect cloud.google.com/go/compute/metadata v0.8.0 // indirect cloud.google.com/go/iam v1.5.2 // indirect cloud.google.com/go/longrunning v0.6.7 // indirect @@ -156,7 +156,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.4 // indirect github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.3 // indirect github.com/aws/aws-sdk-go-v2/service/sso v1.28.2 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.33.2 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.0 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.38.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect @@ -205,6 +205,6 @@ require ( golang.org/x/mod v0.26.0 // indirect golang.org/x/tools v0.35.0 // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250721164621-a45f3dfb1074 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250818200422-3122310a409c // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum 
index 4d649d549be5..76aa0b7a5af3 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -40,8 +40,8 @@ cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRY cloud.google.com/go v0.105.0/go.mod h1:PrLgOJNe5nfE9UMxKxgXj4mD3voiP+YQ6gdt6KMFOKM= cloud.google.com/go v0.107.0/go.mod h1:wpc2eNrD7hXUTy8EKS10jkxpZBjASrORK7goS+3YX2I= cloud.google.com/go v0.110.0/go.mod h1:SJnCLqQ0FCFGSZMUNUf84MV3Aia54kn7pi8st7tMzaY= -cloud.google.com/go v0.121.4 h1:cVvUiY0sX0xwyxPwdSU2KsF9knOVmtRyAMt8xou0iTs= -cloud.google.com/go v0.121.4/go.mod h1:XEBchUiHFJbz4lKBZwYBDHV/rSyfFktk737TLDU089s= +cloud.google.com/go v0.121.6 h1:waZiuajrI28iAf40cWgycWNgaXPO06dupuS+sgibK6c= +cloud.google.com/go v0.121.6/go.mod h1:coChdst4Ea5vUpiALcYKXEpR1S9ZgXbhEzzMcMR66vI= cloud.google.com/go/accessapproval v1.4.0/go.mod h1:zybIuC3KpDOvotz59lFe5qxRZx6C75OtwbisN56xYB4= cloud.google.com/go/accessapproval v1.5.0/go.mod h1:HFy3tuiGvMdcd/u+Cu5b9NkO1pEICJ46IR82PoUdplw= cloud.google.com/go/accessapproval v1.6.0/go.mod h1:R0EiYnwV5fsRFiKZkPHr6mwyk2wxUJ30nL4j2pcFY2E= @@ -103,8 +103,8 @@ cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVo cloud.google.com/go/assuredworkloads v1.8.0/go.mod h1:AsX2cqyNCOvEQC8RMPnoc0yEarXQk6WEKkxYfL6kGIo= cloud.google.com/go/assuredworkloads v1.9.0/go.mod h1:kFuI1P78bplYtT77Tb1hi0FMxM0vVpRC7VVoJC3ZoT0= cloud.google.com/go/assuredworkloads v1.10.0/go.mod h1:kwdUQuXcedVdsIaKgKTp9t0UJkE5+PAVNhdQm4ZVq2E= -cloud.google.com/go/auth v0.16.4 h1:fXOAIQmkApVvcIn7Pc2+5J8QTMVbUGLscnSVNl11su8= -cloud.google.com/go/auth v0.16.4/go.mod h1:j10ncYwjX/g3cdX7GpEzsdM+d+ZNsXAbb6qXA7p1Y5M= +cloud.google.com/go/auth v0.16.5 h1:mFWNQ2FEVWAliEQWpAdH80omXFokmrnbDhUS9cBywsI= +cloud.google.com/go/auth v0.16.5/go.mod h1:utzRfHMP+Vv0mpOkTRQoWD2q3BatTOoWbA7gCc2dUhQ= cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= 
cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= @@ -575,8 +575,8 @@ cloud.google.com/go/storage v1.23.0/go.mod h1:vOEEDNFnciUMhBeT6hsJIn3ieU5cFRmzeL cloud.google.com/go/storage v1.27.0/go.mod h1:x9DOL8TK/ygDUMieqwfhdpQryTeEkhGKMi80i/iqR2s= cloud.google.com/go/storage v1.28.1/go.mod h1:Qnisd4CqDdo6BGs2AD5LLnEsmSQ80wQ5ogcBBKhU86Y= cloud.google.com/go/storage v1.29.0/go.mod h1:4puEjyTKnku6gfKoTfNOU/W+a9JyuVNxjpS5GBrB8h4= -cloud.google.com/go/storage v1.56.0 h1:iixmq2Fse2tqxMbWhLWC9HfBj1qdxqAmiK8/eqtsLxI= -cloud.google.com/go/storage v1.56.0/go.mod h1:Tpuj6t4NweCLzlNbw9Z9iwxEkrSem20AetIeH/shgVU= +cloud.google.com/go/storage v1.56.1 h1:n6gy+yLnHn0hTwBFzNn8zJ1kqWfR91wzdM8hjRF4wP0= +cloud.google.com/go/storage v1.56.1/go.mod h1:C9xuCZgFl3buo2HZU/1FncgvvOgTAs/rnh4gF4lMg0s= cloud.google.com/go/storagetransfer v1.5.0/go.mod h1:dxNzUopWy7RQevYFHewchb29POFv3/AaBgnhqzqiK0w= cloud.google.com/go/storagetransfer v1.6.0/go.mod h1:y77xm4CQV/ZhFZH75PLEXY0ROiS7Gh6pSKrM8dJyg6I= cloud.google.com/go/storagetransfer v1.7.0/go.mod h1:8Giuj1QNb1kfLAiWM1bN6dHzfdlDAVC9rv9abHot2W4= @@ -705,8 +705,8 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/GoogleCloudPlatform/cloudsql-proxy v1.29.0/go.mod h1:spvB9eLJH9dutlbPSRmHvSXXHOwGRyeXh1jVdquA2G8= github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.3 h1:2afWGsMzkIcN8Qm4mgPJKZWyroE5QBszMiDMYEBrnfw= github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.3/go.mod h1:dppbR7CwXD4pgtV9t3wD1812RaLDcBjtblcDF5f1vI0= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.27.0 h1:ErKg/3iS1AKcTkf3yixlZ54f9U1rljCkQyEXWUnIUxc= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.27.0/go.mod h1:yAZHSGnqScoU556rBOVkwLze6WP5N+U11RHuWaGVxwY= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 h1:UQUsRi8WTzhZntp5313l+CHIAT95ojUI2lpP/ExlZa4= 
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0/go.mod h1:Cz6ft6Dkn3Et6l2v2a9/RpN7epQ1GtDlO6lj8bEcOvw= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.53.0 h1:owcC2UnmsZycprQ5RfRgjydWhuoxg71LUfyiQdijZuM= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.53.0/go.mod h1:ZPpqegjbE99EPKsu3iUWV22A04wzGPcAY/ziSIQEEgs= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.53.0 h1:4LP6hvB4I5ouTbGgWtixJhgED6xdf67twf9PoY96Tbg= @@ -757,12 +757,12 @@ github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.0 h1:6GMWV6CNpA/6fbFH github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.0/go.mod h1:/mXlTIVG9jbxkqDnr5UQNQxW1HRYxeGklkM9vAFeabg= github.com/aws/aws-sdk-go-v2/config v1.15.3/go.mod h1:9YL3v07Xc/ohTsxFXzan9ZpFpdTOFl4X65BAKYaz8jg= github.com/aws/aws-sdk-go-v2/config v1.25.3/go.mod h1:tAByZy03nH5jcq0vZmkcVoo6tRzRHEwSFx3QW4NmDw8= -github.com/aws/aws-sdk-go-v2/config v1.31.2 h1:NOaSZpVGEH2Np/c1toSeW0jooNl+9ALmsUTZ8YvkJR0= -github.com/aws/aws-sdk-go-v2/config v1.31.2/go.mod h1:17ft42Yb2lF6OigqSYiDAiUcX4RIkEMY6XxEMJsrAes= +github.com/aws/aws-sdk-go-v2/config v1.31.3 h1:RIb3yr/+PZ18YYNe6MDiG/3jVoJrPmdoCARwNkMGvco= +github.com/aws/aws-sdk-go-v2/config v1.31.3/go.mod h1:jjgx1n7x0FAKl6TnakqrpkHWWKcX3xfWtdnIJs5K9CE= github.com/aws/aws-sdk-go-v2/credentials v1.11.2/go.mod h1:j8YsY9TXTm31k4eFhspiQicfXPLZ0gYXA50i4gxPE8g= github.com/aws/aws-sdk-go-v2/credentials v1.16.2/go.mod h1:sDdvGhXrSVT5yzBDR7qXz+rhbpiMpUYfF3vJ01QSdrc= -github.com/aws/aws-sdk-go-v2/credentials v1.18.6 h1:AmmvNEYrru7sYNJnp3pf57lGbiarX4T9qU/6AZ9SucU= -github.com/aws/aws-sdk-go-v2/credentials v1.18.6/go.mod h1:/jdQkh1iVPa01xndfECInp1v1Wnp70v3K4MvtlLGVEc= +github.com/aws/aws-sdk-go-v2/credentials v1.18.7 h1:zqg4OMrKj+t5HlswDApgvAHjxKtlduKS7KicXB+7RLg= +github.com/aws/aws-sdk-go-v2/credentials v1.18.7/go.mod h1:/4M5OidTskkgkv+nCIfC9/tbiQ/c8qTox9QcUDV0cgc= 
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.12.3/go.mod h1:uk1vhHHERfSVCUnqSqz8O48LBYDSC+k6brng09jcMOk= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.4/go.mod h1:t4i+yGHMCcUNIX1x7YVYa6bH/Do7civ5I6cG/6PMfyA= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.4 h1:lpdMwTzmuDLkgW7086jE94HweHCqG+uOJwHf3LZs7T0= @@ -816,8 +816,8 @@ github.com/aws/aws-sdk-go-v2/service/sso v1.17.2/go.mod h1:/pE21vno3q1h4bbhUOEi+ github.com/aws/aws-sdk-go-v2/service/sso v1.28.2 h1:ve9dYBB8CfJGTFqcQ3ZLAAb/KXWgYlgu/2R2TZL2Ko0= github.com/aws/aws-sdk-go-v2/service/sso v1.28.2/go.mod h1:n9bTZFZcBa9hGGqVz3i/a6+NG0zmZgtkB9qVVFDqPA8= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.20.0/go.mod h1:dWqm5G767qwKPuayKfzm4rjzFmVjiBFbOJrpSPnAMDs= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.33.2 h1:pd9G9HQaM6UZAZh19pYOkpKSQkyQQ9ftnl/LttQOcGI= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.33.2/go.mod h1:eknndR9rU8UpE/OmFpqU78V1EcXPKFTTm5l/buZYgvM= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.0 h1:Bnr+fXrlrPEoR1MAFrHVsge3M/WoK4n23VNhRM7TPHI= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.0/go.mod h1:eknndR9rU8UpE/OmFpqU78V1EcXPKFTTm5l/buZYgvM= github.com/aws/aws-sdk-go-v2/service/sts v1.16.3/go.mod h1:bfBj0iVmsUyUg4weDB4NxktD9rDGeKSVWnjTnwbx9b8= github.com/aws/aws-sdk-go-v2/service/sts v1.25.3/go.mod h1:4EqRHDCKP78hq3zOnmFXu5k0j4bXbRFfCh/zQ6KnEfQ= github.com/aws/aws-sdk-go-v2/service/sts v1.38.0 h1:iV1Ko4Em/lkJIsoKyGfc0nQySi+v0Udxr6Igq+y9JZc= @@ -895,8 +895,8 @@ github.com/dnaeon/go-vcr v1.1.0/go.mod h1:M7tiix8f0r6mKKJ3Yq/kqU1OYf3MnfmBWVbPx/ github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod 
h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= +github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= +github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= @@ -954,8 +954,8 @@ github.com/go-gorp/gorp v2.2.0+incompatible/go.mod h1:7IfkAQnO7jfT/9IQ3R9wL1dFhu github.com/go-ini/ini v1.25.4/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= -github.com/go-jose/go-jose/v4 v4.0.5 h1:M6T8+mKZl/+fNNuFHvGIzDz7BTLQPIounk/b9dw3AaE= -github.com/go-jose/go-jose/v4 v4.0.5/go.mod h1:s3P1lRrkT8igV8D9OjyL4WRyHvjB6a4JSllnOrmmBOA= +github.com/go-jose/go-jose/v4 v4.1.1 h1:JYhSgy4mXXzAdF3nUx3ygx347LRXJRrpgyU3adRmkAI= +github.com/go-jose/go-jose/v4 v4.1.1/go.mod h1:BdsZGqgdO3b6tTc6LSE56wcDbMMLuPsw5d4ZD5f94kA= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U= github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81/go.mod h1:SX0U8uGpxhq9o2S/CELCSUxEWWAuoCUcVCQWv7G2OCk= @@ -1495,22 +1495,22 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.6 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0/go.mod h1:snMWehoOh2wsEwnvvwtDyFCxVeDAODenXHtn5vzrKjo= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod 
h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= -go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg= -go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0 h1:wpMfgF8E1rkrT1Z6meFh1NDtownE9Ii3n3X2GJYjsaU= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0/go.mod h1:wAy0T/dUbs468uOlkT31xjvqQgEVXv58BRFWEgn5v/0= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.36.0 h1:rixTyDGXFxRy1xzhKrotaHy3/KXdPhlWARrCgK+eqUY= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.36.0/go.mod h1:dowW6UsM9MKbJq5JTz2AMVp3/5iW5I/TStsk8S+CfHw= -go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE= -go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs= -go.opentelemetry.io/otel/sdk v1.36.0 h1:b6SYIuLRs88ztox4EyrvRti80uXIFy+Sqzoh9kFULbs= -go.opentelemetry.io/otel/sdk v1.36.0/go.mod h1:+lC+mTgD+MUWfjJubi2vvXWcVxyr9rmlshZni72pXeY= -go.opentelemetry.io/otel/sdk/metric v1.36.0 h1:r0ntwwGosWGaa0CrSt8cuNuTcccMXERFwHX4dThiPis= -go.opentelemetry.io/otel/sdk/metric v1.36.0/go.mod h1:qTNOhFDfKRwX0yXOqJYegL5WRaW376QbB7P4Pb0qva4= -go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w= -go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod 
h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= @@ -1981,8 +1981,8 @@ gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJ gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= -gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o= -gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= @@ -2053,8 +2053,8 @@ google.golang.org/api v0.108.0/go.mod h1:2Ts0XTHNVWxypznxWOYUeI4g3WdP9Pk2Qk58+a/ google.golang.org/api v0.110.0/go.mod 
h1:7FC4Vvx1Mooxh8C5HWjzZHcavuS2f6pmJpZx60ca7iI= google.golang.org/api v0.111.0/go.mod h1:qtFHvU9mhgTJegR31csQ+rwxyUTHOKFqCKWp1J0fdw0= google.golang.org/api v0.114.0/go.mod h1:ifYI2ZsFK6/uGddGfAD5BMxlnkBqCmqHSDUVi45N5Yg= -google.golang.org/api v0.247.0 h1:tSd/e0QrUlLsrwMKmkbQhYVa109qIintOls2Wh6bngc= -google.golang.org/api v0.247.0/go.mod h1:r1qZOPmxXffXg6xS5uhx16Fa/UFY8QU/K4bfKrnvovM= +google.golang.org/api v0.248.0 h1:hUotakSkcwGdYUqzCRc5yGYsg4wXxpkKlW5ryVqvC1Y= +google.golang.org/api v0.248.0/go.mod h1:yAFUAF56Li7IuIQbTFoLwXTCI6XCFKueOlS7S9e4F9k= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -2215,10 +2215,10 @@ google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633/go.mod h1:UUQDJDOl google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4= google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s= -google.golang.org/genproto/googleapis/api v0.0.0-20250721164621-a45f3dfb1074 h1:mVXdvnmR3S3BQOqHECm9NGMjYiRtEvDYcqAqedTXY6s= -google.golang.org/genproto/googleapis/api v0.0.0-20250721164621-a45f3dfb1074/go.mod h1:vYFwMYFbmA8vl6Z/krj/h7+U/AqpHknwJX4Uqgfyc7I= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b h1:zPKJod4w6F1+nRGDI9ubnXYhU9NSWoFAijkHkUXeTK8= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c h1:AtEkQdl5b6zsybXcbz00j1LwNodDuH6hVifIaNqk7NQ= +google.golang.org/genproto/googleapis/api 
v0.0.0-20250818200422-3122310a409c/go.mod h1:ea2MjsO70ssTfCjiwHgI0ZFqcw45Ksuk2ckf9G468GA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250818200422-3122310a409c h1:qXWI/sQtv5UKboZ/zUk7h+mrf/lXORyI+n9DKDAusdg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250818200422-3122310a409c/go.mod h1:gw1tLEfykwDz2ET4a12jcXt4couGAm7IwsVaTy0Sflo= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -2261,8 +2261,8 @@ google.golang.org/grpc v1.52.3/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5v google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= -google.golang.org/grpc v1.74.2 h1:WoosgB65DlWVC9FqI82dGsZhWFNBSLjQ84bjROOpMu4= -google.golang.org/grpc v1.74.2/go.mod h1:CtQ+BGjaAIXHs/5YS3i473GqwBBa1zGQNevxdeBEXrM= +google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4= +google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/sdks/go/pkg/beam/runners/prism/internal/environments.go b/sdks/go/pkg/beam/runners/prism/internal/environments.go index 3239c76dfe1f..d18cc3b83732 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/environments.go +++ b/sdks/go/pkg/beam/runners/prism/internal/environments.go @@ -79,7 +79,7 @@ func runEnvironment(ctx context.Context, j 
*jobservices.Job, env string, wk *wor logger.Error("unmarshaling docker environment payload", "error", err) return err } - return dockerEnvironment(ctx, logger, dp, wk, j.ArtifactEndpoint()) + return dockerEnvironment(ctx, logger, dp, wk, wk.ArtifactEndpoint) case urns.EnvProcess: pp := &pipepb.ProcessPayload{} if err := (proto.UnmarshalOptions{}).Unmarshal(e.GetPayload(), pp); err != nil { @@ -87,7 +87,7 @@ func runEnvironment(ctx context.Context, j *jobservices.Job, env string, wk *wor return err } go func() { - processEnvironment(ctx, pp, wk) + processEnvironment(ctx, logger, pp, wk) logger.Debug("environment stopped", slog.String("job", j.String())) }() return nil @@ -207,17 +207,18 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock } logger.Debug("creating container", "envs", envs, "mounts", mounts) + cmd := []string{ + fmt.Sprintf("--id=%v", wk.ID), + fmt.Sprintf("--control_endpoint=%v", wk.Endpoint()), + fmt.Sprintf("--artifact_endpoint=%v", artifactEndpoint), + fmt.Sprintf("--provision_endpoint=%v", wk.Endpoint()), + fmt.Sprintf("--logging_endpoint=%v", wk.Endpoint()), + } ccr, err := cli.ContainerCreate(ctx, &container.Config{ Image: dp.GetContainerImage(), - Cmd: []string{ - fmt.Sprintf("--id=%v", wk.ID), - fmt.Sprintf("--control_endpoint=%v", wk.Endpoint()), - fmt.Sprintf("--artifact_endpoint=%v", artifactEndpoint), - fmt.Sprintf("--provision_endpoint=%v", wk.Endpoint()), - fmt.Sprintf("--logging_endpoint=%v", wk.Endpoint()), - }, - Env: envs, - Tty: false, + Cmd: cmd, + Env: envs, + Tty: false, }, &container.HostConfig{ NetworkMode: "host", Mounts: mounts, @@ -236,6 +237,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock } logger.Debug("container started") + logger.Debug("container start command", "cmd", cmd) // Start goroutine to wait on container state. 
go func() { @@ -273,6 +275,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock rc, err := cli.ContainerLogs(bgctx, containerID, container.LogsOptions{Details: true, ShowStdout: true, ShowStderr: true}) if err != nil { logger.Error("docker container logs error", "error", err) + return } defer rc.Close() var buf bytes.Buffer @@ -284,8 +287,9 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock return nil } -func processEnvironment(ctx context.Context, pp *pipepb.ProcessPayload, wk *worker.W) { - cmd := exec.CommandContext(ctx, pp.GetCommand(), "--id="+wk.ID, "--provision_endpoint="+wk.Endpoint()) +func processEnvironment(ctx context.Context, logger *slog.Logger, pp *pipepb.ProcessPayload, wk *worker.W) { + cmd := exec.CommandContext(ctx, pp.GetCommand(), "--id='"+wk.ID+"'", "--provision_endpoint="+wk.Endpoint()) + logger.Debug("starting process", "cmd", cmd.String()) cmd.WaitDelay = time.Millisecond * 100 cmd.Stderr = os.Stderr @@ -296,9 +300,12 @@ func processEnvironment(ctx context.Context, pp *pipepb.ProcessPayload, wk *work cmd.Env = append(cmd.Environ(), fmt.Sprintf("%v=%v", k, v)) } if err := cmd.Start(); err != nil { + logger.Error("process failed to start", "error", err) return } // Job processing happens here, but orchestrated by other goroutines // This call blocks until the context is cancelled, or the command exits. 
- cmd.Wait() + if err := cmd.Wait(); err != nil { + logger.Error("process failed while running", "error", err) + } } diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go index f186b11fd1d8..ae0e3e73e860 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go @@ -208,7 +208,7 @@ func (j *Job) MakeWorker(env string) *worker.W { wk.EnvPb = j.Pipeline.GetComponents().GetEnvironments()[env] wk.PipelineOptions = j.PipelineOptions() wk.JobKey = j.JobKey() - wk.ArtifactEndpoint = j.ArtifactEndpoint() + wk.ResolveEndpoints(j.ArtifactEndpoint()) return wk } diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go index b4133b0332a6..1141a5b02304 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go @@ -24,6 +24,9 @@ import ( "io" "log/slog" "net" + "os" + "runtime" + "strings" "sync" "sync/atomic" @@ -58,9 +61,9 @@ type W struct { ID, Env string - JobKey, ArtifactEndpoint string - EnvPb *pipepb.Environment - PipelineOptions *structpb.Struct + JobKey, ArtifactEndpoint, endpoint string + EnvPb *pipepb.Environment + PipelineOptions *structpb.Struct // These are the ID sources inst uint64 @@ -79,8 +82,32 @@ type controlResponder interface { Respond(*fnpb.InstructionResponse) } +// resolveEndpoint checks if the worker is running inside a docker container on mac or Windows and +// if the endpoint is a "localhost" endpoint. If so, overrides it with "host.docker.internal". 
+// Reference: https://docs.docker.com/desktop/features/networking/#networking-mode-and-dns-behaviour-for-mac-and-windows +func (wk *W) resolveEndpoint(endpoint string) string { + // The presence of an external environment does not guarantee execution within + // Docker, as Python's LOOPBACK also runs in an external environment. + // A specific check for the "BEAM_WORKER_POOL_IN_DOCKER_VM" environment variable is required to confirm + // if the worker is running inside a Docker container. + // Python LOOPBACK mode: https://github.com/apache/beam/blob/0589b14812ec52bff9d20d3bfcd96da393b9ebdb/sdks/python/apache_beam/runners/portability/portable_runner.py#L397 + // External Environment: https://beam.apache.org/documentation/runtime/sdk-harness-config/ + + workerInDocker := wk.EnvPb.GetUrn() == urns.EnvDocker || + (wk.EnvPb.GetUrn() == urns.EnvExternal && (os.Getenv("BEAM_WORKER_POOL_IN_DOCKER_VM") == "1")) + if runtime.GOOS != "linux" && workerInDocker && strings.HasPrefix(endpoint, "localhost:") { + return "host.docker.internal:" + strings.TrimPrefix(endpoint, "localhost:") + } + return endpoint +} + +func (wk *W) ResolveEndpoints(artifactEndpoint string) { + wk.ArtifactEndpoint = wk.resolveEndpoint(artifactEndpoint) + wk.endpoint = wk.resolveEndpoint(wk.parentPool.endpoint) +} + func (wk *W) Endpoint() string { - return wk.parentPool.endpoint + return wk.endpoint } func (wk *W) String() string { diff --git a/sdks/java/extensions/sql/build.gradle b/sdks/java/extensions/sql/build.gradle index af8b6cba1742..5527493200f7 100644 --- a/sdks/java/extensions/sql/build.gradle +++ b/sdks/java/extensions/sql/build.gradle @@ -92,6 +92,9 @@ dependencies { implementation "org.codehaus.janino:commons-compiler:3.0.11" implementation library.java.jackson_core implementation library.java.mongo_java_driver + permitUnusedDeclared library.java.mongo_java_driver + implementation library.java.mongo_bson + implementation library.java.mongodb_driver_core implementation 
library.java.slf4j_api implementation library.java.joda_time implementation library.java.vendored_guava_32_1_2_jre @@ -131,6 +134,7 @@ dependencies { testImplementation library.java.kafka_clients testImplementation project(":sdks:java:io:kafka") testImplementation project(path: ":sdks:java:io:mongodb", configuration: "testRuntimeMigration") + testImplementation library.java.mongo_java_driver testImplementation project(path: ":sdks:java:io:thrift", configuration: "testRuntimeMigration") testImplementation project(path: ":sdks:java:extensions:protobuf", configuration: "testRuntimeMigration") testCompileOnly project(":sdks:java:extensions:sql:udf-test-provider") diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/mongodb/MongoDbReadWriteIT.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/mongodb/MongoDbReadWriteIT.java index 76be08fe9a6e..804639cacfc3 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/mongodb/MongoDbReadWriteIT.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/mongodb/MongoDbReadWriteIT.java @@ -31,7 +31,8 @@ import static org.hamcrest.core.IsInstanceOf.instanceOf; import com.mongodb.BasicDBObject; -import com.mongodb.MongoClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; import com.mongodb.client.model.Filters; @@ -128,14 +129,14 @@ public static void setUp() throws Exception { .build(); mongodExecutable = mongodStarter.prepare(mongodConfig); mongodProcess = mongodExecutable.start(); - client = new MongoClient(hostname, port); + client = MongoClients.create("mongodb://" + hostname + ":" + port); mongoSqlUrl = String.format("mongodb://%s:%d/%s/%s", hostname, port, database, collection); } @AfterClass public static void tearDown() throws 
Exception { - client.dropDatabase(database); + client.getDatabase(database).drop(); client.close(); mongodProcess.stop(); mongodExecutable.stop(); diff --git a/sdks/java/io/mongodb/build.gradle b/sdks/java/io/mongodb/build.gradle index b9e90082f0dc..56d29750dead 100644 --- a/sdks/java/io/mongodb/build.gradle +++ b/sdks/java/io/mongodb/build.gradle @@ -28,13 +28,14 @@ dependencies { implementation project(path: ":sdks:java:core", configuration: "shadow") implementation library.java.joda_time implementation library.java.mongo_java_driver + implementation library.java.mongo_bson + implementation library.java.mongodb_driver_core implementation library.java.slf4j_api implementation library.java.vendored_guava_32_1_2_jre testImplementation library.java.junit testImplementation project(path: ":sdks:java:io:common") testImplementation project(path: ":sdks:java:testing:test-utils") - testImplementation "de.flapdoodle.embed:de.flapdoodle.embed.mongo:3.0.0" - testImplementation "de.flapdoodle.embed:de.flapdoodle.embed.process:3.0.0" + testImplementation "de.flapdoodle.embed:de.flapdoodle.embed.mongo:3.5.4" testRuntimeOnly library.java.slf4j_jdk14 testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") } diff --git a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/FindQuery.java b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/FindQuery.java index 2131656d458a..d89db9dea54b 100644 --- a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/FindQuery.java +++ b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/FindQuery.java @@ -21,7 +21,7 @@ import com.google.auto.value.AutoValue; import com.mongodb.BasicDBObject; -import com.mongodb.MongoClient; +import com.mongodb.MongoClientSettings; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCursor; import com.mongodb.client.model.Projections; @@ -79,7 +79,8 @@ private FindQuery withFilters(BsonDocument filters) { /** 
Convert the Bson filters into a BsonDocument via default encoding. */ static BsonDocument bson2BsonDocument(Bson filters) { - return filters.toBsonDocument(BasicDBObject.class, MongoClient.getDefaultCodecRegistry()); + return filters.toBsonDocument( + BasicDBObject.class, MongoClientSettings.getDefaultCodecRegistry()); } /** Sets the filters to find. */ diff --git a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbGridFSIO.java b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbGridFSIO.java index 07cc238c7e6b..71f8b291e0d5 100644 --- a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbGridFSIO.java +++ b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbGridFSIO.java @@ -21,15 +21,18 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; import com.google.auto.value.AutoValue; -import com.mongodb.DB; -import com.mongodb.DBCursor; -import com.mongodb.DBObject; -import com.mongodb.MongoClient; -import com.mongodb.MongoClientURI; -import com.mongodb.gridfs.GridFS; -import com.mongodb.gridfs.GridFSDBFile; -import com.mongodb.gridfs.GridFSInputFile; -import com.mongodb.util.JSON; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCursor; +import com.mongodb.client.MongoDatabase; +import com.mongodb.client.gridfs.GridFSBucket; +import com.mongodb.client.gridfs.GridFSBuckets; +import com.mongodb.client.gridfs.GridFSDownloadStream; +import com.mongodb.client.gridfs.GridFSUploadStream; +import com.mongodb.client.gridfs.model.GridFSFile; +import com.mongodb.client.gridfs.model.GridFSUploadOptions; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; @@ -53,6 +56,7 @@ import org.apache.beam.sdk.values.PBegin; import 
org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PDone; +import org.bson.Document; import org.bson.types.ObjectId; import org.checkerframework.checker.nullness.qual.Nullable; import org.checkerframework.dataflow.qual.Pure; @@ -117,16 +121,18 @@ public class MongoDbGridFSIO { /** Callback for the parser to use to submit data. */ public interface ParserCallback extends Serializable { - /** Output the object. The default timestamp will be the GridFSDBFile creation timestamp. */ + /** Output the object. The default timestamp will be the GridFSFile creation timestamp. */ void output(T output); /** Output the object using the specified timestamp. */ void output(T output, Instant timestamp); } - /** Interface for the parser that is used to parse the GridFSDBFile into the appropriate types. */ + /** Interface for the parser that is used to parse the GridFSFile into the appropriate types. */ public interface Parser extends Serializable { - void parse(GridFSDBFile input, ParserCallback callback) throws IOException; + void parse( + GridFSFile gridFSFile, GridFSDownloadStream downloadStream, ParserCallback callback) + throws IOException; } /** @@ -134,11 +140,10 @@ public interface Parser extends Serializable { * file into Strings. It uses the timestamp of the file for the event timestamp. 
*/ private static final Parser TEXT_PARSER = - (input, callback) -> { - final Instant time = new Instant(input.getUploadDate().getTime()); + (gridFSFile, downloadStream, callback) -> { + final Instant time = new Instant(gridFSFile.getUploadDate().getTime()); try (BufferedReader reader = - new BufferedReader( - new InputStreamReader(input.getInputStream(), StandardCharsets.UTF_8))) { + new BufferedReader(new InputStreamReader(downloadStream, StandardCharsets.UTF_8))) { for (String line = reader.readLine(); line != null; line = reader.readLine()) { callback.output(line, time); } @@ -197,12 +202,20 @@ static ConnectionConfiguration create( } MongoClient setupMongo() { - return uri() == null ? new MongoClient() : new MongoClient(new MongoClientURI(uri())); + if (uri() == null) { + return MongoClients.create(); + } + MongoClientSettings settings = + MongoClientSettings.builder() + .applyConnectionString(new ConnectionString(Preconditions.checkStateNotNull(uri()))) + .build(); + return MongoClients.create(settings); } - GridFS setupGridFS(MongoClient mongo) { - DB db = database() == null ? mongo.getDB("gridfs") : mongo.getDB(database()); - return bucket() == null ? new GridFS(db) : new GridFS(db, bucket()); + GridFSBucket setupGridFS(MongoClient mongo) { + MongoDatabase db = + database() == null ? mongo.getDatabase("gridfs") : mongo.getDatabase(database()); + return bucket() == null ? 
GridFSBuckets.create(db) : GridFSBuckets.create(db, bucket()); } } @@ -313,12 +326,12 @@ public PCollection expand(PBegin input) { ParDo.of( new DoFn() { @Nullable MongoClient mongo; - @Nullable GridFS gridfs; + @Nullable GridFSBucket gridFSBucket; @Setup public void setup() { mongo = source.spec.connectionConfiguration().setupMongo(); - gridfs = source.spec.connectionConfiguration().setupGridFS(mongo); + gridFSBucket = source.spec.connectionConfiguration().setupGridFS(mongo); } @Teardown @@ -331,12 +344,18 @@ public void teardown() { @ProcessElement public void processElement(final ProcessContext c) throws IOException { - Preconditions.checkStateNotNull(gridfs); + GridFSBucket bucket = Preconditions.checkStateNotNull(gridFSBucket); ObjectId oid = c.element(); - GridFSDBFile file = gridfs.find(oid); + GridFSDownloadStream downloadStream = bucket.openDownloadStream(oid); + GridFSFile gridFSFile = + bucket.find(com.mongodb.client.model.Filters.eq("_id", oid)).first(); + if (gridFSFile == null) { + return; // Skip if file not found + } Parser parser = Preconditions.checkStateNotNull(parser()); parser.parse( - file, + gridFSFile, + downloadStream, new ParserCallback() { @Override public void output(T output, Instant timestamp) { @@ -378,12 +397,12 @@ protected static class BoundedGridFSSource extends BoundedSource { this.objectIds = objectIds; } - private DBCursor createCursor(GridFS gridfs) { + private MongoCursor createCursor(GridFSBucket gridFSBucket) { if (spec.filter() != null) { - DBObject query = (DBObject) JSON.parse(spec.filter()); - return gridfs.getFileList(query); + Document query = Document.parse(spec.filter()); + return gridFSBucket.find(query).iterator(); } - return gridfs.getFileList(); + return gridFSBucket.find().iterator(); } @Override @@ -391,20 +410,20 @@ public List> split( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { MongoClient mongo = spec.connectionConfiguration().setupMongo(); try { - GridFS gridfs = 
spec.connectionConfiguration().setupGridFS(mongo); - DBCursor cursor = createCursor(gridfs); + GridFSBucket gridFSBucket = spec.connectionConfiguration().setupGridFS(mongo); + MongoCursor cursor = createCursor(gridFSBucket); long size = 0; List list = new ArrayList<>(); List objects = new ArrayList<>(); while (cursor.hasNext()) { - GridFSDBFile file = (GridFSDBFile) cursor.next(); + GridFSFile file = cursor.next(); long len = file.getLength(); if ((size + len) > desiredBundleSizeBytes && !objects.isEmpty()) { list.add(new BoundedGridFSSource(spec, objects)); size = 0; objects = new ArrayList<>(); } - objects.add((ObjectId) file.getId()); + objects.add(file.getObjectId()); size += len; } if (!objects.isEmpty() || list.isEmpty()) { @@ -419,10 +438,11 @@ public List> split( @Override public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { try (MongoClient mongo = spec.connectionConfiguration().setupMongo(); - DBCursor cursor = createCursor(spec.connectionConfiguration().setupGridFS(mongo))) { + MongoCursor cursor = + createCursor(spec.connectionConfiguration().setupGridFS(mongo))) { long size = 0; while (cursor.hasNext()) { - GridFSDBFile file = (GridFSDBFile) cursor.next(); + GridFSFile file = cursor.next(); size += file.getLength(); } return size; @@ -456,7 +476,7 @@ static class GridFSReader extends BoundedSource.BoundedReader { final @Nullable List objects; @Nullable MongoClient mongo; - @Nullable DBCursor cursor; + @Nullable MongoCursor cursor; @Nullable Iterator iterator; @Nullable ObjectId current; @@ -474,8 +494,8 @@ public BoundedSource getCurrentSource() { public boolean start() throws IOException { if (objects == null) { mongo = source.spec.connectionConfiguration().setupMongo(); - GridFS gridfs = source.spec.connectionConfiguration().setupGridFS(mongo); - cursor = source.createCursor(gridfs); + GridFSBucket gridFSBucket = source.spec.connectionConfiguration().setupGridFS(mongo); + cursor = source.createCursor(gridFSBucket); } else { 
iterator = objects.iterator(); } @@ -488,8 +508,8 @@ public boolean advance() throws IOException { current = iterator.next(); return true; } else if (cursor != null && cursor.hasNext()) { - GridFSDBFile file = (GridFSDBFile) cursor.next(); - current = (ObjectId) file.getId(); + GridFSFile file = cursor.next(); + current = file.getObjectId(); return true; } current = null; @@ -628,9 +648,9 @@ private static class GridFsWriteFn extends DoFn { private final Write spec; private transient @Nullable MongoClient mongo; - private transient @Nullable GridFS gridfs; + private transient @Nullable GridFSBucket gridFSBucket; - private transient @Nullable GridFSInputFile gridFsFile; + private transient @Nullable GridFSUploadStream gridFsUploadStream; private transient @Nullable OutputStream outputStream; public GridFsWriteFn(Write spec) { @@ -640,20 +660,22 @@ public GridFsWriteFn(Write spec) { @Setup public void setup() throws Exception { mongo = spec.connectionConfiguration().setupMongo(); - gridfs = spec.connectionConfiguration().setupGridFS(mongo); + gridFSBucket = spec.connectionConfiguration().setupGridFS(mongo); } @StartBundle public void startBundle() { - GridFS gridfs = Preconditions.checkStateNotNull(this.gridfs); + GridFSBucket gridFSBucket = Preconditions.checkStateNotNull(this.gridFSBucket); String filename = Preconditions.checkStateNotNull(spec.filename()); - GridFSInputFile gridFsFile = gridfs.createFile(filename); + if (spec.chunkSize() != null) { - gridFsFile.setChunkSize(spec.chunkSize()); + gridFsUploadStream = + gridFSBucket.openUploadStream( + filename, new GridFSUploadOptions().chunkSizeBytes(spec.chunkSize().intValue())); + } else { + gridFsUploadStream = gridFSBucket.openUploadStream(filename); } - outputStream = gridFsFile.getOutputStream(); - - this.gridFsFile = gridFsFile; + outputStream = gridFsUploadStream; } @ProcessElement @@ -665,35 +687,20 @@ public void processElement(ProcessContext context) throws Exception { @FinishBundle public void 
finishBundle() throws Exception { - if (outputStream != null) { - OutputStream outputStream = this.outputStream; - outputStream.flush(); - outputStream.close(); - this.outputStream = null; - } - if (gridFsFile != null) { - gridFsFile = null; + GridFSUploadStream uploadStream = gridFsUploadStream; + if (uploadStream != null) { + uploadStream.flush(); + uploadStream.close(); + gridFsUploadStream = null; + outputStream = null; } } @Teardown public void teardown() throws Exception { - try { - if (outputStream != null) { - OutputStream outputStream = this.outputStream; - outputStream.flush(); - outputStream.close(); - this.outputStream = null; - } - if (gridFsFile != null) { - gridFsFile = null; - } - } finally { - if (mongo != null) { - mongo.close(); - mongo = null; - gridfs = null; - } + if (mongo != null) { + mongo.close(); + mongo = null; } } } diff --git a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java index 905c7418e26c..1283e873f2b6 100644 --- a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java +++ b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/MongoDbIO.java @@ -22,12 +22,14 @@ import com.google.auto.value.AutoValue; import com.mongodb.BasicDBObject; +import com.mongodb.ConnectionString; import com.mongodb.MongoBulkWriteException; -import com.mongodb.MongoClient; -import com.mongodb.MongoClientOptions; -import com.mongodb.MongoClientURI; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoClientSettings.Builder; import com.mongodb.MongoCommandException; import com.mongodb.client.AggregateIterable; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCursor; import com.mongodb.client.MongoDatabase; @@ -46,6 +48,7 @@ import java.util.Map; import java.util.NoSuchElementException; 
import java.util.Optional; +import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import javax.net.ssl.SSLContext; import org.apache.beam.sdk.coders.Coder; @@ -64,6 +67,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.bson.BsonDocument; import org.bson.BsonInt32; +import org.bson.BsonObjectId; import org.bson.BsonString; import org.bson.Document; import org.bson.conversions.Bson; @@ -362,22 +366,25 @@ public void populateDisplayData(DisplayData.Builder builder) { } } - private static MongoClientOptions.Builder getOptions( + private static MongoClientSettings.Builder getOptions( int maxConnectionIdleTime, boolean sslEnabled, boolean sslInvalidHostNameAllowed, boolean ignoreSSLCertificate) { - MongoClientOptions.Builder optionsBuilder = new MongoClientOptions.Builder(); - optionsBuilder.maxConnectionIdleTime(maxConnectionIdleTime); + MongoClientSettings.Builder settingsBuilder = MongoClientSettings.builder(); + settingsBuilder.applyToConnectionPoolSettings( + builder -> builder.maxConnectionIdleTime(maxConnectionIdleTime, TimeUnit.MILLISECONDS)); if (sslEnabled) { - optionsBuilder.sslEnabled(sslEnabled).sslInvalidHostNameAllowed(sslInvalidHostNameAllowed); - if (ignoreSSLCertificate) { - SSLContext sslContext = SSLUtils.ignoreSSLCertificate(); - optionsBuilder.sslContext(sslContext); - optionsBuilder.socketFactory(sslContext.getSocketFactory()); - } + settingsBuilder.applyToSslSettings( + builder -> { + builder.enabled(sslEnabled).invalidHostNameAllowed(sslInvalidHostNameAllowed); + if (ignoreSSLCertificate) { + SSLContext sslContext = SSLUtils.ignoreSSLCertificate(); + builder.context(sslContext); + } + }); } - return optionsBuilder; + return settingsBuilder; } /** A MongoDB {@link BoundedSource} reading {@link Document} from a given instance. 
*/ @@ -414,15 +421,15 @@ long getDocumentCount() { String uri = Preconditions.checkStateNotNull(spec.uri()); String database = Preconditions.checkStateNotNull(spec.database()); String collection = Preconditions.checkStateNotNull(spec.collection()); - try (MongoClient mongoClient = - new MongoClient( - new MongoClientURI( - uri, - getOptions( - spec.maxConnectionIdleTime(), - spec.sslEnabled(), - spec.sslInvalidHostNameAllowed(), - spec.ignoreSSLCertificate())))) { + MongoClientSettings settings = + getOptions( + spec.maxConnectionIdleTime(), + spec.sslEnabled(), + spec.sslInvalidHostNameAllowed(), + spec.ignoreSSLCertificate()) + .applyConnectionString(new ConnectionString(uri)) + .build(); + try (MongoClient mongoClient = MongoClients.create(settings)) { return getDocumentCount(mongoClient, database, collection); } catch (Exception e) { return -1; @@ -446,15 +453,15 @@ public long getEstimatedSizeBytes(PipelineOptions pipelineOptions) { String uri = Preconditions.checkStateNotNull(spec.uri()); String database = Preconditions.checkStateNotNull(spec.database()); String collection = Preconditions.checkStateNotNull(spec.collection()); - try (MongoClient mongoClient = - new MongoClient( - new MongoClientURI( - uri, - getOptions( - spec.maxConnectionIdleTime(), - spec.sslEnabled(), - spec.sslInvalidHostNameAllowed(), - spec.ignoreSSLCertificate())))) { + MongoClientSettings settings = + getOptions( + spec.maxConnectionIdleTime(), + spec.sslEnabled(), + spec.sslInvalidHostNameAllowed(), + spec.ignoreSSLCertificate()) + .applyConnectionString(new ConnectionString(uri)) + .build(); + try (MongoClient mongoClient = MongoClients.create(settings)) { try { return getEstimatedSizeBytes(mongoClient, database, collection); } catch (MongoCommandException exception) { @@ -483,15 +490,15 @@ public List> split( String uri = Preconditions.checkStateNotNull(spec.uri()); String database = Preconditions.checkStateNotNull(spec.database()); String collection = 
Preconditions.checkStateNotNull(spec.collection()); - try (MongoClient mongoClient = - new MongoClient( - new MongoClientURI( - uri, - getOptions( - spec.maxConnectionIdleTime(), - spec.sslEnabled(), - spec.sslInvalidHostNameAllowed(), - spec.ignoreSSLCertificate())))) { + MongoClientSettings settings = + getOptions( + spec.maxConnectionIdleTime(), + spec.sslEnabled(), + spec.sslInvalidHostNameAllowed(), + spec.ignoreSSLCertificate()) + .applyConnectionString(new ConnectionString(uri)) + .build(); + try (MongoClient mongoClient = MongoClients.create(settings)) { MongoDatabase mongoDatabase = mongoClient.getDatabase(database); List splitKeys; @@ -671,26 +678,39 @@ static List splitKeysToMatch(List splitKeys) { if (i == 0) { aggregates.add(Aggregates.match(Filters.lte("_id", splitKey))); if (splitKeys.size() == 1) { - aggregates.add(Aggregates.match(Filters.and(Filters.gt("_id", splitKey)))); + aggregates.add(Aggregates.match(Filters.gt("_id", splitKey))); } } else if (i == splitKeys.size() - 1) { // this is the last split in the list, the filters define // the range from the previous split to the current split and also // the current split to the end - aggregates.add( - Aggregates.match( - Filters.and(Filters.gt("_id", lowestBound), Filters.lte("_id", splitKey)))); - aggregates.add(Aggregates.match(Filters.and(Filters.gt("_id", splitKey)))); + // Create a custom BSON document with multiple conditions on the same field + BsonDocument rangeFilter = + new BsonDocument( + "_id", + new BsonDocument( + "$gt", new BsonObjectId(Preconditions.checkStateNotNull(lowestBound))) + .append("$lte", new BsonObjectId(splitKey))); + aggregates.add(Aggregates.match(rangeFilter)); + aggregates.add(Aggregates.match(Filters.gt("_id", splitKey))); } else { - aggregates.add( - Aggregates.match( - Filters.and(Filters.gt("_id", lowestBound), Filters.lte("_id", splitKey)))); + // Create a custom BSON document with multiple conditions on the same field + BsonDocument rangeFilter = + new 
BsonDocument( + "_id", + new BsonDocument( + "$gt", new BsonObjectId(Preconditions.checkStateNotNull(lowestBound))) + .append("$lte", new BsonObjectId(splitKey))); + aggregates.add(Aggregates.match(rangeFilter)); } lowestBound = splitKey; } return aggregates.stream() - .map(s -> s.toBsonDocument(BasicDBObject.class, MongoClient.getDefaultCodecRegistry())) + .map( + s -> + s.toBsonDocument( + BasicDBObject.class, MongoClientSettings.getDefaultCodecRegistry())) .collect(Collectors.toList()); } @@ -786,14 +806,15 @@ public void close() { private MongoClient createClient(Read spec) { String uri = Preconditions.checkStateNotNull(spec.uri(), "withUri() is required"); - return new MongoClient( - new MongoClientURI( - uri, - getOptions( + MongoClientSettings settings = + getOptions( spec.maxConnectionIdleTime(), spec.sslEnabled(), spec.sslInvalidHostNameAllowed(), - spec.ignoreSSLCertificate()))); + spec.ignoreSSLCertificate()) + .applyConnectionString(new ConnectionString(uri)) + .build(); + return MongoClients.create(settings); } } @@ -985,15 +1006,15 @@ static class WriteFn extends DoFn { @Setup public void createMongoClient() { String uri = Preconditions.checkStateNotNull(spec.uri()); - client = - new MongoClient( - new MongoClientURI( - uri, - getOptions( - spec.maxConnectionIdleTime(), - spec.sslEnabled(), - spec.sslInvalidHostNameAllowed(), - spec.ignoreSSLCertificate()))); + MongoClientSettings settings = + getOptions( + spec.maxConnectionIdleTime(), + spec.sslEnabled(), + spec.sslInvalidHostNameAllowed(), + spec.ignoreSSLCertificate()) + .applyConnectionString(new ConnectionString(uri)) + .build(); + client = MongoClients.create(settings); } @StartBundle diff --git a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/FindQueryTest.java b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/FindQueryTest.java index df66179f3904..da90f92dc190 100644 --- a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/FindQueryTest.java 
+++ b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/FindQueryTest.java @@ -21,7 +21,7 @@ import com.google.auto.value.AutoValue; import com.mongodb.BasicDBObject; -import com.mongodb.MongoClient; +import com.mongodb.MongoClientSettings; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCursor; import com.mongodb.client.model.Projections; @@ -79,7 +79,8 @@ private FindQueryTest withFilters(BsonDocument filters) { /** Convert the Bson filters into a BsonDocument via default encoding. */ static BsonDocument bson2BsonDocument(Bson filters) { - return filters.toBsonDocument(BasicDBObject.class, MongoClient.getDefaultCodecRegistry()); + return filters.toBsonDocument( + BasicDBObject.class, MongoClientSettings.getDefaultCodecRegistry()); } /** Sets the filters to find. */ diff --git a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDBGridFSIOTest.java b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDBGridFSIOTest.java index 09343606f228..d13185a08fb6 100644 --- a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDBGridFSIOTest.java +++ b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDBGridFSIOTest.java @@ -20,11 +20,13 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import com.mongodb.DB; -import com.mongodb.MongoClient; -import com.mongodb.gridfs.GridFS; -import com.mongodb.gridfs.GridFSDBFile; -import com.mongodb.gridfs.GridFSInputFile; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoDatabase; +import com.mongodb.client.gridfs.GridFSBucket; +import com.mongodb.client.gridfs.GridFSBuckets; +import com.mongodb.client.gridfs.GridFSUploadStream; +import com.mongodb.client.gridfs.model.GridFSFile; import de.flapdoodle.embed.mongo.MongodExecutable; import de.flapdoodle.embed.mongo.MongodProcess; import 
de.flapdoodle.embed.mongo.MongodStarter; @@ -35,12 +37,10 @@ import de.flapdoodle.embed.mongo.distribution.Version; import de.flapdoodle.embed.process.runtime.Network; import java.io.BufferedReader; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -117,9 +117,9 @@ public static void start() throws Exception { LOG.info("Insert test data"); - MongoClient client = new MongoClient("localhost", port); - DB database = client.getDB(DATABASE); - GridFS gridfs = new GridFS(database); + MongoClient client = MongoClients.create("mongodb://localhost:" + port); + MongoDatabase database = client.getDatabase(DATABASE); + GridFSBucket gridfs = GridFSBuckets.create(database); ByteArrayOutputStream out = new ByteArrayOutputStream(); for (int x = 0; x < 100; x++) { @@ -129,10 +129,12 @@ public static void start() throws Exception { .getBytes(StandardCharsets.UTF_8)); } for (int x = 0; x < 5; x++) { - gridfs.createFile(new ByteArrayInputStream(out.toByteArray()), "file" + x).save(); + try (GridFSUploadStream uploadStream = gridfs.openUploadStream("file" + x)) { + uploadStream.write(out.toByteArray()); + } } - gridfs = new GridFS(database, "mapBucket"); + GridFSBucket mapBucketGridfs = GridFSBuckets.create(database, "mapBucket"); long now = System.currentTimeMillis(); Random random = new Random(); String[] scientists = { @@ -148,26 +150,25 @@ public static void start() throws Exception { "Maxwell" }; for (int x = 0; x < 10; x++) { - GridFSInputFile file = gridfs.createFile("file_" + x); - OutputStream outf = file.getOutputStream(); - OutputStreamWriter writer = new OutputStreamWriter(outf, StandardCharsets.UTF_8); - for (int y = 0; y < 5000; y++) { - long time = now - random.nextInt(3600000); - String name = scientists[y % 
scientists.length]; - writer.write(time + "\t"); - writer.write(name + "\t"); - writer.write(Integer.toString(random.nextInt(100))); - writer.write("\n"); - } - for (int y = 0; y < scientists.length; y++) { - String name = scientists[y % scientists.length]; - writer.write(now + "\t"); - writer.write(name + "\t"); - writer.write("101"); - writer.write("\n"); + try (GridFSUploadStream uploadStream = mapBucketGridfs.openUploadStream("file_" + x)) { + OutputStreamWriter writer = new OutputStreamWriter(uploadStream, StandardCharsets.UTF_8); + for (int y = 0; y < 5000; y++) { + long time = now - random.nextInt(3600000); + String name = scientists[y % scientists.length]; + writer.write(time + "\t"); + writer.write(name + "\t"); + writer.write(Integer.toString(random.nextInt(100))); + writer.write("\n"); + } + for (int y = 0; y < scientists.length; y++) { + String name = scientists[y % scientists.length]; + writer.write(now + "\t"); + writer.write(name + "\t"); + writer.write("101"); + writer.write("\n"); + } + writer.flush(); } - writer.flush(); - writer.close(); } client.close(); } @@ -208,11 +209,10 @@ public void testReadWithParser() { .withDatabase(DATABASE) .withBucket("mapBucket") .>withParser( - (input, callback) -> { + (gridFSFile, downloadStream, callback) -> { try (final BufferedReader reader = new BufferedReader( - new InputStreamReader( - input.getInputStream(), StandardCharsets.UTF_8))) { + new InputStreamReader(downloadStream, StandardCharsets.UTF_8))) { String line = reader.readLine(); while (line != null) { try (Scanner scanner = new Scanner(line.trim())) { @@ -311,19 +311,20 @@ public void testWriteMessage() throws Exception { MongoClient client = null; try { StringBuilder results = new StringBuilder(); - client = new MongoClient("localhost", port); - DB database = client.getDB(DATABASE); - GridFS gridfs = new GridFS(database, "WriteTest"); - List files = gridfs.find("WriteTestData"); - assertTrue(files.size() > 0); - for (GridFSDBFile file : files) { - 
assertEquals(100, file.getChunkSize()); - int l = (int) file.getLength(); - try (InputStream ins = file.getInputStream()) { - DataInputStream dis = new DataInputStream(ins); - byte[] b = new byte[l]; - dis.readFully(b); - results.append(new String(b, StandardCharsets.UTF_8)); + client = MongoClients.create("mongodb://localhost:" + port); + MongoDatabase database = client.getDatabase(DATABASE); + GridFSBucket gridfs = GridFSBuckets.create(database, "WriteTest"); + + for (GridFSFile file : gridfs.find()) { + if (file.getFilename().equals("WriteTestData")) { + assertEquals(100, file.getChunkSize()); + int l = (int) file.getLength(); + try (InputStream ins = gridfs.openDownloadStream(file.getObjectId())) { + DataInputStream dis = new DataInputStream(ins); + byte[] b = new byte[l]; + dis.readFully(b); + results.append(new String(b, StandardCharsets.UTF_8)); + } } } String dataString = results.toString(); @@ -331,16 +332,17 @@ public void testWriteMessage() throws Exception { assertTrue(dataString.contains("Message " + x)); } - files = gridfs.find("WriteTestIntData"); boolean[] intResults = new boolean[100]; - for (GridFSDBFile file : files) { - int l = (int) file.getLength(); - try (InputStream ins = file.getInputStream()) { - DataInputStream dis = new DataInputStream(ins); - byte[] b = new byte[l]; - dis.readFully(b); - for (byte aB : b) { - intResults[aB] = true; + for (GridFSFile file : gridfs.find()) { + if (file.getFilename().equals("WriteTestIntData")) { + int l = (int) file.getLength(); + try (InputStream ins = gridfs.openDownloadStream(file.getObjectId())) { + DataInputStream dis = new DataInputStream(ins); + byte[] b = new byte[l]; + dis.readFully(b); + for (byte aB : b) { + intResults[aB] = true; + } } } } diff --git a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java index 4dda988e355c..cc85db937975 100644 --- 
a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java +++ b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/MongoDbIOTest.java @@ -21,7 +21,8 @@ import static org.hamcrest.Matchers.greaterThan; import static org.junit.Assert.assertEquals; -import com.mongodb.MongoClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; import com.mongodb.client.model.Filters; @@ -107,7 +108,7 @@ public static void beforeClass() throws Exception { .build(); mongodExecutable = mongodStarter.prepare(mongodConfig); mongodProcess = mongodExecutable.start(); - client = new MongoClient("localhost", port); + client = MongoClients.create("mongodb://localhost:" + port); database = client.getDatabase(DATABASE_NAME); LOG.info("Insert test data"); diff --git a/sdks/python/apache_beam/internal/code_object_pickler_test.py b/sdks/python/apache_beam/internal/code_object_pickler_test.py index 2060533e9328..de01f16fd0a7 100644 --- a/sdks/python/apache_beam/internal/code_object_pickler_test.py +++ b/sdks/python/apache_beam/internal/code_object_pickler_test.py @@ -126,30 +126,27 @@ def get_lambda_from_dictionary(): return get_lambda_from_dictionary() +prefix = __name__ + test_cases = [ + (top_level_function, f"{prefix}.top_level_function" + ".__code__"), + (top_level_lambda, f"{prefix}.top_level_lambda" + ".__code__"), ( - top_level_function, - "apache_beam.internal.code_object_pickler_test.top_level_function" - ".__code__"), - ( - top_level_lambda, - "apache_beam.internal.code_object_pickler_test.top_level_lambda" - ".__code__"), - ( - get_nested_function(), - ( - "apache_beam.internal.code_object_pickler_test.get_nested_function" + get_nested_function(), ( + f"{prefix}.get_nested_function" ".__code__.co_consts[nested_function]")), ( get_lambda_from_dictionary(), ( - "apache_beam.internal.code_object_pickler_test" + f"{prefix}" 
".get_lambda_from_dictionary.__code__.co_consts[, ('x',)]") ), ( get_lambda_from_dictionary_same_args(), ( - "apache_beam.internal.code_object_pickler_test" + f"{prefix}" ".get_lambda_from_dictionary_same_args.__code__.co_consts" "[, ('x',), " + hashlib.md5( get_lambda_from_dictionary_same_args().__code__.co_code). @@ -157,52 +154,46 @@ def get_lambda_from_dictionary(): ( function_with_lambda_default_argument(), ( - "apache_beam.internal.code_object_pickler_test" + f"{prefix}" ".function_with_lambda_default_argument.__defaults__[0].__code__")), ( function_with_function_default_argument(), - "apache_beam.internal.code_object_pickler_test.top_level_function" + f"{prefix}.top_level_function" ".__code__"), - ( - add_one, - "apache_beam.internal.code_object_pickler_test.function_decorator" - ".__code__.co_consts[]"), + (add_one, f"{prefix}.function_decorator" + ".__code__.co_consts[]"), ( ClassWithFunction.process, - "apache_beam.internal.code_object_pickler_test.ClassWithFunction" + f"{prefix}.ClassWithFunction" ".process.__code__"), ( ClassWithStaticMethod.static_method, - "apache_beam.internal.code_object_pickler_test.ClassWithStaticMethod" + f"{prefix}.ClassWithStaticMethod" ".static_method.__code__"), ( ClassWithClassMethod.class_method, - "apache_beam.internal.code_object_pickler_test.ClassWithClassMethod" + f"{prefix}.ClassWithClassMethod" ".class_method.__code__"), ( ClassWithNestedFunction().process(), ( - "apache_beam.internal.code_object_pickler_test" - ".ClassWithNestedFunction.process.__code__.co_consts" + f"{prefix}.ClassWithNestedFunction.process.__code__.co_consts" "[nested_function]")), ( ClassWithLambda().process(), - "apache_beam.internal.code_object_pickler_test.ClassWithLambda.process" - ".__code__.co_consts[]"), + f"{prefix}.ClassWithLambda.process.__code__.co_consts[]"), ( ClassWithNestedClass.InnerClass().process, - "apache_beam.internal.code_object_pickler_test.ClassWithNestedClass" - ".InnerClass.process.__code__"), + 
f"{prefix}.ClassWithNestedClass.InnerClass.process.__code__"), ( ClassWithNestedLambda().process(), ( - "apache_beam.internal.code_object_pickler_test" + f"{prefix}" ".ClassWithNestedLambda.process.__code__.co_consts" "[get_lambda_from_dictionary].co_consts[, ('x',)]")), ( ClassWithNestedLambda.process, - "apache_beam.internal.code_object_pickler_test.ClassWithNestedLambda" - ".process.__code__"), + f"{prefix}.ClassWithNestedLambda.process.__code__"), ] @@ -225,35 +216,35 @@ def test_roundtrip(self, callable, unused_path): class GetCodeFromCodeObjectIdentifierTest(unittest.TestCase): - def empty_path_raises_exception(self): + def test_empty_path_raises_exception(self): with self.assertRaisesRegex(ValueError, "Path must not be empty"): - code_object_pickler.test_get_code_from_identifier("") + code_object_pickler.get_code_from_identifier("") - def invalid_default_index_raises_exception(self): + def test_invalid_default_index_raises_exception(self): with self.assertRaisesRegex(ValueError, "out of bounds"): - code_object_pickler.test_get_code_from_identifier( - "apache_beam.internal.test_cases.module_with_default_argument." + code_object_pickler.get_code_from_identifier( + "apache_beam.internal.test_data.module_with_default_argument." "function_with_lambda_default_argument.__defaults__[1]") - def invalid_single_name_path_raises_exception(self): + def test_invalid_single_name_path_raises_exception(self): with self.assertRaisesRegex(AttributeError, "Could not find code object with path"): code_object_pickler.get_code_from_identifier( - "apache_beam.internal.test_cases.module_3." + "apache_beam.internal.test_data.module_3." "my_function.__code__.co_consts[something]") - def invalid_lambda_with_args_path_raises_exception(self): + def test_invalid_lambda_with_args_path_raises_exception(self): with self.assertRaisesRegex(AttributeError, "Could not find code object with path"): code_object_pickler.get_code_from_identifier( - "apache_beam.internal.test_cases.module_3." 
+ "apache_beam.internal.test_data.module_3." "my_function.__code__.co_consts[, ('x',)]") - def invalid_lambda_with_hash_path_raises_exception(self): + def test_invalid_lambda_with_hash_path_raises_exception(self): with self.assertRaisesRegex(AttributeError, "Could not find code object with path"): code_object_pickler.get_code_from_identifier( - "apache_beam.internal.test_cases.module_3." + "apache_beam.internal.test_data.module_3." "my_function.__code__.co_consts[, ('',), 1234567890]") def test_adding_local_variable_in_class_preserves_object(self): diff --git a/sdks/python/apache_beam/io/parquetio.py b/sdks/python/apache_beam/io/parquetio.py index fa8b56f916dc..82ae9a50ace4 100644 --- a/sdks/python/apache_beam/io/parquetio.py +++ b/sdks/python/apache_beam/io/parquetio.py @@ -119,7 +119,12 @@ def process(self, row, w=DoFn.WindowParam, pane=DoFn.PaneInfoParam): # reorder the data in columnar format. for i, n in enumerate(self._schema.names): - self._buffer[i].append(row[n]) + # Handle missing nullable fields by using None as default value + field = self._schema.field(i) + if field.nullable and n not in row: + self._buffer[i].append(None) + else: + self._buffer[i].append(row[n]) def finish_bundle(self): if len(self._buffer[0]) > 0: diff --git a/sdks/python/apache_beam/io/parquetio_test.py b/sdks/python/apache_beam/io/parquetio_test.py index 9371705a1fa3..78d1db4cc7c2 100644 --- a/sdks/python/apache_beam/io/parquetio_test.py +++ b/sdks/python/apache_beam/io/parquetio_test.py @@ -59,12 +59,11 @@ try: import pyarrow as pa import pyarrow.parquet as pq + ARROW_MAJOR_VERSION, _, _ = map(int, pa.__version__.split('.')) except ImportError: pa = None - pl = None pq = None - -ARROW_MAJOR_VERSION, _, _ = map(int, pa.__version__.split('.')) + ARROW_MAJOR_VERSION = 0 @unittest.skipIf(pa is None, "PyArrow is not installed.") @@ -422,6 +421,76 @@ def test_schema_read_write(self): | Map(stable_repr)) assert_that(readback, equal_to([stable_repr(r) for r in rows])) + def 
test_write_with_nullable_fields_missing_data(self): + """Test WriteToParquet with nullable fields where some fields are missing. + + This test addresses the bug reported in: + https://github.com/apache/beam/issues/35791 + where WriteToParquet fails with a KeyError if any nullable + field is missing in the data. + """ + # Define PyArrow schema with all fields nullable + schema = pa.schema([ + pa.field("id", pa.int64(), nullable=True), + pa.field("name", pa.string(), nullable=True), + pa.field("age", pa.int64(), nullable=True), + pa.field("email", pa.string(), nullable=True), + ]) + + # Sample data with missing nullable fields + data = [ + { + 'id': 1, 'name': 'Alice', 'age': 30 + }, # missing 'email' + { + 'id': 2, 'name': 'Bob', 'age': 25, 'email': 'bob@example.com' + }, # all fields present + { + 'id': 3, 'name': 'Charlie', 'age': None, 'email': None + }, # explicit None values + { + 'id': 4, 'name': 'David' + }, # missing 'age' and 'email' + ] + + with TemporaryDirectory() as tmp_dirname: + path = os.path.join(tmp_dirname, 'nullable_test') + + # Write data with missing nullable fields - this should not raise KeyError + with TestPipeline() as p: + _ = ( + p + | Create(data) + | WriteToParquet( + path, schema, num_shards=1, shard_name_template='')) + + # Read back and verify the data + with TestPipeline() as p: + readback = ( + p + | ReadFromParquet(path + '*') + | Map(json.dumps, sort_keys=True)) + + # Expected data should have None for missing nullable fields + expected_data = [ + { + 'id': 1, 'name': 'Alice', 'age': 30, 'email': None + }, + { + 'id': 2, 'name': 'Bob', 'age': 25, 'email': 'bob@example.com' + }, + { + 'id': 3, 'name': 'Charlie', 'age': None, 'email': None + }, + { + 'id': 4, 'name': 'David', 'age': None, 'email': None + }, + ] + + assert_that( + readback, + equal_to([json.dumps(r, sort_keys=True) for r in expected_data])) + def test_batched_read(self): with TemporaryDirectory() as tmp_dirname: path = os.path.join(tmp_dirname + "tmp_filename") diff 
--git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py index 58df738c6e5f..a0f597f5366f 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py @@ -27,16 +27,16 @@ from typing import Union from google.protobuf.json_format import MessageToDict - -from apache_beam.ml.rag.types import Chunk -from apache_beam.ml.rag.types import Embedding -from apache_beam.transforms.enrichment import EnrichmentSourceHandler from pymilvus import AnnSearchRequest from pymilvus import Hit from pymilvus import Hits from pymilvus import MilvusClient from pymilvus import SearchResult +from apache_beam.ml.rag.types import Chunk +from apache_beam.ml.rag.types import Embedding +from apache_beam.transforms.enrichment import EnrichmentSourceHandler + class SearchStrategy(Enum): """Search strategies for information retrieval. diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index eed02bb49575..4dabcafe6703 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -481,7 +481,7 @@ class TestMilvusSearchEnrichment(unittest.TestCase): """Tests for search functionality across all search strategies""" _db: MilvusDBContainerInfo - _version = "milvusdb/milvus:v2.3.9" + _version = "milvusdb/milvus:v2.5.10" @classmethod def setUpClass(cls): diff --git a/sdks/python/apache_beam/runners/dataflow/internal/names.py b/sdks/python/apache_beam/runners/dataflow/internal/names.py index 00d4624d202d..cf9bf6208dc5 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/names.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/names.py @@ -34,6 +34,6 @@ # Unreleased sdks use container image tag specified below. 
# Update this tag whenever there is a change that # requires changes to SDK harness container or SDK harness launcher. -BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20250811' +BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20250827' DATAFLOW_CONTAINER_IMAGE_REPOSITORY = 'gcr.io/cloud-dataflow/v1beta3' diff --git a/sdks/python/apache_beam/yaml/examples/README.md b/sdks/python/apache_beam/yaml/examples/README.md index 4cba973dbead..b053e3e6236d 100644 --- a/sdks/python/apache_beam/yaml/examples/README.md +++ b/sdks/python/apache_beam/yaml/examples/README.md @@ -28,6 +28,7 @@ * [Blueprints](#blueprints) * [Element-wise](#element-wise) * [IO](#io) + * [Jinja](#jinja) * [ML](#ml) @@ -244,6 +245,10 @@ by leveraging Jinja templating engine for dynamic pipeline generation based on inputs from the user through `% include`, `% import`, and inheritance directives. +Jinja `% import` directive: +- [wordCountImport.yaml](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/wordCountImport.yaml) +- [Instructions](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/README.md) on how to run the pipeline. + Jinja `% include` directive: - [wordCountInclude.yaml](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/jinja/include/wordCountInclude.yaml) - [Instructions](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/jinja/include/README.md) on how to run the pipeline. 
@@ -258,9 +263,9 @@ ML enrichments: Examples that include ML-specific transforms such as `RunInference` and `MLTransform`: -- [streaming_sentiment_analysis.yaml](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/ml/sentiment_analysis/streaming_sentiment_analysis.yaml) -- [streaming_taxifare_prediction.yaml](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/ml/taxi_fare/streaming_taxifare_prediction.yaml) -- [batch_log_analysis.yaml](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/ml/log_analysis/batch_log_analysis.yaml) +- Streaming Sentiment Analysis ([documentation](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/examples/transforms/ml/sentiment_analysis)) ([pipeline](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/ml/sentiment_analysis/streaming_sentiment_analysis.yaml)) +- Streaming Taxi Fare Prediction ([documentation](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/examples/transforms/ml/taxi_fare)) ([pipeline](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/ml/taxi_fare/streaming_taxifare_prediction.yaml)) +- Batch Log Analysis ML Workflow ([documentation](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/examples/transforms/ml/log_analysis)) ([pipeline](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/ml/log_analysis/batch_log_analysis.yaml)) More information can be found about aggregation transforms [here](https://beam.apache.org/documentation/sdks/yaml-combine/). 
diff --git a/sdks/python/apache_beam/yaml/examples/testing/examples_test.py b/sdks/python/apache_beam/yaml/examples/testing/examples_test.py index 0bfcb3f61612..80e82945523c 100644 --- a/sdks/python/apache_beam/yaml/examples/testing/examples_test.py +++ b/sdks/python/apache_beam/yaml/examples/testing/examples_test.py @@ -353,7 +353,8 @@ def test_yaml_example(self): ] if jinja_preprocessor: jinja_preprocessor = jinja_preprocessor[0] - raw_spec_string = jinja_preprocessor(raw_spec_string) + raw_spec_string = jinja_preprocessor( + raw_spec_string, self._testMethodName) custom_preprocessors.remove(jinja_preprocessor) pipeline_spec = yaml.load( @@ -563,7 +564,7 @@ def _wordcount_minimal_test_preprocessor( @YamlExamplesTestSuite.register_test_preprocessor( - ['test_wordCountInclude_yaml']) + ['test_wordCountInclude_yaml', 'test_wordCountImport_yaml']) def _wordcount_jinja_test_preprocessor( test_spec: dict, expected: List[str], env: TestEnvironment): """ @@ -676,7 +677,8 @@ def _kafka_test_preprocessor( 'test_iceberg_migration_yaml', 'test_ml_preprocessing_yaml', 'test_anomaly_scoring_yaml', - 'test_wordCountInclude_yaml' + 'test_wordCountInclude_yaml', + 'test_wordCountImport_yaml' ]) def _io_write_test_preprocessor( test_spec: dict, expected: List[str], env: TestEnvironment): @@ -1253,8 +1255,8 @@ def _batch_log_analysis_test_preprocessor( @YamlExamplesTestSuite.register_test_preprocessor( - ['test_wordCountInclude_yaml']) -def _jinja_preprocessor(raw_spec_string: str): + ['test_wordCountInclude_yaml', 'test_wordCountImport_yaml']) +def _jinja_preprocessor(raw_spec_string: str, test_name: str): """ Preprocessor for Jinja-based YAML tests. @@ -1274,12 +1276,11 @@ def _jinja_preprocessor(raw_spec_string: str): Returns: A string containing the fully rendered YAML pipeline specification. 
""" - jinja_variables = json.loads(input_data.word_count_jinja_parameter_data()) test_file_dir = os.path.dirname(__file__) sdk_root = os.path.abspath(os.path.join(test_file_dir, '../../../..')) - include_files = input_data.word_count_jinja_template_data() + include_files = input_data.word_count_jinja_template_data(test_name) mock_templates = {'main_template': raw_spec_string} for file_path in include_files: full_path = os.path.join(sdk_root, file_path) diff --git a/sdks/python/apache_beam/yaml/examples/testing/input_data.py b/sdks/python/apache_beam/yaml/examples/testing/input_data.py index 50d40224f828..fb468567355d 100644 --- a/sdks/python/apache_beam/yaml/examples/testing/input_data.py +++ b/sdks/python/apache_beam/yaml/examples/testing/input_data.py @@ -65,20 +65,28 @@ def word_count_jinja_parameter_data(): return json.dumps(params) -def word_count_jinja_template_data(): - return \ -[('apache_beam/yaml/examples/transforms/jinja/' - 'include/submodules/readFromTextTransform.yaml'), - ('apache_beam/yaml/examples/transforms/jinja/' - 'include/submodules/mapToFieldsSplitConfig.yaml'), - ('apache_beam/yaml/examples/transforms/jinja/' - 'include/submodules/explodeTransform.yaml'), - ('apache_beam/yaml/examples/transforms/jinja/' - 'include/submodules/combineTransform.yaml'), - ('apache_beam/yaml/examples/transforms/jinja/' - 'include/submodules/mapToFieldsCountConfig.yaml'), - ('apache_beam/yaml/examples/transforms/jinja/' - 'include/submodules/writeToTextTransform.yaml')] +def word_count_jinja_template_data(test_name: str) -> list[str]: + if test_name == 'test_wordCountInclude_yaml': + return [ + 'apache_beam/yaml/examples/transforms/jinja/' + 'include/submodules/readFromTextTransform.yaml', + 'apache_beam/yaml/examples/transforms/jinja/' + 'include/submodules/mapToFieldsSplitConfig.yaml', + 'apache_beam/yaml/examples/transforms/jinja/' + 'include/submodules/explodeTransform.yaml', + 'apache_beam/yaml/examples/transforms/jinja/' + 
'include/submodules/combineTransform.yaml', + 'apache_beam/yaml/examples/transforms/jinja/' + 'include/submodules/mapToFieldsCountConfig.yaml', + 'apache_beam/yaml/examples/transforms/jinja/' + 'include/submodules/writeToTextTransform.yaml' + ] + elif test_name == 'test_wordCountImport_yaml': + return [ + 'apache_beam/yaml/examples/transforms/jinja/' + 'import/macros/wordCountMacros.yaml' + ] + return [] def iceberg_dynamic_destinations_users_data(): diff --git a/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/README.md b/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/README.md new file mode 100644 index 000000000000..14052cd3a6c4 --- /dev/null +++ b/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/README.md @@ -0,0 +1,63 @@ + + +## Jinja % import Pipeline + +This example leverages the `% import` Jinja directive by having one main +pipeline and then one macros file containing all the transforms and configs +used. + +General setup: +```sh +export PIPELINE_FILE=apache_beam/yaml/examples/transforms/jinja/import/wordCountImport.yaml +export KINGLEAR="gs://dataflow-samples/shakespeare/kinglear.txt" +export TEMP_LOCATION="gs://MY-BUCKET/wordCounts/" + +cd /beam/sdks/python +``` + +Multiline Run Example: +```sh +python -m apache_beam.yaml.main \ + --yaml_pipeline_file="${PIPELINE_FILE}" \ + --jinja_variables='{ + "readFromTextTransform": {"path": "'"${KINGLEAR}"'"}, + "mapToFieldsSplitConfig": { + "language": "python", + "fields": { + "value": "1" + } + }, + "explodeTransform": {"fields": "word"}, + "combineTransform": { + "group_by": "word", + "combine": {"value": "sum"} + }, + "mapToFieldsCountConfig": { + "language": "python", + "fields": {"output": "word + \" - \" + str(value)"} + }, + "writeToTextTransform": {"path": "'"${TEMP_LOCATION}"'"} + }' +``` + +Single Line Run Example: +```sh +python -m apache_beam.yaml.main --yaml_pipeline_file="${PIPELINE_FILE}" --jinja_variables='{"readFromTextTransform": {"path": 
"gs://dataflow-samples/shakespeare/kinglear.txt"}, "mapToFieldsSplitConfig": {"language": "python", "fields":{"value":"1"}}, "explodeTransform":{"fields":"word"}, "combineTransform":{"group_by":"word", "combine":{"value":"sum"}}, "mapToFieldsCountConfig":{"language": "python", "fields":{"output":"word + \" - \" + str(value)"}}, "writeToTextTransform":{"path":"'"${TEMP_LOCATION}"'"}}' +``` diff --git a/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/macros/wordCountMacros.yaml b/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/macros/wordCountMacros.yaml new file mode 100644 index 000000000000..b3870693ef5f --- /dev/null +++ b/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/macros/wordCountMacros.yaml @@ -0,0 +1,64 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +{%- macro readFromTextTransform(params) -%} + +- name: Read from GCS + type: ReadFromText + config: + path: "{{ params.path }}" +{%- endmacro -%} + +{%- macro mapToFieldsSplitConfig(params) -%} +language: "{{ params.language }}" +fields: + value: "{{ params.fields.value }}" + word: + callable: |- + import re + def my_mapping(row): + return re.findall(r'[A-Za-z\']+', row.line.lower()) +{%- endmacro -%} + +{%- macro explodeTransform(params) -%} +- name: Explode word arrays + type: Explode + config: + fields: "{{ params.fields }}" +{%- endmacro -%} + +{%- macro combineTransform(params) -%} +- name: Count words + type: Combine + config: + group_by: "{{ params.group_by }}" + combine: + value: "{{ params.combine.value }}" +{%- endmacro -%} + +{%- macro mapToFieldsCountConfig(params) -%} +language: "{{ params.language }}" +fields: + output: '{{ params.fields.output }}' +{%- endmacro -%} + +{%- macro writeToTextTransform(params) -%} +- name: Write to GCS + type: WriteToText + config: + path: "{{ params.path }}" +{%- endmacro -%} diff --git a/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/wordCountImport.yaml b/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/wordCountImport.yaml new file mode 100644 index 000000000000..1058a30b607a --- /dev/null +++ b/sdks/python/apache_beam/yaml/examples/transforms/jinja/import/wordCountImport.yaml @@ -0,0 +1,69 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This example reads from a public file stored on Google Cloud. This +# requires authenticating with Google Cloud, or setting the file in +# `ReadFromText` to a local file. +# +# To set up Application Default Credentials, +# see https://cloud.google.com/docs/authentication/external/set-up-adc. +# +# This pipeline reads in a text file, counts distinct words found in the text, +# then writes a row containing each word and its count to a text file. + +{% import 'apache_beam/yaml/examples/transforms/jinja/import/macros/wordCountMacros.yaml' as macros %} + +pipeline: + type: chain + transforms: + +# Read in text file +{{ macros.readFromTextTransform(readFromTextTransform) | indent(4, true) }} + +# Split words and count occurrences + - name: Split words + type: MapToFields + config: +{{ macros.mapToFieldsSplitConfig(mapToFieldsSplitConfig) | indent(8, true) }} + +# Explode into individual words +{{ macros.explodeTransform(explodeTransform) | indent(4, true) }} + +# Group by word +{{ macros.combineTransform(combineTransform) | indent(4, true) }} + +# Format output to a single string consisting of `word - count` + - name: Format output + type: MapToFields + config: +{{ macros.mapToFieldsCountConfig(mapToFieldsCountConfig) | indent(8, true) }} + +# Write to text file on GCS, locally, etc +{{ macros.writeToTextTransform(writeToTextTransform) | indent(4, true) }} + +# Expected: +# Row(output='king - 311') +# Row(output='lear - 253') +# Row(output='dramatis - 1') +# Row(output='personae - 1') +# Row(output='of - 483') +# Row(output='britain - 2') +# Row(output='france - 32') +#
Row(output='duke - 26') +# Row(output='burgundy - 20') +# Row(output='cornwall - 75') diff --git a/sdks/python/apache_beam/yaml/integration_tests.py b/sdks/python/apache_beam/yaml/integration_tests.py index 38fa2689268e..733dd10d0286 100644 --- a/sdks/python/apache_beam/yaml/integration_tests.py +++ b/sdks/python/apache_beam/yaml/integration_tests.py @@ -33,7 +33,6 @@ from datetime import timezone import mock -import mysql.connector import psycopg2 import pytds import sqlalchemy @@ -286,26 +285,22 @@ def temp_mysql_database(): Exception: Any other exception encountered during the setup process. """ with MySqlContainer(init=True, dialect='pymysql') as mysql_container: - try: - # Make connection to temp database and create tmp table - engine = sqlalchemy.create_engine(mysql_container.get_connection_url()) - with engine.begin() as connection: - connection.execute( - sqlalchemy.text( - "CREATE TABLE tmp_table (value INTEGER, `rank` INTEGER);")) + # Make connection to temp database and create tmp table + engine = sqlalchemy.create_engine(mysql_container.get_connection_url()) + with engine.begin() as connection: + connection.execute( + sqlalchemy.text( + "CREATE TABLE tmp_table (value INTEGER, `rank` INTEGER);")) - # Construct the JDBC url for connections later on by tests - jdbc_url = ( - f"jdbc:mysql://{mysql_container.get_container_host_ip()}:" - f"{mysql_container.get_exposed_port(mysql_container.port)}/" - f"{mysql_container.dbname}?" - f"user={mysql_container.username}&" - f"password={mysql_container.password}") + # Construct the JDBC url for connections later on by tests + jdbc_url = ( + f"jdbc:mysql://{mysql_container.get_container_host_ip()}:" + f"{mysql_container.get_exposed_port(mysql_container.port)}/" + f"{mysql_container.dbname}?" 
+ f"user={mysql_container.username}&" + f"password={mysql_container.password}") - yield jdbc_url - except mysql.connector.Error as err: - logging.error("Error interacting with temporary MySQL DB: %s", err) - raise err + yield jdbc_url @contextlib.contextmanager diff --git a/sdks/python/container/ml/py310/base_image_requirements.txt b/sdks/python/container/ml/py310/base_image_requirements.txt index ba2ee0d85340..a58cc29ff2ec 100644 --- a/sdks/python/container/ml/py310/base_image_requirements.txt +++ b/sdks/python/container/ml/py310/base_image_requirements.txt @@ -67,8 +67,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -85,7 +85,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -98,25 +98,25 @@ guppy3==3.1.5 h11==0.16.0 h5py==3.14.0 hdfs==2.7.3 -hf-xet==1.1.7 +hf-xet==1.1.8 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 -keras==3.11.2 +keras==3.11.3 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 @@ -124,16 +124,13 @@ Markdown==3.8.2 markdown-it-py==4.0.0 MarkupSafe==3.0.2 mdurl==0.1.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 ml-dtypes==0.3.2 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 
mpmath==1.3.0 multidict==6.6.4 -mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.4.2 nltk==3.9.1 @@ -183,8 +180,9 @@ pydot==1.4.2 Pygments==2.19.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -198,7 +196,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rich==14.1.0 rpds-py==0.27.0 @@ -231,11 +229,12 @@ tokenizers==0.21.4 tomli==2.2.1 torch==2.7.1 tqdm==4.67.1 -transformers==4.55.2 +transformers==4.55.4 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/container/ml/py311/base_image_requirements.txt b/sdks/python/container/ml/py311/base_image_requirements.txt index 15a4050ab0f3..d51db46a30da 100644 --- a/sdks/python/container/ml/py311/base_image_requirements.txt +++ b/sdks/python/container/ml/py311/base_image_requirements.txt @@ -65,8 +65,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -83,7 +83,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -96,25 +96,25 @@ guppy3==3.1.5 h11==0.16.0 h5py==3.14.0 hdfs==2.7.3 -hf-xet==1.1.7 +hf-xet==1.1.8 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 
-jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 -keras==3.11.2 +keras==3.11.3 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 @@ -122,16 +122,13 @@ Markdown==3.8.2 markdown-it-py==4.0.0 MarkupSafe==3.0.2 mdurl==0.1.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 ml-dtypes==0.3.2 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 mpmath==1.3.0 multidict==6.6.4 -mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.5 nltk==3.9.1 @@ -181,8 +178,9 @@ pydot==1.4.2 Pygments==2.19.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -196,7 +194,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rich==14.1.0 rpds-py==0.27.0 @@ -228,11 +226,12 @@ threadpoolctl==3.6.0 tokenizers==0.21.4 torch==2.7.1 tqdm==4.67.1 -transformers==4.55.2 +transformers==4.55.4 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/container/ml/py312/base_image_requirements.txt b/sdks/python/container/ml/py312/base_image_requirements.txt index 488e4e27f486..f24d50a9a8ae 100644 --- a/sdks/python/container/ml/py312/base_image_requirements.txt +++ b/sdks/python/container/ml/py312/base_image_requirements.txt @@ -64,8 +64,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -82,7 +82,7 @@ google-cloud-storage==2.19.0 
google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -95,25 +95,25 @@ guppy3==3.1.5 h11==0.16.0 h5py==3.14.0 hdfs==2.7.3 -hf-xet==1.1.7 +hf-xet==1.1.8 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 -keras==3.11.2 +keras==3.11.3 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 @@ -121,16 +121,13 @@ Markdown==3.8.2 markdown-it-py==4.0.0 MarkupSafe==3.0.2 mdurl==0.1.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 ml-dtypes==0.3.2 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 mpmath==1.3.0 multidict==6.6.4 -mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.5 nltk==3.9.1 @@ -180,8 +177,9 @@ pydot==1.4.2 Pygments==2.19.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -195,7 +193,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rich==14.1.0 rpds-py==0.27.0 @@ -226,11 +224,12 @@ threadpoolctl==3.6.0 tokenizers==0.21.4 torch==2.7.1 tqdm==4.67.1 -transformers==4.55.2 +transformers==4.55.4 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/container/ml/py39/base_image_requirements.txt b/sdks/python/container/ml/py39/base_image_requirements.txt index 3785f612a4af..7b55eb7a8e7b 100644 --- 
a/sdks/python/container/ml/py39/base_image_requirements.txt +++ b/sdks/python/container/ml/py39/base_image_requirements.txt @@ -67,8 +67,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -85,7 +85,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -98,23 +98,23 @@ guppy3==3.1.5 h11==0.16.0 h5py==3.14.0 hdfs==2.7.3 -hf-xet==1.1.7 +hf-xet==1.1.8 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 keras==3.10.0 keyring==25.6.0 @@ -124,16 +124,13 @@ Markdown==3.8.2 markdown-it-py==3.0.0 MarkupSafe==3.0.2 mdurl==0.1.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 ml-dtypes==0.3.2 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 mpmath==1.3.0 multidict==6.6.4 -mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.2.1 nltk==3.9.1 @@ -183,8 +180,9 @@ pydot==1.4.2 Pygments==2.19.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -198,7 +196,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rich==14.1.0 rpds-py==0.27.0 @@ -231,11 +229,12 @@ 
tokenizers==0.21.4 tomli==2.2.1 torch==2.7.1 tqdm==4.67.1 -transformers==4.55.2 +transformers==4.55.4 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index 3f4deee29713..63d947772c2b 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -61,8 +61,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -79,7 +79,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -93,30 +93,27 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 MarkupSafe==3.0.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 multidict==6.6.4 -mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 oauth2client==4.1.3 @@ -147,8 +144,9 @@ pydantic_core==2.33.2 pydot==1.4.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 
pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -162,7 +160,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rpds-py==0.27.0 rsa==4.9.1 @@ -187,6 +185,7 @@ tqdm==4.67.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index d67181c22ba7..6ba596eeed3d 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -59,8 +59,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -77,7 +77,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -91,30 +91,27 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 MarkupSafe==3.0.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 multidict==6.6.4 -mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 oauth2client==4.1.3 @@ -145,8 +142,9 @@ 
pydantic_core==2.33.2 pydot==1.4.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -160,7 +158,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rpds-py==0.27.0 rsa==4.9.1 @@ -184,6 +182,7 @@ tqdm==4.67.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/container/py312/base_image_requirements.txt b/sdks/python/container/py312/base_image_requirements.txt index 35f29dbfa644..c709b57164a8 100644 --- a/sdks/python/container/py312/base_image_requirements.txt +++ b/sdks/python/container/py312/base_image_requirements.txt @@ -58,8 +58,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -76,7 +76,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -90,30 +90,27 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 MarkupSafe==3.0.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 mmh3==5.2.0 
mock==5.2.0 more-itertools==10.7.0 multidict==6.6.4 -mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 oauth2client==4.1.3 @@ -144,8 +141,9 @@ pydantic_core==2.33.2 pydot==1.4.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -159,7 +157,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rpds-py==0.27.0 rsa==4.9.1 @@ -183,6 +181,7 @@ tqdm==4.67.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/container/py313/base_image_requirements.txt b/sdks/python/container/py313/base_image_requirements.txt index fd7516f43295..7d73bf53a928 100644 --- a/sdks/python/container/py313/base_image_requirements.txt +++ b/sdks/python/container/py313/base_image_requirements.txt @@ -57,8 +57,8 @@ google-api-core==2.25.1 google-apitools==0.5.32 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -74,7 +74,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -88,13 +88,13 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 Jinja2==3.1.6 joblib==1.5.1 @@ -104,14 +104,11 @@ jsonschema-specifications==2025.4.1 keyring==25.6.0 
keyrings.google-artifactregistry-auth==1.1.2 MarkupSafe==3.0.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 multidict==6.6.4 -mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 oauth2client==4.1.3 @@ -142,8 +139,9 @@ pydantic_core==2.33.2 pydot==1.4.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.6.0 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -157,7 +155,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rpds-py==0.27.0 rsa==4.9.1 @@ -181,6 +179,7 @@ tqdm==4.67.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 urllib3==2.5.0 virtualenv-clone==0.5.7 websockets==15.0.1 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index 423c66980410..810dfcc2a6e5 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -61,8 +61,8 @@ google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.109.0 -google-cloud-bigquery==3.35.1 +google-cloud-aiplatform==1.110.0 +google-cloud-bigquery==3.36.0 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 google-cloud-core==2.4.3 @@ -79,7 +79,7 @@ google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.30.0 +google-genai==1.31.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -93,30 +93,27 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.138.2 +hypothesis==6.138.3 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 jaraco.classes==3.4.0 jaraco.context==6.0.1 -jaraco.functools==4.2.1 +jaraco.functools==4.3.0 jeepney==0.9.0 
Jinja2==3.1.6 joblib==1.5.1 jsonpickle==3.4.2 -jsonschema==4.25.0 +jsonschema==4.25.1 jsonschema-specifications==2025.4.1 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 MarkupSafe==3.0.2 -ujson==5.8.0 milvus-lite==2.5.1 -pymilvus==2.5.10 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 multidict==6.6.4 -mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.0.2 oauth2client==4.1.3 @@ -147,8 +144,9 @@ pydantic_core==2.33.2 pydot==1.4.2 PyHamcrest==2.1.0 PyJWT==2.10.1 -pymongo==4.14.0 -PyMySQL==1.1.1 +pymilvus==2.5.15 +pymongo==4.14.1 +PyMySQL==1.1.2 pyparsing==3.2.3 pyproject_hooks==1.2.0 pytest==7.4.4 @@ -162,7 +160,7 @@ PyYAML==6.0.2 redis==5.3.1 referencing==0.36.2 regex==2025.7.34 -requests==2.32.4 +requests==2.32.5 requests-mock==1.12.1 rpds-py==0.27.0 rsa==4.9.1 @@ -187,6 +185,7 @@ tqdm==4.67.1 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 +ujson==5.11.0 uritemplate==4.2.0 urllib3==2.5.0 virtualenv-clone==0.5.7 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index b88034174804..09288593d9fb 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -160,6 +160,8 @@ def cythonize(*args, **kwargs): 'pandas>=1.4.3,!=1.5.0,!=1.5.1,<2.3', ] +milvus_dependency = ['pymilvus>=2.5.10,<3.0.0'] + def find_by_ext(root_dir, ext): for root, _, files in os.walk(root_dir): @@ -442,14 +444,12 @@ def get_portability_package_data(): 'cryptography>=41.0.2', 'hypothesis>5.0.0,<7.0.0', 'virtualenv-clone>=0.5,<1.0', - 'mysql-connector-python>=9.3.0', 'python-tds>=1.16.1', 'sqlalchemy-pytds>=1.0.2', 'pg8000>=1.31.1', "PyMySQL>=1.1.0", - 'oracledb>=3.1.1', - 'milvus' - ], + 'oracledb>=3.1.1' + ] + milvus_dependency, 'gcp': [ 'cachetools>=3.1.0,<7', 'google-api-core>=2.0.0,<3', @@ -596,7 +596,7 @@ def get_portability_package_data(): ], 'xgboost': ['xgboost>=1.6.0,<2.1.3', 'datatable==1.0.0'], 'tensorflow-hub': ['tensorflow-hub>=0.14.0,<0.16.0'], - 'milvus': ['pymilvus>=2.5.10,<3.0.0'] + 'milvus': milvus_dependency }, zip_safe=False, # PyPI 
package information. diff --git a/website/www/site/content/en/documentation/sdks/yaml.md b/website/www/site/content/en/documentation/sdks/yaml.md index 73d1eebaae95..33fad5b25506 100644 --- a/website/www/site/content/en/documentation/sdks/yaml.md +++ b/website/www/site/content/en/documentation/sdks/yaml.md @@ -708,7 +708,7 @@ the yaml file can be parameterized with externally provided variables using the [jinja variable syntax](https://jinja.palletsprojects.com/en/stable/templates/#variables). The values are then passed via a `--jinja_variables` command line flag. -For example, one could start a pipeline with +For example, one could start a pipeline with: ``` pipeline: @@ -742,6 +742,80 @@ or writing dated sources and sinks, e.g. would write to files like `gs://path/to/2016/08/04/dated-output*.json`. +A user can also use the `% include` directive to pull in other common templates: + +/pipeline.yaml +```yaml +pipeline: + transforms: + - name: Read from GCS + type: ReadFromText + config: +# NOTE: Jinja does not re-indent included content, so the lines of the +# included file must already be indented to match the place where they are +# inserted. In this example readFromText.yaml is pre-indented so that its +# lines nest under `config:` once included into this pipeline. 
+{% include '/submodules/readFromText.yaml' %} + - name: Write to GCS + type: WriteToText + input: Read from GCS + config: + path: "gs://MY-BUCKET/wordCounts/" +``` + +/submodules/readFromText.yaml +```yaml + path: {{readFromText.path}} +``` + +This pipeline can be run like this: + +```sh +python -m apache_beam.yaml.main \ + --yaml_pipeline_file=pipeline.yaml \ + --jinja_variables='{"readFromText": {"path": "gs://dataflow-samples/shakespeare/kinglear.txt"}}' +``` + +The `% import` Jinja directive can also be used to pull in macros: + +/pipeline.yaml +```yaml +{% import '/macros.yaml' as macros %} + +pipeline: + type: chain + transforms: + +# Read in text file +{{ macros.readFromText(readFromText) | indent(4, true) }} + +# Write to text file on GCS, locally, etc. + - name: Write to GCS + type: WriteToText + input: Read from GCS + config: + path: "gs://MY-BUCKET/wordCounts/" +``` + +/macros.yaml +```yaml +{%- macro readFromText(params) -%} +- name: Read from GCS + type: ReadFromText + config: + path: "{{ params.path }}" +{%- endmacro -%} +``` + +This pipeline can be run with the same command as in the `% include` example +above. + +Jinja offers many more ways to reuse templates, including template inheritance; +see the Jinja documentation on [import](https://jinja.palletsprojects.com/en/stable/templates/#import) +and [template inheritance](https://jinja.palletsprojects.com/en/stable/templates/#inheritance). + +Full Jinja pipeline examples are available in the [Beam YAML Jinja examples directory](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/examples/transforms/jinja). + ## Other Resources * [Example pipeline](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/examples)