From 131555b97b8c572fd078bc11ef6e1cdb2f02e2d1 Mon Sep 17 00:00:00 2001 From: Tony Lampada Date: Thu, 9 Oct 2025 11:56:32 -0500 Subject: [PATCH 1/5] Add script to generate S3 signed URLs for image files in JSONL format --- scripts/generateS3SignedUrls.sh | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 scripts/generateS3SignedUrls.sh diff --git a/scripts/generateS3SignedUrls.sh b/scripts/generateS3SignedUrls.sh new file mode 100644 index 00000000..dbad4937 --- /dev/null +++ b/scripts/generateS3SignedUrls.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Script to generate S3 signed URLs for image files in JSONL format +# Usage: ./generateS3SignedUrls.sh [output-file] [expiration-seconds] [parallel-jobs] +# Or with curl: +# curl -fsSL https://gist.githubusercontent.com/tonylampada/20b7bc984a455f53e2d07f88b33bf43c/raw/generateS3SignedUrls.sh | bash -s -- s3://bucket/path output.jsonl + +set -e + +# Check if S3 path is provided +if [ -z "$1" ]; then + echo "Error: S3 path is required" + echo "Usage: $0 [output-file] [expiration-seconds] [parallel-jobs]" + echo "Example: $0 s3://my-bucket/images/ output.jsonl 3600 8" + exit 1 +fi + +S3_PATH="$1" +OUTPUT_FILE="${2:-signed_urls.jsonl}" +EXPIRATION="${3:-21600}" # Default: 6 hours +PARALLEL_JOBS="${4:-20}" # Default: 20 parallel jobs + +# Remove trailing slash from S3 path if present +S3_PATH="${S3_PATH%/}" + +# Extract bucket name from S3_PATH +BUCKET=$(echo "$S3_PATH" | sed 's|s3://||' | cut -d'/' -f1) + +# Image file extensions to include (regex pattern for grep) +IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$' + +# Function to process a single file +process_file() { + local file_path="$1" + local bucket="$2" + local expiration="$3" + + # Construct full S3 URI + local s3_uri="s3://${bucket}/${file_path}" + + # Generate signed URL + local signed_url=$(aws s3 presign "$s3_uri" --expires-in "$expiration" 2>/dev/null) + + if [ $? -eq 0 ]; then + # Create name with full path using double underscores instead of slashes + local name_with_path=$(echo "$file_path" | sed 's|/|__|g') + + # Output JSONL + echo "{\"name\": \"$name_with_path\", \"url\": \"$signed_url\"}" + fi +} + +# Export function and variables for xargs +export -f process_file +export BUCKET +export EXPIRATION + +echo "Listing files from $S3_PATH..." + +# Get list of all files, filter for images, and process in parallel +aws s3 ls "$S3_PATH/" --recursive | \ + awk '{print $4}' | \ + grep -iE "$IMAGE_PATTERN" | \ + xargs -I {} -P "$PARALLEL_JOBS" bash -c 'process_file "$@"' _ {} "$BUCKET" "$EXPIRATION" | \ + tee "$OUTPUT_FILE" + +echo "" +echo "Done! Signed URLs written to $OUTPUT_FILE" +echo "Total images processed: $(wc -l < "$OUTPUT_FILE")" \ No newline at end of file From 19b96da77ac28312e1bf12cc97c362a53d3088ec Mon Sep 17 00:00:00 2001 From: Tony Lampada Date: Thu, 9 Oct 2025 11:58:56 -0500 Subject: [PATCH 2/5] update curl url --- scripts/generateS3SignedUrls.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generateS3SignedUrls.sh b/scripts/generateS3SignedUrls.sh index dbad4937..c99e6761 100644 --- a/scripts/generateS3SignedUrls.sh +++ b/scripts/generateS3SignedUrls.sh @@ -3,7 +3,7 @@ # Script to generate S3 signed URLs for image files in JSONL format # Usage: ./generateS3SignedUrls.sh [output-file] [expiration-seconds] [parallel-jobs] # Or with curl: -# curl -fsSL https://gist.githubusercontent.com/tonylampada/20b7bc984a455f53e2d07f88b33bf43c/raw/generateS3SignedUrls.sh | bash -s -- s3://bucket/path output.jsonl +# curl -fsSL https://raw.githubusercontent.com/roboflow/roboflow-python/main/scripts/generateS3SignedUrls.sh | bash -s -- s3://bucket/path output.jsonl set -e From d83ca6ca2544c57b7044a1a7cbb1d14b2b43e5ab Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:00:06 +0000 Subject: [PATCH 3/5] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/generateS3SignedUrls.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generateS3SignedUrls.sh b/scripts/generateS3SignedUrls.sh index c99e6761..9391ee2f 100644 --- a/scripts/generateS3SignedUrls.sh +++ b/scripts/generateS3SignedUrls.sh @@ -66,4 +66,4 @@ aws s3 ls "$S3_PATH/" --recursive | \ echo "" echo "Done! Signed URLs written to $OUTPUT_FILE" -echo "Total images processed: $(wc -l < "$OUTPUT_FILE")" \ No newline at end of file +echo "Total images processed: $(wc -l < "$OUTPUT_FILE")" From 6a7d5580ff7dffdcb257526c5ddb6dc963a464b1 Mon Sep 17 00:00:00 2001 From: Tony Lampada Date: Thu, 9 Oct 2025 12:03:21 -0500 Subject: [PATCH 4/5] generate signed URLs for GCS --- scripts/generateGCSSignedUrls.sh | 108 +++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 scripts/generateGCSSignedUrls.sh diff --git a/scripts/generateGCSSignedUrls.sh b/scripts/generateGCSSignedUrls.sh new file mode 100644 index 00000000..c61e2282 --- /dev/null +++ b/scripts/generateGCSSignedUrls.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Script to generate GCS signed URLs for image files in JSONL format +# Usage: ./listgcs.sh [output-file] [expiration-seconds] [parallel-jobs] + +set -e + +# Check if GCS path is provided +if [ -z "$1" ]; then + echo "Error: GCS path is required" + echo "Usage: $0 [output-file] [expiration-seconds] [parallel-jobs]" + echo "Example: $0 gs://my-bucket/images/ output.jsonl 21600 8" + exit 1 +fi + +GCS_PATH="$1" +OUTPUT_FILE="${2:-signed_urls.jsonl}" +EXPIRATION_SECONDS="${3:-21600}" # Default: 6 hours +PARALLEL_JOBS="${4:-20}" # Default: 20 parallel jobs + +# Remove trailing slash from GCS path if present +GCS_PATH="${GCS_PATH%/}" + +# Convert seconds to duration format for gcloud (e.g., 21600s) +EXPIRATION="${EXPIRATION_SECONDS}s" + +# Image file extensions to include (regex pattern for grep) +IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$' + +# Function to find an appropriate service account +find_service_account() { + # First, try to get the default compute service account for the current project + local project_id=$(gcloud config get-value project 2>/dev/null) + if [ -n "$project_id" ]; then + local compute_sa="${project_id}-compute@developer.gserviceaccount.com" + if gcloud iam service-accounts describe "$compute_sa" >/dev/null 2>&1; then + echo "$compute_sa" + return 0 + fi + fi + + # If that doesn't work, try to find any service account in the project + local sa_list=$(gcloud iam service-accounts list --format="value(email)" --limit=1 2>/dev/null) + if [ -n "$sa_list" ]; then + echo "$sa_list" | head -n 1 + return 0 + fi + + return 1 +} + +# Try to find a service account to use +SERVICE_ACCOUNT=$(find_service_account) +if [ -z "$SERVICE_ACCOUNT" ]; then + echo "Warning: No service account found. Attempting to sign URLs without impersonation." + echo "If this fails, you may need to:" + echo "1. Authenticate with a service account: gcloud auth activate-service-account --key-file=key.json" + echo "2. Or ensure you have appropriate service accounts in your project" + echo "" +fi + +# Function to process a single file +process_file() { + local object="$1" + local service_account="$2" + local expiration="$3" + + # Create signed URL using gcloud storage sign-url + local signed_url_output + if [ -n "$service_account" ]; then + signed_url_output=$(gcloud storage sign-url --http-verb=GET --duration="$expiration" --impersonate-service-account="$service_account" "$object" 2>/dev/null) + else + signed_url_output=$(gcloud storage sign-url --http-verb=GET --duration="$expiration" "$object" 2>/dev/null) + fi + + if [ $? -eq 0 ] && [ -n "$signed_url_output" ]; then + # Extract just the signed_url from the YAML output + local signed_url=$(echo "$signed_url_output" | grep "signed_url:" | sed 's/signed_url: //') + + if [ -n "$signed_url" ]; then + # Extract the path after the bucket name and convert slashes to double underscores + local path_part=$(echo "$object" | sed 's|gs://[^/]*/||') + local name_with_path=$(echo "$path_part" | sed 's|/|__|g') + + # Output JSONL + echo "{\"name\": \"$name_with_path\", \"url\": \"$signed_url\"}" + fi + fi +} + +# Export function and variables for xargs +export -f process_file +export SERVICE_ACCOUNT +export EXPIRATION + +echo "Listing files from $GCS_PATH..." + +# Get list of all files, filter for images, and process in parallel +gsutil ls -r "$GCS_PATH" 2>/dev/null | \ + grep -v '/$' | \ + grep -v ':$' | \ + grep -iE "$IMAGE_PATTERN" | \ + xargs -I {} -P "$PARALLEL_JOBS" bash -c 'process_file "$@"' _ {} "$SERVICE_ACCOUNT" "$EXPIRATION" | \ + tee "$OUTPUT_FILE" + +echo "" +echo "Done! Signed URLs written to $OUTPUT_FILE" +echo "Total images processed: $(wc -l < "$OUTPUT_FILE")" \ No newline at end of file From 753db107d989c91be161830b17382bf1ca354ebb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:03:44 +0000 Subject: [PATCH 5/5] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/generateGCSSignedUrls.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/generateGCSSignedUrls.sh b/scripts/generateGCSSignedUrls.sh index c61e2282..fd44f5a5 100644 --- a/scripts/generateGCSSignedUrls.sh +++ b/scripts/generateGCSSignedUrls.sh @@ -38,14 +38,14 @@ find_service_account() { return 0 fi fi - + # If that doesn't work, try to find any service account in the project local sa_list=$(gcloud iam service-accounts list --format="value(email)" --limit=1 2>/dev/null) if [ -n "$sa_list" ]; then echo "$sa_list" | head -n 1 return 0 fi - + return 1 } @@ -76,7 +76,7 @@ process_file() { if [ $? -eq 0 ] && [ -n "$signed_url_output" ]; then # Extract just the signed_url from the YAML output local signed_url=$(echo "$signed_url_output" | grep "signed_url:" | sed 's/signed_url: //') - + if [ -n "$signed_url" ]; then # Extract the path after the bucket name and convert slashes to double underscores local path_part=$(echo "$object" | sed 's|gs://[^/]*/||') @@ -105,4 +105,4 @@ gsutil ls -r "$GCS_PATH" 2>/dev/null | \ echo "" echo "Done! Signed URLs written to $OUTPUT_FILE" -echo "Total images processed: $(wc -l < "$OUTPUT_FILE")" \ No newline at end of file +echo "Total images processed: $(wc -l < "$OUTPUT_FILE")"