From 010f7d019be7b0da18ff97490706d34eec01cb0a Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 2 Dec 2024 14:28:30 +0000 Subject: [PATCH 1/2] md5 sums computation while listing files --- ait/commons/util/__main__.py | 6 +- ait/commons/util/command/list.py | 96 ++++++++++++++++++++++--- ait/commons/util/command/submit.py | 4 +- ait/commons/util/command/submit_file.py | 4 +- ait/commons/util/command/view.py | 4 +- ait/commons/util/provider_api_util.py | 2 +- 6 files changed, 98 insertions(+), 18 deletions(-) diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index 987197b..efa2d21 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -113,7 +113,8 @@ def parse_args(args): # parser_clear.add_argument('-a', action='store_true', help='clear all - selection and known dirs') parser_list = cmd_parser.add_parser('list', help='list contents of the area') - parser_list.add_argument('-b', action='store_true', help='list all areas in the S3 bucket (authorised users only)') + parser_list.add_argument('-processing', action='store_true', help='access the processed data (authorised users ' + 'only)') # parser_upload = cmd_parser.add_parser('upload', help='upload files to the area') # group_upload = parser_upload.add_mutually_exclusive_group(required=True) @@ -143,7 +144,8 @@ def parse_args(args): group_delete.add_argument('-d', action='store_true', help='delete upload area and contents (authorised users only)') parser_sync = cmd_parser.add_parser('sync', - help='copy data from selected upload area to ingest upload area (authorised users only)') + help='copy data from selected upload area to ingest upload area (authorised ' + 'users only)') parser_sync.add_argument('INGEST_UPLOAD_AREA', help='Ingest upload area', type=valid_ingest_upload_area) ps = [parser] diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index ef5261e..cf79917 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -1,5 +1,10 @@ +import hashlib +import csv + from ait.commons.util.common import format_err from ait.commons.util.local_state import get_selected_area +from ait.commons.util.user_profile import get_profile +from urllib.parse import urlparse def print_area(k, area): @@ -20,6 +25,34 @@ def print_area(k, area): print() +def get_s3_path(): + while True: + s3_path = input("Enter the S3 path (e.g., s3://bucket-name/folder/): ").strip() + parsed_url = urlparse(s3_path) + + if parsed_url.scheme == 's3' and parsed_url.netloc: + return s3_path + else: + print("Invalid S3 path. Please enter a valid S3 path starting with 's3://'.") + + +def calculate_md5(s3_client, bucket_name, key): + md5_hash = hashlib.md5() + + try: + # Stream the object in chunks + response = s3_client.get_object(Bucket=bucket_name, Key=key) + + for chunk in response['Body'].iter_chunks(chunk_size=8192): + md5_hash.update(chunk) + + return md5_hash.hexdigest() + except Exception as e: + print(f"Failed to compute MD5 for {key}: {e}") + + return None + + class CmdList: """ admin and user @@ -29,22 +62,67 @@ class CmdList: def __init__(self, aws, args): self.aws = aws self.args = args + self.user = get_profile('morphic-util').username + self.processing = getattr(self.args, 'processing', None) self.s3_cli = self.aws.common_session.client('s3') def run(self): - selected_area = get_selected_area() # select area is a S3 bucket + if self.processing: + if self.user != 'morphic-admin': + return False, "Admin function only" + else: + print("Access granted") + + s3_path = get_s3_path() + self.list_s3_files(s3_path) + + return True, None + + else: + selected_area = get_selected_area() # select area is a S3 bucket + + if not selected_area: + return False, 'No area selected' + + try: + self.list_bucket_contents(selected_area) + # print_count(folder_count + files_count) + return True, None + + except Exception as e: + return False, format_err(e, 'list') + + def list_s3_files(self, s3_path): + parsed_url = urlparse(s3_path) + bucket_name = parsed_url.netloc + prefix = parsed_url.path.lstrip('/') + output_file = 's3_file_md5s.tsv' + + with open(output_file, 'w', newline='') as csvfile: + tsv_writer = csv.writer(csvfile, delimiter=',') + tsv_writer.writerow(['File Name', 'MD5 Hash']) # Write header row + + try: + response = self.s3_cli.list_objects_v2(Bucket=bucket_name, Prefix=prefix) + + if 'Contents' in response: + print(f"\nFiles in '{s3_path}'") - if not selected_area: - return False, 'No area selected' + for obj in response['Contents']: + file_key = obj['Key'] + if not file_key.endswith('/'): # Skip folders + md5_hash = calculate_md5(self.s3_cli, bucket_name, file_key) - try: - self.list_bucket_contents(selected_area) - # print_count(folder_count + files_count) - return True, None + if md5_hash: + print(f"{file_key} - MD5: {md5_hash}") + tsv_writer.writerow([file_key, md5_hash]) # Write to file + else: + print("\nNo files found.") + except Exception as e: + print(f"\nError: {e}") - except Exception as e: - return False, format_err(e, 'list') + print(f"\nResults saved to {output_file}") def list_bucket_contents(self, selected_area, prefix=''): result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/', Prefix=prefix) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 8dc7130..678a6be 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -9,7 +9,7 @@ from ait.commons.util.spreadsheet_util import SubmissionError from ait.commons.util.user_profile import get_profile -from ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.provider_api_util import ProviderApi def matching_expression_alteration_and_cell_line(cell_line, expression_alteration): @@ -206,7 +206,7 @@ def __init__(self, args): self.access_token = get_profile('morphic-util').access_token self.type = getattr(self.args, 'type', None) self.file = getattr(self.args, 'file', None) - self.provider_api = APIProvider(self.BASE_URL) + self.provider_api = ProviderApi(self.BASE_URL) def run(self): """ diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 90c129f..0eb2e35 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -10,7 +10,7 @@ from ait.commons.util.command.submit import CmdSubmit, get_entity_id_from_hal_link, create_new_submission_envelope from ait.commons.util.command.upload import CmdUpload from ait.commons.util.user_profile import get_profile -from ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.provider_api_util import ProviderApi from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter, ValidationError, \ merge_library_preparation_sequencing_file, merge_cell_line_and_differentiated_cell_line, \ merge_differentiated_cell_line_and_library_preparation, SubmissionError @@ -88,7 +88,7 @@ def __init__(self, args): self.user_profile = get_profile('morphic-util') self.access_token = self.user_profile.access_token self.aws = Aws(self.user_profile) - self.provider_api = APIProvider(self.BASE_URL) + self.provider_api = ProviderApi(self.BASE_URL) self.validation_errors = [] self.submission_errors = [] self.submission_envelope_id = None diff --git a/ait/commons/util/command/view.py b/ait/commons/util/command/view.py index aa8fc74..3ba3533 100644 --- a/ait/commons/util/command/view.py +++ b/ait/commons/util/command/view.py @@ -1,5 +1,5 @@ from ait.commons.util.aws_client import Aws -from ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.provider_api_util import ProviderApi from ait.commons.util.user_profile import get_profile @@ -10,7 +10,7 @@ def __init__(self, args): self.args = args self.access_token = get_profile('morphic-util').access_token self.user_profile = get_profile('morphic-util') - self.provider_api = APIProvider(self.base_url) + self.provider_api = ProviderApi(self.base_url) if hasattr(self.args, 'dataset') and self.args.dataset is not None: self.dataset = self.args.dataset diff --git a/ait/commons/util/provider_api_util.py b/ait/commons/util/provider_api_util.py index 851b052..24a15fe 100644 --- a/ait/commons/util/provider_api_util.py +++ b/ait/commons/util/provider_api_util.py @@ -1,7 +1,7 @@ import requests -class APIProvider: +class ProviderApi: def __init__(self, base_url): self.base_url = base_url From fa13fd04ac400e2937b9cdeaa3f488f3398c56b4 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 9 Dec 2024 09:47:04 +0000 Subject: [PATCH 2/2] don't delete the dataset object --- ait/commons/util/command/submit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 678a6be..e1c972d 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -1282,5 +1282,5 @@ def delete_dataset(self, dataset, access_token): print(f"Deleting {data_file}") self.provider_api.delete(f"{self.BASE_URL}/files/{data_file}", access_token) - print(f"\nDeleting the dataset: {dataset}") - self.provider_api.delete(f"{self.BASE_URL}/datasets/{dataset}", access_token) + # print(f"\nDeleting the dataset: {dataset}") + # self.provider_api.delete(f"{self.BASE_URL}/datasets/{dataset}", access_token)