From 13533916c6cb1239f0132cdeb34ae184eb248657 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 2 Apr 2024 17:03:42 +0100 Subject: [PATCH 01/55] Upload areas naming changed, using submission envelope UUID, morphic-bio// --- ait/commons/util/aws_cognito_authenticator.py | 23 ------------------- ait/commons/util/command/create.py | 4 ++-- ait/commons/util/command/list.py | 11 +-------- ait/commons/util/command/select.py | 14 ++--------- ait/commons/util/command/upload.py | 9 -------- ait/commons/util/settings/morphic_util.py | 7 ++---- requirements.txt | 4 ++-- 7 files changed, 9 insertions(+), 63 deletions(-) diff --git a/ait/commons/util/aws_cognito_authenticator.py b/ait/commons/util/aws_cognito_authenticator.py index 7527a97..e83d633 100644 --- a/ait/commons/util/aws_cognito_authenticator.py +++ b/ait/commons/util/aws_cognito_authenticator.py @@ -101,29 +101,6 @@ def get_secret_manager_client(self, username, password): else: self.is_user = True - for attr in user_attribute_list: - if attr['Name'] == 'custom:DPC': - self.center_name = attr['Value'].lower() - - if attr['Name'] == 'custom:directory_access': - self.user_dir_list = attr['Value'].replace(" ", "").split(',') - - if self.user_dir_list is not None: - self.user_dir_list = ['morphic-' + self.center_name + '/' + dataset_dir for dataset_dir in - self.user_dir_list] - - if self.is_user: - if self.center_name is None: - print('User does not have an assigned center name and therefore cannot perform any operations ' - 'with this system') - sys.exit(1) - - if self.user_dir_list is None: - if self.is_user: - print('User does not have access to any upload areas or to perform any operations with this' - 'system') - sys.exit(1) - identity = boto3.client('cognito-identity', region_name=DEFAULT_REGION) identity_id = identity.get_id( diff --git a/ait/commons/util/command/create.py b/ait/commons/util/command/create.py index e2e5da6..e1e7c66 100644 --- a/ait/commons/util/command/create.py +++ 
b/ait/commons/util/command/create.py @@ -24,7 +24,7 @@ def run(self): if self.aws.is_user: return False, 'You don\'t have permission to use this command' - area_name = self.args.NAME # S3 bucket folder name + area_name = self.args.NAME # S3 bucket folder name perms = self.args.p # optional str, default 'ux' center_name = self.args.DPC # morphic DPC @@ -33,7 +33,7 @@ def run(self): # new upload areas to be created with tagging instead of metadata # upload area format - morphic-DPC/area_name/ s3_client.put_object(Bucket=self.aws.bucket_name, - Key=('morphic-' + center_name.lower() + '/' + area_name + '/'), + Key=(area_name + '/'), Tagging=f'name={area_name}&perms={perms}') if perms == DEFAULT_PERMS: diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index 853319e..45154e6 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -37,16 +37,7 @@ def run(self): if not selected_area: return False, 'No area selected' - else: - if self.aws.is_user: - dir_prefix = 'morphic-' + self.aws.center_name + '/' - - if dir_prefix not in selected_area: - selected_area = dir_prefix + selected_area - - if selected_area.rstrip(selected_area[-1]) not in self.aws.user_dir_list: - return False, "Upload area does not exist or you don't have access to this area" - + try: selected_area += '' if selected_area.endswith('/') else '/' n, p = self.get_name_and_perms(selected_area) diff --git a/ait/commons/util/command/select.py b/ait/commons/util/command/select.py index 60f3ff5..c70e6c8 100644 --- a/ait/commons/util/command/select.py +++ b/ait/commons/util/command/select.py @@ -17,19 +17,9 @@ def run(self): if self.args.AREA: key = self.args.AREA if self.args.AREA.endswith('/') else f'{self.args.AREA}/' - if self.aws.is_user: - key = 'morphic-' + self.aws.center_name + '/' + key - if self.aws.obj_exists(key): - if not self.aws.is_user: - set_selected_area(key) - return True, f'Selected upload area is {key}' - else: - if 
key.rstrip(key[-1]) in self.aws.user_dir_list: - set_selected_area(key) - return True, f'Selected upload area is {key}' - else: - return False, f'Upload area does not exist or you do not have access to this area - {key}' + set_selected_area(key) + return True, f'Selected upload area is {key}' else: return False, f'Upload area does not exist - {key}' else: diff --git a/ait/commons/util/command/upload.py b/ait/commons/util/command/upload.py index d2bcbe3..31ecaed 100755 --- a/ait/commons/util/command/upload.py +++ b/ait/commons/util/command/upload.py @@ -75,15 +75,6 @@ def run(self): if not selected_area: return False, 'No area selected' - if self.aws.is_user: - dir_prefix = 'morphic-' + self.aws.center_name + '/' - - if dir_prefix not in selected_area: - selected_area = dir_prefix + selected_area - - if selected_area.rstrip(selected_area[-1]) not in self.aws.user_dir_list: - return False, "Upload area does not exist or you don't have access to this area" - try: ps = [] diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index a13ef06..0c6d94a 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,10 +1,9 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.8' +VERSION = '0.0.9' DESC = 'CLI tool for uploading data to Morphic AWS S3 bucket' -AUTHOR = 'hca-ingest-dev' -AUTHOR_EMAIL = 'hca-ingest-dev@ebi.ac.uk' +AUTHOR = 'morphic-bio-dev' # when true, displays exception details; otherwise user-friendly error message DEBUG_MODE = False @@ -37,8 +36,6 @@ COGNITO_IDENTITY_POOL_ID = 'eu-west-2:87ba188b-51fc-42e0-9172-a1a01cda8ed0' COGNITO_USER_POOL_ID = 'eu-west-2_2BpGQDRSU' IAM_USER = 'morphic-admin' -AWS_ACCOUNT = '596988661787' AWS_SECRET_NAME_AK_BUCKET = 'AK-bucket' AWS_SECRET_NAME_SK_BUCKET = 'SK-bucket' -AWS_SECRET_NAME_MORPHIC_BUCKET = 's3-bucket' diff --git a/requirements.txt b/requirements.txt index bddbdc3..28d6373 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -1,5 +1,5 @@ -boto3>=1.26.153 -botocore>=1.29.153 +boto3>=1.23.10 +botocore>=1.26.10 filetype==1.0.7 requests>=2.20.0, <3 urllib3<1.27, >=1.25.4 From ae49d30751bd97fe5c07aeb058da578a9dbca688 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 23 Apr 2024 16:16:06 +0100 Subject: [PATCH 02/55] Metadata submission support to morphic-util --- ait/commons/util/__main__.py | 4 + ait/commons/util/aws_client.py | 39 ++++---- ait/commons/util/aws_cognito_authenticator.py | 15 +-- ait/commons/util/bucket_policy.py | 28 +++--- ait/commons/util/cmd.py | 5 + ait/commons/util/command/config.py | 23 +++-- ait/commons/util/command/select.py | 2 +- ait/commons/util/command/submit.py | 96 +++++++++++++++++++ ait/commons/util/settings/morphic_util.py | 8 +- ait/commons/util/user_profile.py | 6 +- 10 files changed, 164 insertions(+), 62 deletions(-) create mode 100644 ait/commons/util/command/submit.py diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index 8034ae4..7d0c82f 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -79,6 +79,10 @@ def parse_args(args): parser_config.add_argument('PASSWORD', help='AWS Cognito password', nargs='?') parser_config.add_argument('--bucket', help='use BUCKET instead of default bucket') + parser_config = cmd_parser.add_parser('submit', help='submit your metadata') + parser_config.add_argument('--type', help='data type you are submitting, e.g. 
study, dataset') + parser_config.add_argument('--file', help='your metadata') + parser_create = cmd_parser.add_parser('create', help='create an upload area (authorised users only)') parser_create.add_argument('NAME', help='name for the new area', type=valid_project_name) parser_create.add_argument('DPC', help='center name of the submitter', type=valid_project_name) diff --git a/ait/commons/util/aws_client.py b/ait/commons/util/aws_client.py index 38b069a..1a99d94 100755 --- a/ait/commons/util/aws_client.py +++ b/ait/commons/util/aws_client.py @@ -4,7 +4,7 @@ from ait.commons.util.aws_cognito_authenticator import AwsCognitoAuthenticator from ait.commons.util.settings import AWS_SECRET_NAME_AK_BUCKET, AWS_SECRET_NAME_SK_BUCKET, \ - AWS_SECRET_NAME_MORPHIC_BUCKET, COGNITO_MORPHIC_UTIL_ADMIN, S3_REGION + COGNITO_MORPHIC_UTIL_ADMIN, S3_REGION def static_bucket_name(): @@ -42,21 +42,22 @@ def get_bucket_name(self, secret_mgr_client): """ # access policy can't be attached to a secret # GetSecretValue action should be allowed for user - resp = secret_mgr_client.get_secret_value(SecretId=AWS_SECRET_NAME_MORPHIC_BUCKET) + resp = secret_mgr_client.get_secret_value(SecretId='') secret_str = resp['SecretString'] self.bucket_name = json.loads(secret_str)['s3-bucket'] return self.bucket_name def new_session(self): aws_cognito_authenticator = AwsCognitoAuthenticator(self) - secret_manager_client = aws_cognito_authenticator.get_secret_manager_client(self.user_profile.username, - self.user_profile.password) + secret_manager_client = aws_cognito_authenticator.secret_manager_client_instance(self.user_profile.username, + self.user_profile.password) if secret_manager_client is None: - print('Failure while re-establishing Amazon Web Services session, report this error to the DRACC admin') + print( + 'Failure while re-establishing Amazon Web Services session, report this error to the MorPhiC DRACC admin') raise Exception else: - self.is_user = aws_cognito_authenticator.is_valid_user() + 
self.is_user = aws_cognito_authenticator.is_user self.user_dir_list = aws_cognito_authenticator.get_user_dir_list() self.center_name = aws_cognito_authenticator.get_center_name() @@ -87,20 +88,14 @@ def is_valid_user(self): def obj_exists(self, key): """ - return true if key exists, else false - A folder/directory is an s3 object with key / - Note: s3://my-bucket/folder != s3://my-bucket/folder/ - Refer to https://www.peterbe.com/plog/fastest-way-to-find-out-if-a-file-exists-in-s3 - for comparison between client.list_objects_v2 and client.head_object to make this check. - Also check https://stackoverflow.com/questions/33842944/check-if-a-key-exists-in-a-bucket-in-s3-using-boto3 - which suggests using Object.load() - which does a HEAD request, however, user doesn't have - s3:GetObject permission by default, so this will fail for them. + Returns True if the bucket exists, else False. """ - response = self.new_session().client('s3').list_objects_v2( - Bucket=self.bucket_name, - Prefix=key, - ) - for obj in response.get('Contents', []): - if obj['Key'] == key: - return True - return False + client = self.common_session.client('s3') + try: + client.head_bucket( + Bucket=key + ) + return True + except client.exceptions.NoSuchBucket as e: + print(f"The bucket '{key}' does not exist. 
Reason: {e}") + return False diff --git a/ait/commons/util/aws_cognito_authenticator.py b/ait/commons/util/aws_cognito_authenticator.py index e83d633..d31190e 100644 --- a/ait/commons/util/aws_cognito_authenticator.py +++ b/ait/commons/util/aws_cognito_authenticator.py @@ -1,7 +1,5 @@ import sys - import boto3 - from ait.commons.util.settings import DEFAULT_PROFILE, DEFAULT_REGION, COGNITO_CLIENT_ID, COGNITO_IDENTITY_POOL_ID, \ COGNITO_USER_POOL_ID from ait.commons.util.user_profile import set_profile @@ -18,8 +16,7 @@ def __init__(self, args): self.user_dir_list = None self.center_name = None # custom attribute DPC - def validate_cognito_identity(self, profile, username, password): - + def is_registered_user(self, profile, username, password): try: profile = profile if profile else DEFAULT_PROFILE @@ -64,7 +61,7 @@ def validate_cognito_identity(self, profile, username, password): if session_token: set_profile(profile, DEFAULT_REGION, aws_cred['AccessKeyId'], aws_cred['SecretKey'], - session_token, username, password) + session_token, access_token, username, password) return True else: @@ -74,8 +71,7 @@ def validate_cognito_identity(self, profile, username, password): except Exception as e: return False - def get_secret_manager_client(self, username, password): - + def secret_manager_client_instance(self, username, password): try: if username and password: client = boto3.client("cognito-idp", region_name=DEFAULT_REGION, aws_access_key_id="NONE", @@ -90,11 +86,8 @@ def get_secret_manager_client(self, username, password): # Getting the user details. 
access_token = response["AuthenticationResult"]["AccessToken"] id_token = response["AuthenticationResult"]["IdToken"] - response = client.get_user(AccessToken=access_token) - username = response['Username'] - user_attribute_list = response['UserAttributes'] if username.endswith('Admin') or username.endswith('admin'): self.is_user = False @@ -133,7 +126,7 @@ def get_secret_manager_client(self, username, password): except Exception as e: return None - def is_valid_user(self): + def is_user(self): return self.is_user def get_user_dir_list(self): diff --git a/ait/commons/util/bucket_policy.py b/ait/commons/util/bucket_policy.py index 2ea4afa..8734783 100755 --- a/ait/commons/util/bucket_policy.py +++ b/ait/commons/util/bucket_policy.py @@ -1,4 +1,4 @@ -from ait.commons.util.settings import AWS_ACCOUNT, IAM_USER +from ait.commons.util.settings import IAM_USER """ User groups: @@ -50,6 +50,7 @@ ALLOWED_PERMS = ['u', 'ud', 'ux', 'udx'] DEFAULT_PERMS = 'ux' + # constraints - in bucket policy # ux denyDelete -> u # ux allowDownload -> udx @@ -57,18 +58,19 @@ def allowDownloadStmt(): return { - "Sid": "AllowDownload", - "Effect": "Allow", - "Action": "s3:GetObject", - "Resource": [], - "Principal": { "AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} -} + "Sid": "AllowDownload", + "Effect": "Allow", + "Action": "s3:GetObject", + "Resource": [], + "Principal": {"AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} + } + def denyDeleteStmt(): return { - "Sid": "DenyDelete", - "Effect": "Deny", - "Action": "s3:DeleteObject", - "Resource": [], - "Principal": { "AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} -} + "Sid": "DenyDelete", + "Effect": "Deny", + "Action": "s3:DeleteObject", + "Resource": [], + "Principal": {"AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} + } diff --git a/ait/commons/util/cmd.py b/ait/commons/util/cmd.py index dfff6ea..c0548a9 100644 --- a/ait/commons/util/cmd.py +++ b/ait/commons/util/cmd.py @@ -10,6 +10,7 @@ from 
ait.commons.util.command.download import CmdDownload from ait.commons.util.command.list import CmdList from ait.commons.util.command.select import CmdSelect +from ait.commons.util.command.submit import CmdSubmit from ait.commons.util.command.sync import CmdSync from ait.commons.util.command.upload import CmdUpload from ait.commons.util.local_state import get_bucket, set_attr, get_attr @@ -37,6 +38,10 @@ def __init__(self, args): success, msg = CmdConfig(args).run() print(msg) + elif args.command == 'submit': + success, msg = CmdSubmit(args).run() + print(msg) + else: if profile_exists(args.profile): self.user_profile = get_profile(args.profile) diff --git a/ait/commons/util/command/config.py b/ait/commons/util/command/config.py index 43ac758..9eeb73b 100755 --- a/ait/commons/util/command/config.py +++ b/ait/commons/util/command/config.py @@ -15,22 +15,25 @@ def __init__(self, args): def run(self): + global valid_user + try: profile = self.args.profile if self.args.profile else DEFAULT_PROFILE + aws_cognito_authenticator = AwsCognitoAuthenticator(self) if self.args.bucket: set_bucket(self.args.bucket) if self.args.USERNAME and self.args.PASSWORD: - aws_cognito_authenticator = AwsCognitoAuthenticator(self) - - valid_user = aws_cognito_authenticator.validate_cognito_identity(profile, self.args.USERNAME, - self.args.PASSWORD) - - # check if valid user - if valid_user: - return True, 'Valid credentials' - else: - return False, 'Invalid credentials' + valid_user = aws_cognito_authenticator.is_registered_user(profile, self.args.USERNAME, + self.args.PASSWORD) + else: + print("No credentials provided!") + + # check if valid user + if valid_user: + return True, 'Valid credentials' + else: + return False, 'Invalid credentials' except Exception as e: return False, format_err(e, 'config') diff --git a/ait/commons/util/command/select.py b/ait/commons/util/command/select.py index c70e6c8..8b9bdbb 100644 --- a/ait/commons/util/command/select.py +++ 
b/ait/commons/util/command/select.py @@ -15,7 +15,7 @@ def __init__(self, aws, args): def run(self): try: if self.args.AREA: - key = self.args.AREA if self.args.AREA.endswith('/') else f'{self.args.AREA}/' + key = self.args.AREA if self.aws.obj_exists(key): set_selected_area(key) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py new file mode 100644 index 0000000..f953269 --- /dev/null +++ b/ait/commons/util/command/submit.py @@ -0,0 +1,96 @@ +import requests +import json +from urllib.parse import urlparse + +from ait.commons.util.user_profile import get_profile + + +def get_id_from_url(url): + parsed_url = urlparse(url) + path_parts = parsed_url.path.split('/') + return path_parts[2] + + +class CmdSubmit: + base_url = 'http://localhost:8080' + + def __init__(self, args): + self.args = args + self.access_token = get_profile('morphic-util').access_token + self.type = self.args.type + self.data = None + + def run(self): + submission_envelope_create_url = f"{self.base_url}/submissionEnvelopes/updateSubmissions" + + if self.type == 'study': + study_create_url = self.post(submission_envelope_create_url, 'studies') + + submission_envelope_id = get_id_from_url(study_create_url) + + study_create_response = self.post(study_create_url, 'submissionEnvelopes') + link_study_to_submission_envelope_response = self.put(study_create_response + '/' + submission_envelope_id, + 'self') + + study_id = get_id_from_url(link_study_to_submission_envelope_response) + + print("Study created successfully: " + study_id) + + return True, study_id + elif self.type == 'dataset': + dataset_create_url = self.post(submission_envelope_create_url, 'datasets') + + submission_envelope_id = get_id_from_url(dataset_create_url) + + dataset_create_response = self.post(dataset_create_url, 'submissionEnvelopes') + link_dataset_to_submission_envelope_response = self.put( + dataset_create_response + '/' + submission_envelope_id, + 'self') + + dataset_id = 
get_id_from_url(link_dataset_to_submission_envelope_response) + + # Check if both study and dataset IDs are available + if dataset_id is not None: + # Prompt user to link dataset to study + link_to_study = input("Do you want to link this dataset to a study? (yes/no): ").lower() + + if link_to_study == 'yes': + study_id = input("Input study id: ").lower() + print("Linking dataset " + dataset_id + " to study " + study_id) + # Perform the linking operation here + + self.put(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", None) + print("Dataset linked successfully to study: " + study_id) + else: + print("Dataset created successfully.") + return True, dataset_id + + def post(self, url, data_type_in_hal_link): + # Read content of the file + if self.args.file: + with open(self.args.file, 'r') as file: + self.data = json.load(file) + else: + self.data = None + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.access_token}' + } + response = requests.post(url, headers=headers, json=self.data) + response_data = response.json() + url = response_data['_links'][data_type_in_hal_link]['href'] + + return url + + def put(self, url, data_type_in_hal_link): + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.access_token}' + } + response = requests.put(url, headers=headers) + response_data = response.json() + + if data_type_in_hal_link is not None: + url = response_data['_links'][data_type_in_hal_link]['href'] + return url diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 0c6d94a..4485cea 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -31,10 +31,10 @@ LOCAL_STATE_FILE = USER_HOME + '/.hca-util' # Cognito and IAM -COGNITO_MORPHIC_UTIL_ADMIN = 'morphic-admin' -COGNITO_CLIENT_ID = '6poq2i04qt3pj5rkpg51patcrk' -COGNITO_IDENTITY_POOL_ID = 'eu-west-2:87ba188b-51fc-42e0-9172-a1a01cda8ed0' 
-COGNITO_USER_POOL_ID = 'eu-west-2_2BpGQDRSU' +COGNITO_MORPHIC_UTIL_ADMIN = 'morphic-dev-admin' +COGNITO_CLIENT_ID = '178j951qnfuheicm2m5rqqvg6q' +COGNITO_IDENTITY_POOL_ID = 'eu-west-2:d6531e9c-020d-4ee8-bf3b-255393c500e9' +COGNITO_USER_POOL_ID = 'eu-west-2_b4EyaLNCM' IAM_USER = 'morphic-admin' AWS_SECRET_NAME_AK_BUCKET = 'AK-bucket' diff --git a/ait/commons/util/user_profile.py b/ait/commons/util/user_profile.py index acc1c11..3f44eb6 100755 --- a/ait/commons/util/user_profile.py +++ b/ait/commons/util/user_profile.py @@ -9,8 +9,10 @@ def __init__(self): self.access_key = None self.secret_key = None self.session_token = None + self.access_token = None self.username = None self.password = None + self.idToken = None self.region = None def __repr__(self): @@ -42,6 +44,7 @@ def get_profile(profile): user_profile.access_key = credentials[profile].get('aws_access_key_id') user_profile.secret_key = credentials[profile].get('aws_secret_access_key') user_profile.session_token = credentials[profile].get('aws_session_token') + user_profile.access_token = credentials[profile].get('aws_cognito_access_token') user_profile.username = credentials[profile].get('aws_cognito_username') user_profile.password = credentials[profile].get('aws_cognito_password') @@ -57,7 +60,7 @@ def get_profile(profile): return user_profile -def set_profile(profile, region, access_key, secret_key, session_token, username, password): +def set_profile(profile, region, access_key, secret_key, session_token, access_token, username, password): """.aws/config [profile {profile}] region = {region} @@ -91,6 +94,7 @@ def set_profile(profile, region, access_key, secret_key, session_token, username credentials.add_section(f'{profile}') credentials.set(f'{profile}', 'aws_cognito_username', username) credentials.set(f'{profile}', 'aws_cognito_password', password) + credentials.set(f'{profile}', 'aws_cognito_access_token', access_token) with open(AWS_CREDENTIALS_FILE, 'w') as out: credentials.write(out) From 
114bf5ec59b21e9db47f7a275f6db8c5994b4d67 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 24 Apr 2024 11:24:37 +0100 Subject: [PATCH 03/55] Additional option to link dataset and study, making dataset no-body --- ait/commons/util/__main__.py | 1 + ait/commons/util/command/submit.py | 28 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index 7d0c82f..f4ba974 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -82,6 +82,7 @@ def parse_args(args): parser_config = cmd_parser.add_parser('submit', help='submit your metadata') parser_config.add_argument('--type', help='data type you are submitting, e.g. study, dataset') parser_config.add_argument('--file', help='your metadata') + parser_config.add_argument('--study', help='your study reference') parser_create = cmd_parser.add_parser('create', help='create an upload area (authorised users only)') parser_create.add_argument('NAME', help='name for the new area', type=valid_project_name) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index f953269..273b58b 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -49,29 +49,37 @@ def run(self): dataset_id = get_id_from_url(link_dataset_to_submission_envelope_response) - # Check if both study and dataset IDs are available + # Check if dataset ID is available if dataset_id is not None: - # Prompt user to link dataset to study - link_to_study = input("Do you want to link this dataset to a study? 
(yes/no): ").lower() + # Check if user has passed study to link, else prompt + if self.args.study is not None: + study_id = self.args.study - if link_to_study == 'yes': - study_id = input("Input study id: ").lower() - print("Linking dataset " + dataset_id + " to study " + study_id) - # Perform the linking operation here + self.link_dataset_study(dataset_id, study_id) + # Prompt user to link dataset to study + else: + link_to_study = input("Do you want to link this dataset to a study? (yes/no): ").lower() - self.put(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", None) - print("Dataset linked successfully to study: " + study_id) + if link_to_study == 'yes': + study_id = input("Input study id: ").lower() + self.link_dataset_study(dataset_id, study_id) else: print("Dataset created successfully.") return True, dataset_id + def link_dataset_study(self, dataset_id, study_id): + print("Linking dataset " + dataset_id + " to study " + study_id) + # Perform the linking operation here + self.put(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", None) + print("Dataset linked successfully to study: " + study_id) + def post(self, url, data_type_in_hal_link): # Read content of the file if self.args.file: with open(self.args.file, 'r') as file: self.data = json.load(file) else: - self.data = None + self.data = {} headers = { 'Content-Type': 'application/json', From ee88e9b8716b1a56a0c14b2bed7ec5aa406be9a0 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 25 Apr 2024 10:42:39 +0100 Subject: [PATCH 04/55] fix list and upload, messages improved --- ait/commons/util/aws_client.py | 23 +++++++++++++- ait/commons/util/command/list.py | 6 ++-- ait/commons/util/command/select.py | 2 +- ait/commons/util/command/submit.py | 48 ++++++++++++++++-------------- ait/commons/util/command/upload.py | 23 +++++++------- ait/commons/util/common.py | 1 + 6 files changed, 65 insertions(+), 38 deletions(-) diff --git a/ait/commons/util/aws_client.py 
b/ait/commons/util/aws_client.py index 1a99d94..18ad5b9 100755 --- a/ait/commons/util/aws_client.py +++ b/ait/commons/util/aws_client.py @@ -1,6 +1,7 @@ import json import boto3 +import botocore from ait.commons.util.aws_cognito_authenticator import AwsCognitoAuthenticator from ait.commons.util.settings import AWS_SECRET_NAME_AK_BUCKET, AWS_SECRET_NAME_SK_BUCKET, \ @@ -86,7 +87,7 @@ def is_valid_credentials(self): def is_valid_user(self): return self.is_user - def obj_exists(self, key): + def s3_bucket_exists(self, key): """ Returns True if the bucket exists, else False. """ @@ -99,3 +100,23 @@ def obj_exists(self, key): except client.exceptions.NoSuchBucket as e: print(f"The bucket '{key}' does not exist. Reason: {e}") return False + + def data_file_exists(self, bucket_name, key): + """ + Check if an object exists in the specified S3 bucket. + + Parameters: + - bucket_name (str): The name of the S3 bucket. + - key (str): The key of the object in the bucket. + + Returns: + - bool: True if the object exists, False otherwise. 
+ """ + client = self.common_session.client('s3') + + try: + client.head_object(Bucket=bucket_name, Key=key) + return True + except client.exceptions.ClientError: + return False + diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index 45154e6..f6849d4 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -37,9 +37,9 @@ def run(self): if not selected_area: return False, 'No area selected' - + try: - selected_area += '' if selected_area.endswith('/') else '/' + # selected_area += '' if selected_area.endswith('/') else '/' n, p = self.get_name_and_perms(selected_area) self.print_area(selected_area, dict(name=n, perms=p)) @@ -99,7 +99,7 @@ def list_area_contents(self, selected_area): contents = [] s3_resource = self.aws.common_session.resource('s3') - bucket = s3_resource.Bucket(self.aws.bucket_name) + bucket = s3_resource.Bucket(selected_area) for obj in bucket.objects.filter(Prefix=selected_area): k = obj.key diff --git a/ait/commons/util/command/select.py b/ait/commons/util/command/select.py index 8b9bdbb..833fd52 100644 --- a/ait/commons/util/command/select.py +++ b/ait/commons/util/command/select.py @@ -17,7 +17,7 @@ def run(self): if self.args.AREA: key = self.args.AREA - if self.aws.obj_exists(key): + if self.aws.s3_bucket_exists(key): set_selected_area(key) return True, f'Selected upload area is {key}' else: diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 273b58b..a9b5807 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -24,30 +24,13 @@ def run(self): submission_envelope_create_url = f"{self.base_url}/submissionEnvelopes/updateSubmissions" if self.type == 'study': - study_create_url = self.post(submission_envelope_create_url, 'studies') - - submission_envelope_id = get_id_from_url(study_create_url) - - study_create_response = self.post(study_create_url, 'submissionEnvelopes') - 
link_study_to_submission_envelope_response = self.put(study_create_response + '/' + submission_envelope_id, - 'self') - - study_id = get_id_from_url(link_study_to_submission_envelope_response) - - print("Study created successfully: " + study_id) + study_id = self.create_study(submission_envelope_create_url) return True, study_id elif self.type == 'dataset': - dataset_create_url = self.post(submission_envelope_create_url, 'datasets') - - submission_envelope_id = get_id_from_url(dataset_create_url) - - dataset_create_response = self.post(dataset_create_url, 'submissionEnvelopes') - link_dataset_to_submission_envelope_response = self.put( - dataset_create_response + '/' + submission_envelope_id, - 'self') + dataset_id = self.create_dataset(submission_envelope_create_url) - dataset_id = get_id_from_url(link_dataset_to_submission_envelope_response) + print("Dataset created successfully: " + dataset_id) # Check if dataset ID is available if dataset_id is not None: @@ -63,9 +46,30 @@ def run(self): if link_to_study == 'yes': study_id = input("Input study id: ").lower() self.link_dataset_study(dataset_id, study_id) + return True, dataset_id else: - print("Dataset created successfully.") - return True, dataset_id + print("Unsupported type") + return False, "Unsupported type" + + def create_dataset(self, submission_envelope_create_url): + dataset_create_url = self.post(submission_envelope_create_url, 'datasets') + submission_envelope_id = get_id_from_url(dataset_create_url) + dataset_create_response = self.post(dataset_create_url, 'submissionEnvelopes') + link_dataset_to_submission_envelope_response = self.put( + dataset_create_response + '/' + submission_envelope_id, + 'self') + dataset_id = get_id_from_url(link_dataset_to_submission_envelope_response) + return dataset_id + + def create_study(self, submission_envelope_create_url): + study_create_url = self.post(submission_envelope_create_url, 'studies') + submission_envelope_id = get_id_from_url(study_create_url) + 
study_create_response = self.post(study_create_url, 'submissionEnvelopes') + link_study_to_submission_envelope_response = self.put(study_create_response + '/' + submission_envelope_id, + 'self') + study_id = get_id_from_url(link_study_to_submission_envelope_response) + print("Study created successfully: " + study_id) + return study_id def link_dataset_study(self, dataset_id, study_id): print("Linking dataset " + dataset_id + " to study " + study_id) diff --git a/ait/commons/util/command/upload.py b/ait/commons/util/command/upload.py index 31ecaed..6ec848c 100755 --- a/ait/commons/util/command/upload.py +++ b/ait/commons/util/command/upload.py @@ -20,12 +20,12 @@ def __init__(self, aws, args): self.args = args self.files = [] - def upload_file(self, data_file, key): + def upload_file(self, selected_area, data_file, destination_file): file_size = os.path.getsize(data_file) - if not self.args.o and self.aws.obj_exists(key): - print(f"{data_file} already exists. Use -o to overwrite.") + if not self.args.o and self.aws.data_file_exists(selected_area, destination_file): + print(f"{destination_file} already exists. 
Use -o to overwrite.") elif file_size == 0: print(f"{data_file} is an empty file") @@ -41,26 +41,27 @@ def upload_file(self, data_file, key): content_type = file_type.mime content_type += '; dcp-type=data' - s3.Bucket(self.aws.bucket_name).upload_file(Filename=data_file, - Key=key, - Callback=ProgressBar(target=data_file, total=file_size), - ExtraArgs={'ContentType': content_type} - ) + s3.Bucket(selected_area).upload_file(Filename=data_file, + Key=destination_file, + Callback=ProgressBar(target=data_file, total=file_size), + ExtraArgs={'ContentType': content_type} + ) def upload_files(self, data_files, prefix): + selected_area = prefix with ThreadPoolExecutor() as executor: futures = { - executor.submit(self.upload_file, data_file, - f"{prefix}{os.path.basename(data_file)}"): data_file + executor.submit(self.upload_file, selected_area, data_file, + os.path.basename(data_file)): data_file for data_file in data_files } # collect each finished job success = True for future in concurrent.futures.as_completed(futures): + data_file = futures[future] # Get the associated data_file try: - data_file = futures[future] future.result() # read the result of the future object except Exception as ex: print(f"Exception raised for {data_file}: ", ex) diff --git a/ait/commons/util/common.py b/ait/commons/util/common.py index 9345554..aced1e3 100755 --- a/ait/commons/util/common.py +++ b/ait/commons/util/common.py @@ -8,6 +8,7 @@ INGEST_UPLOAD_AREA_PREFIX = 's3://org-hca-data-archive-upload-' + def gen_uuid(): return str(uuid.uuid4()) From a11bd5756ea5e706dd394e4af05ec42c1e51a4dd Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 25 Apr 2024 12:20:35 +0100 Subject: [PATCH 05/55] call the dev API and not localhost and increment version --- ait/commons/util/command/submit.py | 2 +- ait/commons/util/settings/morphic_util.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 
a9b5807..0b7b7cf 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -12,7 +12,7 @@ def get_id_from_url(url): class CmdSubmit: - base_url = 'http://localhost:8080' + base_url = 'https://api.ingest.dev.archive.morphic.bio/' def __init__(self, args): self.args = args diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 4485cea..47b6a2c 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,9 +1,10 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.9' +VERSION = '0.0.10' DESC = 'CLI tool for uploading data to Morphic AWS S3 bucket' AUTHOR = 'morphic-bio-dev' +AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' # when true, displays exception details; otherwise user-friendly error message DEBUG_MODE = False From 4ddf532779711460be56baf93db9580b07ad0d08 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 26 Apr 2024 13:06:09 +0100 Subject: [PATCH 06/55] fixing list when there are files and folders both. 
--- ait/commons/util/command/list.py | 123 ++++++++++--------------------- 1 file changed, 39 insertions(+), 84 deletions(-) diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index f6849d4..31788e0 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -2,6 +2,18 @@ from ait.commons.util.local_state import get_selected_area +def print_area(k, area): + print(k, end=' ') + p = '' + if 'perms' in area: + p = area.get('perms') or '' + print(p.ljust(3), end=' ') + if 'name' in area: + n = area.get('name') + print(f'{n}' if n else '', end=' ') + print() + + class CmdList: """ admin and user @@ -15,98 +27,41 @@ def __init__(self, aws, args): self.s3_cli = self.aws.common_session.client('s3') def run(self): + selected_area = get_selected_area() + + if not selected_area: + return False, 'No area selected' - if self.args.b: # list all areas in bucket - if self.aws.is_user: - return False, 'You don\'t have permission to use this command' - - try: - folder_count = 0 - for area in self.list_bucket_areas(): - k = area["key"] - self.print_area(k, area) - folder_count += 1 - print_count(folder_count) - return True, None - - except Exception as e: - return False, format_err(e, 'list') - - else: # list selected area contents - selected_area = get_selected_area() - - if not selected_area: - return False, 'No area selected' - - try: - # selected_area += '' if selected_area.endswith('/') else '/' - n, p = self.get_name_and_perms(selected_area) - self.print_area(selected_area, dict(name=n, perms=p)) - - file_count = 0 - for k in self.list_area_contents(selected_area): - print(k) - if not k.endswith('/'): - file_count += 1 - - print_count(file_count) - return True, None - except Exception as e: - return False, format_err(e, 'list') - - def print_area(self, k, area): - print(k, end=' ') - p = '' - if 'perms' in area: - p = area.get('perms') or '' - print(p.ljust(3), end=' ') - if 'name' in area: - n = area.get('name') - 
print(f'{n}' if n else '', end=' ') - print() - - def get_name_and_perms(self, k): - n, p = None, None try: - tagSet = self.s3_cli.get_object_tagging(Bucket=self.aws.bucket_name, Key=k) - - if tagSet and tagSet['TagSet']: - kv = dict((tag['Key'], tag['Value']) for tag in tagSet['TagSet']) - n = kv.get('name', None) - p = kv.get('perms', None) - else: # for backward compatibility get name and perms from metadata - if not self.aws.is_user: # only admin can retrieve metadata (head_object) - resp = self.s3_cli.head_object(Bucket=self.aws.bucket_name, Key=k) - if resp and resp['Metadata']: - meta = resp['Metadata'] - n = meta.get('name', None) - p = meta.get('perms', None) - except: - pass - return n, p - - def list_bucket_areas(self): + # selected_area += '' if selected_area.endswith('/') else '/' + folder_count = 0 + for area in self.list_bucket_areas(selected_area): + k = area["key"] + print_area(k, area) + folder_count += 1 + print_count(folder_count) + return True, None + + except Exception as e: + return False, format_err(e, 'list') + + def list_bucket_areas(self, selected_area): areas = [] - result = self.s3_cli.list_objects_v2(Bucket=self.aws.bucket_name, Delimiter='/') + result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/') + + # Folders dirs = result.get('CommonPrefixes', []) for d in dirs: k = d.get('Prefix') - n, p = self.get_name_and_perms(k) - areas.append(dict(key=k, name=n, perms=p)) - return areas - - def list_area_contents(self, selected_area): - contents = [] + areas.append({'key': k}) - s3_resource = self.aws.common_session.resource('s3') - bucket = s3_resource.Bucket(selected_area) + # Files + files = result.get('Contents', []) + for f in files: + k = f.get('Key') + areas.append({'key': k}) - for obj in bucket.objects.filter(Prefix=selected_area): - k = obj.key - if k != selected_area: - contents.append(k) - - return contents + return areas def print_count(count): From 7795606293d37657afed7e865a45a2f78f2218b8 Mon Sep 17 00:00:00 
2001 From: dipayan1985 Date: Mon, 29 Apr 2024 17:12:18 -0700 Subject: [PATCH 07/55] fixing delete --- ait/commons/util/command/delete.py | 95 +++++------------------------- ait/commons/util/command/submit.py | 6 +- 2 files changed, 17 insertions(+), 84 deletions(-) diff --git a/ait/commons/util/command/delete.py b/ait/commons/util/command/delete.py index 637ca49..d377b64 100644 --- a/ait/commons/util/command/delete.py +++ b/ait/commons/util/command/delete.py @@ -1,8 +1,3 @@ -import json - -from botocore.exceptions import ClientError - -from ait.commons.util.command.area import CmdArea from ait.commons.util.common import format_err from ait.commons.util.local_state import get_selected_area @@ -13,6 +8,7 @@ ''' + class CmdDelete: """ both admin and user, though user can't delete folder @@ -31,31 +27,10 @@ def run(self): return False, 'No area selected' try: - if self.args.d: # delete area - if self.aws.is_user: - return False, 'You don\'t have permission to use this command' - - confirm = input(f'Confirm delete upload area {selected_area}? Y/y to proceed: ') - - if confirm.lower() == 'y': - print('Deleting...') - - deleted_keys = self.delete_upload_area(selected_area, incl_selected_area=True) - for k in deleted_keys: - print(k) - - # delete bucket policy for user-folder permissions - # only admin who has perms to set policy can do this - self.clear_area_perms_from_bucket_policy(selected_area) - - # clear selected area - CmdArea.clear(False) - return True, None - if self.args.a: # delete all files - + confirm = input(f'Confirm delete all contents from {selected_area}? 
Y/y to proceed: ') - + if confirm.lower() == 'y': print('Deleting...') @@ -71,13 +46,13 @@ def run(self): # you may have perm x but not d (to load or even do a head object) # so use obj_exists - prefix = selected_area + p - keys = self.all_keys(prefix) + prefix = p + keys = self.all_keys(selected_area, prefix) if keys: for k in keys: try: - self.delete_s3_object(k) + self.delete_s3_object(selected_area, k) print(k + ' Done.') except Exception as ex: if 'AccessDenied' in str(ex): @@ -94,73 +69,31 @@ def run(self): return False, format_err(e, 'delete') # based on obj_exists method - def all_keys(self, prefix): + def all_keys(self, selected_area, prefix): keys = [] response = self.aws.common_session.client('s3').list_objects_v2( - Bucket=self.aws.bucket_name, + Bucket=selected_area, Prefix=prefix, ) for obj in response.get('Contents', []): keys.append(obj['Key']) - + return keys - def delete_s3_object(self, key): + def delete_s3_object(self, selected_area, key): s3_resource = self.aws.common_session.resource('s3') - s3_obj = s3_resource.ObjectSummary(self.aws.bucket_name, key) + s3_obj = s3_resource.ObjectSummary(selected_area, key) s3_obj.delete() return key def delete_upload_area(self, selected_area, incl_selected_area=False): s3_resource = self.aws.common_session.resource('s3') - bucket = s3_resource.Bucket(self.aws.bucket_name) + bucket = s3_resource.Bucket(selected_area) deleted_keys = [] - objs_to_delete = bucket.objects.filter(Prefix=selected_area) if incl_selected_area else filter(lambda obj: obj.key != selected_area, bucket.objects.filter(Prefix=selected_area)) + objs_to_delete = bucket.objects.filter() if incl_selected_area else filter( + lambda obj: obj.key != selected_area, bucket.objects.filter()) for obj in objs_to_delete: obj.delete() deleted_keys.append(obj.key) return deleted_keys - - def clear_area_perms_from_bucket_policy(self, selected_area): - s3_resource = self.aws.common_session.resource('s3') - return 
CmdDelete.delete_dir_perms_from_bucket_policy(s3_resource, self.aws.bucket_name, selected_area) - - @staticmethod - def delete_dir_perms_from_bucket_policy(s3_res, bucket_name, area_name): - bucket_policy = s3_res.BucketPolicy(bucket_name) - try: - policy_str = bucket_policy.policy # throws NoSuchBucketPolicy - except ClientError: - policy_str = '' - - if policy_str: - policy = json.loads(policy_str) - policy_updated = False - - # remove any statement affecting single resource - # (this also maintains backward compatibility with the previous way of adding - # a statement per upload area) - for stmt in policy['Statement']: - - if isinstance(stmt['Resource'], str) and area_name in stmt['Resource']: - policy_updated = True - policy['Statement'].remove(stmt) # cannot modify if removing item while iterating over list - - # now check statement with resource list - for stmt in policy['Statement']: - if isinstance(stmt['Resource'], list): - # remove resource from resource list of statement but not statement - for res in stmt['Resource']: - if area_name in res: - policy_updated = True - stmt['Resource'].remove(res) - - if policy_updated: - try: - if policy['Statement']: - bucket_policy.put(Policy=json.dumps(policy)) # throws MalformedPolicy (policy document exceeds the maximum allowed size of 20480 bytes) - else: - bucket_policy.delete() - except ClientError: - pass diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 0b7b7cf..87f1ffd 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -47,9 +47,9 @@ def run(self): study_id = input("Input study id: ").lower() self.link_dataset_study(dataset_id, study_id) return True, dataset_id - else: - print("Unsupported type") - return False, "Unsupported type" + else: + print("Unsupported type") + return False, "Unsupported type" def create_dataset(self, submission_envelope_create_url): dataset_create_url = self.post(submission_envelope_create_url, 'datasets') 
From 9e89a39f6de51dce31da99f3c8bac8c10bfc6be5 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 13 May 2024 14:50:51 +0100 Subject: [PATCH 08/55] fix list --- ait/commons/util/aws_client.py | 6 +- ait/commons/util/aws_cognito_authenticator.py | 2 +- ait/commons/util/cmd.py | 4 +- ait/commons/util/command/config.py | 1 + ait/commons/util/command/create.py | 80 ++----------------- ait/commons/util/command/delete.py | 10 ++- ait/commons/util/command/download.py | 2 +- ait/commons/util/command/list.py | 27 +++---- 8 files changed, 31 insertions(+), 101 deletions(-) diff --git a/ait/commons/util/aws_client.py b/ait/commons/util/aws_client.py index 18ad5b9..60deab1 100755 --- a/ait/commons/util/aws_client.py +++ b/ait/commons/util/aws_client.py @@ -15,7 +15,7 @@ def static_bucket_name(): class Aws: def __init__(self, user_profile): - self.is_user = False # not admin + self.is_user = True # not admin self.user_dir_list = None self.center_name = None self.secret_key = None @@ -55,7 +55,8 @@ def new_session(self): if secret_manager_client is None: print( - 'Failure while re-establishing Amazon Web Services session, report this error to the MorPhiC DRACC admin') + 'Failure while re-establishing Amazon Web Services session, report this error to the MorPhiC DRACC ' + 'admin') raise Exception else: self.is_user = aws_cognito_authenticator.is_user @@ -119,4 +120,3 @@ def data_file_exists(self, bucket_name, key): return True except client.exceptions.ClientError: return False - diff --git a/ait/commons/util/aws_cognito_authenticator.py b/ait/commons/util/aws_cognito_authenticator.py index d31190e..3ae21e0 100644 --- a/ait/commons/util/aws_cognito_authenticator.py +++ b/ait/commons/util/aws_cognito_authenticator.py @@ -12,7 +12,7 @@ class AwsCognitoAuthenticator: def __init__(self, args): self.args = args - self.is_user = False # not admin + self.is_user = True # not admin self.user_dir_list = None self.center_name = None # custom attribute DPC diff --git 
a/ait/commons/util/cmd.py b/ait/commons/util/cmd.py index c0548a9..abebddf 100644 --- a/ait/commons/util/cmd.py +++ b/ait/commons/util/cmd.py @@ -5,7 +5,7 @@ from ait.commons.util.aws_client import Aws, static_bucket_name from ait.commons.util.command.config import CmdConfig -from ait.commons.util.command.create import CmdCreate +from ait.commons.util.command.create import run from ait.commons.util.command.delete import CmdDelete from ait.commons.util.command.download import CmdDownload from ait.commons.util.command.list import CmdList @@ -87,7 +87,7 @@ def check_version(self): def execute(self, args): if args.command == 'create': - success, msg = CmdCreate(self.aws, args).run() + success, msg = run() self.exit(success, msg) elif args.command == 'select': diff --git a/ait/commons/util/command/config.py b/ait/commons/util/command/config.py index 9eeb73b..87119e4 100755 --- a/ait/commons/util/command/config.py +++ b/ait/commons/util/command/config.py @@ -21,6 +21,7 @@ def run(self): profile = self.args.profile if self.args.profile else DEFAULT_PROFILE aws_cognito_authenticator = AwsCognitoAuthenticator(self) + # TODO: review the below bucket in args if self.args.bucket: set_bucket(self.args.bucket) diff --git a/ait/commons/util/command/create.py b/ait/commons/util/command/create.py index e1e7c66..7a0bf9b 100644 --- a/ait/commons/util/command/create.py +++ b/ait/commons/util/command/create.py @@ -1,10 +1,10 @@ -import json +from ait.commons.util.aws_client import Aws -from botocore.exceptions import ClientError -from ait.commons.util.aws_client import Aws -from ait.commons.util.bucket_policy import DEFAULT_PERMS, allowDownloadStmt, denyDeleteStmt -from ait.commons.util.common import format_err +# TODO: review +def run(): + return False, ('create is no longer supported as upload areas (buckets) ' + 'are created while metadata submission') class CmdCreate: @@ -16,73 +16,3 @@ class CmdCreate: def __init__(self, aws: Aws, args): self.aws = aws self.args = args - - def 
run(self): - if not self.aws: - return False, 'You need configure your profile first' - - if self.aws.is_user: - return False, 'You don\'t have permission to use this command' - - area_name = self.args.NAME # S3 bucket folder name - perms = self.args.p # optional str, default 'ux' - center_name = self.args.DPC # morphic DPC - - try: - s3_client = self.aws.common_session.client('s3') - # new upload areas to be created with tagging instead of metadata - # upload area format - morphic-DPC/area_name/ - s3_client.put_object(Bucket=self.aws.bucket_name, - Key=(area_name + '/'), - Tagging=f'name={area_name}&perms={perms}') - - if perms == DEFAULT_PERMS: - pass # default perms as set in user policy (ux) applies - no need for further actions (deny or allow) - else: - # get bucket policy - bucket_policy = self.aws.common_session.resource('s3').BucketPolicy(self.aws.bucket_name) - try: - policy_str = bucket_policy.policy - except ClientError: - policy_str = '' - - if policy_str: - policy_json = json.loads(policy_str) - else: # no bucket policy - policy_json = json.loads('{ "Version": "2012-10-17", "Statement": [] }') - - allow_stmt = None - deny_stmt = None - - for stmt in policy_json['Statement']: - if stmt['Sid'] == 'AllowDownload': - allow_stmt = stmt - elif stmt['Sid'] == 'DenyDelete': - deny_stmt = stmt - - if 'd' in perms: # e.g 'ud' or 'udx' - # allow download - self.update_perms(policy_json, allow_stmt, allowDownloadStmt(), area_name) - - if 'x' not in perms: # e.g. 
'u' or 'ud' - # deny delete - self.update_perms(policy_json, deny_stmt, denyDeleteStmt(), area_name) - - try: - bucket_policy.put(Policy=json.dumps(policy_json)) - except ClientError: - pass - - return True, 'Created upload area with name ' + area_name + ' for ' + center_name + ' DPC' - - except Exception as e: - return False, format_err(e, 'create') - - def update_perms(self, policy, stmt, template, area): - if not stmt: - stmt = template - policy['Statement'].append(stmt) - if isinstance(stmt['Resource'], str): - stmt['Resource'] = [stmt['Resource']] + [f'arn:aws:s3:::{self.aws.bucket_name}/{area}/*'] - elif isinstance(stmt['Resource'], list): - stmt['Resource'].append(f'arn:aws:s3:::{self.aws.bucket_name}/{area}/*') diff --git a/ait/commons/util/command/delete.py b/ait/commons/util/command/delete.py index d377b64..426fa61 100644 --- a/ait/commons/util/command/delete.py +++ b/ait/commons/util/command/delete.py @@ -34,7 +34,8 @@ def run(self): if confirm.lower() == 'y': print('Deleting...') - deleted_keys = self.delete_upload_area(selected_area, incl_selected_area=False) + deleted_keys = self.delete_all_files_from_s3_bucket(selected_area, incl_selected_area=False) + for k in deleted_keys: print(k) @@ -42,6 +43,7 @@ def run(self): if self.args.PATH: # list of files and dirs to delete print('Deleting...') + for p in self.args.PATH: # you may have perm x but not d (to load or even do a head object) # so use obj_exists @@ -52,7 +54,7 @@ def run(self): if keys: for k in keys: try: - self.delete_s3_object(selected_area, k) + self.delete_singe_file_from_s3_bucket(selected_area, k) print(k + ' Done.') except Exception as ex: if 'AccessDenied' in str(ex): @@ -80,13 +82,13 @@ def all_keys(self, selected_area, prefix): return keys - def delete_s3_object(self, selected_area, key): + def delete_singe_file_from_s3_bucket(self, selected_area, key): s3_resource = self.aws.common_session.resource('s3') s3_obj = s3_resource.ObjectSummary(selected_area, key) s3_obj.delete() return 
key - def delete_upload_area(self, selected_area, incl_selected_area=False): + def delete_all_files_from_s3_bucket(self, selected_area, incl_selected_area=False): s3_resource = self.aws.common_session.resource('s3') bucket = s3_resource.Bucket(selected_area) deleted_keys = [] diff --git a/ait/commons/util/command/download.py b/ait/commons/util/command/download.py index 983dbaf..f537ff8 100755 --- a/ait/commons/util/command/download.py +++ b/ait/commons/util/command/download.py @@ -30,7 +30,7 @@ def run(self): try: s3_resource = self.aws.common_session.resource('s3') - bucket = s3_resource.Bucket(self.aws.bucket_name) + bucket = s3_resource.Bucket(selected_area) # choice 1 all_files = self.args.a # optional bool diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index 31788e0..c3d0e7e 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -5,9 +5,11 @@ def print_area(k, area): print(k, end=' ') p = '' + if 'perms' in area: p = area.get('perms') or '' print(p.ljust(3), end=' ') + if 'name' in area: n = area.get('name') print(f'{n}' if n else '', end=' ') @@ -27,41 +29,36 @@ def __init__(self, aws, args): self.s3_cli = self.aws.common_session.client('s3') def run(self): - selected_area = get_selected_area() + selected_area = get_selected_area() # select area is a S3 bucket if not selected_area: return False, 'No area selected' try: - # selected_area += '' if selected_area.endswith('/') else '/' - folder_count = 0 - for area in self.list_bucket_areas(selected_area): - k = area["key"] - print_area(k, area) - folder_count += 1 - print_count(folder_count) + self.list_bucket_contents(selected_area) + # print_count(folder_count + files_count) return True, None except Exception as e: return False, format_err(e, 'list') - def list_bucket_areas(self, selected_area): - areas = [] - result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/') + def list_bucket_contents(self, selected_area, prefix=''): + 
result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/', Prefix=prefix) # Folders dirs = result.get('CommonPrefixes', []) + for d in dirs: k = d.get('Prefix') - areas.append({'key': k}) + print_area(k, {'key': k, 'perms': 'dir'}) + self.list_bucket_contents(selected_area, prefix=k) # Files files = result.get('Contents', []) + for f in files: k = f.get('Key') - areas.append({'key': k}) - - return areas + print_area(k, {'key': k, 'perms': 'file'}) def print_count(count): From 2734d037b53f43a309c295089a3a5d87381ea16a Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 15 May 2024 10:22:20 +0100 Subject: [PATCH 09/55] tsv and csv submission support --- ait/commons/util/command/delete.py | 4 ++-- ait/commons/util/command/submit.py | 37 ++++++++++++++++++++++++++---- requirements.txt | 4 +++- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/ait/commons/util/command/delete.py b/ait/commons/util/command/delete.py index 426fa61..f9aeff2 100644 --- a/ait/commons/util/command/delete.py +++ b/ait/commons/util/command/delete.py @@ -54,7 +54,7 @@ def run(self): if keys: for k in keys: try: - self.delete_singe_file_from_s3_bucket(selected_area, k) +r self.delete_single_file_from_s3_bucket(selected_area, k) print(k + ' Done.') except Exception as ex: if 'AccessDenied' in str(ex): @@ -82,7 +82,7 @@ def all_keys(self, selected_area, prefix): return keys - def delete_singe_file_from_s3_bucket(self, selected_area, key): + def delete_single_file_from_s3_bucket(self, selected_area, key): s3_resource = self.aws.common_session.resource('s3') s3_obj = s3_resource.ObjectSummary(selected_area, key) s3_obj.delete() diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 87f1ffd..9f6db55 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -1,5 +1,8 @@ +import csv + import requests import json +import pandas as pd from urllib.parse import urlparse from ait.commons.util.user_profile import 
get_profile @@ -18,7 +21,6 @@ def __init__(self, args): self.args = args self.access_token = get_profile('morphic-util').access_token self.type = self.args.type - self.data = None def run(self): submission_envelope_create_url = f"{self.base_url}/submissionEnvelopes/updateSubmissions" @@ -80,21 +82,46 @@ def link_dataset_study(self, dataset_id, study_id): def post(self, url, data_type_in_hal_link): # Read content of the file if self.args.file: - with open(self.args.file, 'r') as file: - self.data = json.load(file) + data = self.transform() else: - self.data = {} + data = {} headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.access_token}' } - response = requests.post(url, headers=headers, json=self.data) + response = requests.post(url, headers=headers, json=data) response_data = response.json() url = response_data['_links'][data_type_in_hal_link]['href'] return url + def transform(self): + if self.args.file.endswith('.tsv'): + # Read TSV file and convert to JSON + json_data = [] + + with open(self.args.file, 'r', newline='') as file: + reader = csv.DictReader(file, delimiter='\t') + for row in reader: + json_data.append(row) + + # Ensure JSON data is properly formatted + json_data_formatted = {'content': json_data} + + # Assign formatted JSON data to self.data + data = json_data_formatted + elif self.args.file.endswith('.csv'): + # Read CSV file and convert to JSON + df = pd.read_csv(self.args.file) + data = {'content': df.to_dict(orient='records')} + else: + # Read JSON file + with open(self.args.file, 'r') as file: + data = json.load(file) + + return data + def put(self, url, data_type_in_hal_link): headers = { 'Content-Type': 'application/json', diff --git a/requirements.txt b/requirements.txt index 28d6373..c4c9135 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ botocore>=1.26.10 filetype==1.0.7 requests>=2.20.0, <3 urllib3<1.27, >=1.25.4 -tqdm \ No newline at end of file +tqdm~=4.64.1 +pandas~=1.1.5 
+setuptools~=59.6.0 \ No newline at end of file From db1eaa8b2d0edce23ee170dbba8f3ecc99a24e39 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 15 May 2024 10:23:26 +0100 Subject: [PATCH 10/55] typo in delete --- ait/commons/util/command/delete.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/command/delete.py b/ait/commons/util/command/delete.py index f9aeff2..4f139e3 100644 --- a/ait/commons/util/command/delete.py +++ b/ait/commons/util/command/delete.py @@ -54,7 +54,7 @@ def run(self): if keys: for k in keys: try: -r self.delete_single_file_from_s3_bucket(selected_area, k) + self.delete_single_file_from_s3_bucket(selected_area, k) print(k + ' Done.') except Exception as ex: if 'AccessDenied' in str(ex): From 41a916a07e26f558b099397759f426379d6a7b3f Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 11 Jun 2024 14:29:01 +0100 Subject: [PATCH 11/55] submission support added --- ait/commons/util/__main__.py | 2 + ait/commons/util/command/submit.py | 292 ++++++++++++++---- ait/commons/util/tests/command/test_create.py | 14 +- 3 files changed, 243 insertions(+), 65 deletions(-) diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index f4ba974..0b055d2 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -83,6 +83,8 @@ def parse_args(args): parser_config.add_argument('--type', help='data type you are submitting, e.g. 
study, dataset') parser_config.add_argument('--file', help='your metadata') parser_config.add_argument('--study', help='your study reference') + parser_config.add_argument('--dataset', help='your dataset reference') + parser_config.add_argument('--process', help='your process/analysis reference') parser_create = cmd_parser.add_parser('create', help='create an upload area (authorised users only)') parser_create.add_argument('NAME', help='name for the new area', type=valid_project_name) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 9f6db55..4fb59aa 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -1,5 +1,4 @@ import csv - import requests import json import pandas as pd @@ -9,78 +8,234 @@ def get_id_from_url(url): + """ + Extracts and returns the ID from a given URL. + + Parameters: + url (str): The URL string. + + Returns: + str: The ID extracted from the URL. + """ parsed_url = urlparse(url) path_parts = parsed_url.path.split('/') return path_parts[2] +def get_id(url): + """ + Extracts and returns the ID from a URL. + + Parameters: + url (str): The URL string. + + Returns: + str: The extracted ID or None if an error occurs. + """ + try: + id = url.split('/')[-1] + return id + except Exception as e: + print(f"Error encountered: {e}") + return None + + class CmdSubmit: - base_url = 'https://api.ingest.dev.archive.morphic.bio/' + """ + A class to handle submission of studies, datasets, and biomaterials to a server. + + Attributes: + base_url (str): The base URL for the server. + submission_envelope_create_url (str): URL for creating submission envelopes. + submission_envelope_base_url (str): Base URL for submission envelopes. + args (Namespace): Command-line arguments. + access_token (str): Access token for authorization. + type (str): Type of submission (study, dataset, or biomaterial). + + Methods: + run(): Executes the submission process based on the type. 
+ create_dataset(): Creates a dataset and returns its ID. + create_biomaterial(): Creates a biomaterial and returns its ID. + create_study(): Creates a study and returns its ID. + link_dataset_study(dataset_id, study_id): Links a dataset to a study. + link_biomaterial_dataset(biomaterial_id, dataset_id): Links a biomaterial to a dataset. + get_id(url): Extracts and returns the ID from a URL. + post(url, data_type_in_hal_link): Sends a POST request to the specified URL. + transform(): Transforms the input file to a JSON object. + put(url): Sends a PUT request to the specified URL. + """ + base_url = 'http://localhost:8080' + submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" + submission_envelope_base_url = f"{base_url}/submissionEnvelopes" def __init__(self, args): + """ + Initializes the CmdSubmit class with command-line arguments. + + Parameters: + args (Namespace): Command-line arguments. + """ self.args = args self.access_token = get_profile('morphic-util').access_token self.type = self.args.type def run(self): - submission_envelope_create_url = f"{self.base_url}/submissionEnvelopes/updateSubmissions" - - if self.type == 'study': - study_id = self.create_study(submission_envelope_create_url) - - return True, study_id - elif self.type == 'dataset': - dataset_id = self.create_dataset(submission_envelope_create_url) + """ + Executes the submission process based on the type of submission. - print("Dataset created successfully: " + dataset_id) + Returns: + tuple: A tuple containing a boolean indicating success and the ID of the created entity. + """ + if self.type in ['study', 'dataset', 'biomaterial', 'process']: + entity_id = self.create_entity(self.type) + if entity_id is not None: + if self.type == 'dataset': + if self.args.study is not None: + study_id = self.args.study + self.link_dataset_study(entity_id, study_id) + else: + link_to_study = input("Do you want to link this dataset to a study? 
(yes/no): ").lower() + if link_to_study == 'yes': + study_id = input("Input study id: ").lower() + self.link_dataset_study(entity_id, study_id) + elif self.type == 'biomaterial': + if self.args.dataset is not None: + dataset_id = self.args.dataset + self.link_biomaterial_dataset(entity_id, dataset_id) + else: + link_to_dataset = input("Do you want to link this biomaterial to a dataset? (yes/no): ").lower() + if link_to_dataset == 'yes': + dataset_id = input("Input dataset id: ").lower() + self.link_biomaterial_dataset(entity_id, dataset_id) - # Check if dataset ID is available - if dataset_id is not None: - # Check if user has passed study to link, else prompt - if self.args.study is not None: - study_id = self.args.study + # Linking biomaterial to process + if self.args.process is not None: + process_id = self.args.process + self.link_biomaterial_process(entity_id, process_id) - self.link_dataset_study(dataset_id, study_id) - # Prompt user to link dataset to study - else: - link_to_study = input("Do you want to link this dataset to a study? (yes/no): ").lower() - - if link_to_study == 'yes': - study_id = input("Input study id: ").lower() - self.link_dataset_study(dataset_id, study_id) - return True, dataset_id + return True, entity_id else: print("Unsupported type") return False, "Unsupported type" - def create_dataset(self, submission_envelope_create_url): - dataset_create_url = self.post(submission_envelope_create_url, 'datasets') - submission_envelope_id = get_id_from_url(dataset_create_url) - dataset_create_response = self.post(dataset_create_url, 'submissionEnvelopes') - link_dataset_to_submission_envelope_response = self.put( - dataset_create_response + '/' + submission_envelope_id, - 'self') - dataset_id = get_id_from_url(link_dataset_to_submission_envelope_response) + def create_entity(self, input_entity_type): + """ + Creates an entity (study, dataset, biomaterial, or process) and returns its ID. 
+ + Parameters: + input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). + + Returns: + str: The ID of the created entity. + """ + if input_entity_type == 'study': + entity = 'studies' + elif input_entity_type == 'dataset': + entity = 'datasets' + elif input_entity_type == 'biomaterial': + entity = 'biomaterials' + elif input_entity_type == 'process': + entity = 'processes' + + entity_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, + entity) + entity_self_hal_link = self.post(entity_create_url_from_sub_env_hal_links, 'self') + entity_id = get_id_from_url(entity_self_hal_link) + print(f"{input_entity_type.capitalize()} created successfully: " + entity_id) + return entity_id + + def create_dataset(self): + """ + Creates a dataset and returns its ID. + + Returns: + str: The ID of the created dataset. + """ + dataset_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'datasets') + dataset_self_hal_link = self.post(dataset_create_url_from_sub_env_hal_links, 'self') + dataset_id = get_id(dataset_self_hal_link) + print("Dataset created successfully: " + dataset_id) return dataset_id - def create_study(self, submission_envelope_create_url): - study_create_url = self.post(submission_envelope_create_url, 'studies') - submission_envelope_id = get_id_from_url(study_create_url) - study_create_response = self.post(study_create_url, 'submissionEnvelopes') - link_study_to_submission_envelope_response = self.put(study_create_response + '/' + submission_envelope_id, - 'self') - study_id = get_id_from_url(link_study_to_submission_envelope_response) + def create_process(self): + """ + Creates a process and returns its ID. + + Returns: + str: The ID of the created process. 
+ """ + process_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'processes') + process_self_hal_link = self.post(process_create_url_from_sub_env_hal_links, 'self') + process_id = get_id(process_self_hal_link) + print("Process created successfully: " + process_id) + return process_id + + def create_biomaterial(self): + """ + Creates a biomaterial and returns its ID. + + Returns: + str: The ID of the created biomaterial. + """ + biomaterial_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'biomaterials') + biomaterial_self_hal_link = self.post(biomaterial_create_url_from_sub_env_hal_links, 'self') + biomaterial_id = get_id(biomaterial_self_hal_link) + print("Biomaterial created successfully: " + biomaterial_id) + return biomaterial_id + + def create_study(self): + """ + Creates a study and returns its ID. + + Returns: + str: The ID of the created study. + """ + study_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'studies') + study_self_hal_link = self.post(study_create_url_from_sub_env_hal_links, 'self') + study_id = get_id(study_self_hal_link) print("Study created successfully: " + study_id) return study_id def link_dataset_study(self, dataset_id, study_id): + """ + Links a dataset to a study. + + Parameters: + dataset_id (str): The ID of the dataset. + study_id (str): The ID of the study. + """ print("Linking dataset " + dataset_id + " to study " + study_id) - # Perform the linking operation here - self.put(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", None) + self.put(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}") print("Dataset linked successfully to study: " + study_id) + def link_biomaterial_dataset(self, biomaterial_id, dataset_id): + """ + Links a biomaterial to a dataset. + + Parameters: + biomaterial_id (str): The ID of the biomaterial. + dataset_id (str): The ID of the dataset. 
+ """ + print("Linking biomaterial " + biomaterial_id + " to dataset " + dataset_id) + self.put(f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}") + print("Biomaterial linked successfully to dataset: " + dataset_id) + + def link_biomaterial_process(self, biomaterial_id, process_id): + print("Linking biomaterial " + biomaterial_id + " to process " + process_id) + self.post_to_link(f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses", + process_id, 'biomaterials', 'processes') + def post(self, url, data_type_in_hal_link): - # Read content of the file + """ + Sends a POST request to the specified URL. + + Parameters: + url (str): The URL to send the request to. + data_type_in_hal_link (str): The data type in the HAL link. + + Returns: + str: The URL from the response. + """ if self.args.file: data = self.transform() else: @@ -90,46 +245,67 @@ def post(self, url, data_type_in_hal_link): 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.access_token}' } + response = requests.post(url, headers=headers, json=data) response_data = response.json() url = response_data['_links'][data_type_in_hal_link]['href'] - return url + def post_to_link(self, url, input_id, link_this, link_to): + """ + Sends a POST request to the specified URL. + + Parameters: + + Returns: + """ + headers = { + 'Content-Type': 'text/uri-list', + 'Authorization': f'Bearer {self.access_token}' + } + + response = requests.post(url, headers=headers, + data=f"{self.base_url}/{link_to}/{input_id}/{link_this}") + + return response.json() + def transform(self): + """ + Transforms the input file to a JSON object. + + Returns: + dict: The JSON object. 
+ """ if self.args.file.endswith('.tsv'): - # Read TSV file and convert to JSON json_data = [] - with open(self.args.file, 'r', newline='') as file: reader = csv.DictReader(file, delimiter='\t') for row in reader: json_data.append(row) - - # Ensure JSON data is properly formatted json_data_formatted = {'content': json_data} - - # Assign formatted JSON data to self.data data = json_data_formatted elif self.args.file.endswith('.csv'): - # Read CSV file and convert to JSON df = pd.read_csv(self.args.file) data = {'content': df.to_dict(orient='records')} else: - # Read JSON file with open(self.args.file, 'r') as file: data = json.load(file) - return data - def put(self, url, data_type_in_hal_link): + def put(self, url): + """ + Sends a PUT request to the specified URL. + + Parameters: + url (str): The URL to send the request to. + + Returns: + dict: The response data. + """ headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.access_token}' } response = requests.put(url, headers=headers) response_data = response.json() - - if data_type_in_hal_link is not None: - url = response_data['_links'][data_type_in_hal_link]['href'] - return url + return response_data diff --git a/ait/commons/util/tests/command/test_create.py b/ait/commons/util/tests/command/test_create.py index b389528..3a47a2a 100644 --- a/ait/commons/util/tests/command/test_create.py +++ b/ait/commons/util/tests/command/test_create.py @@ -3,7 +3,7 @@ from ait.commons.util.__main__ import parse_args from ait.commons.util.cmd import Cmd -from ait.commons.util.command.create import CmdCreate +from ait.commons.util.command.create import run class TestCreate(unittest.TestCase): @@ -32,7 +32,7 @@ def test_create_upload_area_no_config_display_error(self): args = ['create', 'testUploadArea'] # when - success, msg = CmdCreate(None, parse_args(args)).run() + success, msg = run() # then self.assertFalse(success) @@ -56,7 +56,7 @@ def test_user_create_upload_area_has_valid_config(self): args = 
['create', 'testUploadArea'] # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + success, msg = run() # then self.assertFalse(success) @@ -70,7 +70,7 @@ def test_admin_create_upload_area(self, uuid): args = ['create', 'testUploadArea'] # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + success, msg = run() # then self.assertTrue(success) @@ -88,7 +88,7 @@ def test_admin_create_upload_area_with_permissions(self, uuid): args = ['create', upload_area_name, '-p', permission] # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + success, msg = run() # then self.assertTrue(success) @@ -103,7 +103,7 @@ def test_admin_create_upload_area_no_name(self): # when with self.assertRaises(SystemExit) as error: parsed_args = parse_args(args) - success, msg = CmdCreate(self.aws_mock, parsed_args).run() + success, msg = run() self.assertFalse(parsed_args) self.assertFalse(success) self.assertFalse(msg) @@ -133,7 +133,7 @@ def test_admin_create_upload_area_has_exception(self, uuid): self.client.put_object.side_effect = Mock(side_effect=Exception('Test')) # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + success, msg = run() # then self.assertFalse(success) From 4a22274a1976dc8da227531547fb8fc06406de5e Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 19 Jun 2024 15:40:34 +0100 Subject: [PATCH 12/55] metadata spreadsheet submission support added --- ait/commons/util/__main__.py | 3 + ait/commons/util/cmd.py | 5 + ait/commons/util/command/submit.py | 311 ++++++---- ait/commons/util/command/submit_file.py | 112 ++++ ait/commons/util/settings/morphic_util.py | 2 +- ait/commons/util/util/__init__.py | 0 ait/commons/util/util/spreadsheet_util.py | 667 ++++++++++++++++++++++ requirements.txt | 3 +- 8 files changed, 986 insertions(+), 117 deletions(-) create mode 100644 ait/commons/util/command/submit_file.py create mode 100644 ait/commons/util/util/__init__.py create mode 100644 
ait/commons/util/util/spreadsheet_util.py diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index 0b055d2..5e8a547 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -86,6 +86,9 @@ def parse_args(args): parser_config.add_argument('--dataset', help='your dataset reference') parser_config.add_argument('--process', help='your process/analysis reference') + parser_config = cmd_parser.add_parser('submit-file', help='submit your file containing your consolidated metadata') + parser_config.add_argument('--file', help='your metadata') + parser_create = cmd_parser.add_parser('create', help='create an upload area (authorised users only)') parser_create.add_argument('NAME', help='name for the new area', type=valid_project_name) parser_create.add_argument('DPC', help='center name of the submitter', type=valid_project_name) diff --git a/ait/commons/util/cmd.py b/ait/commons/util/cmd.py index abebddf..2df7fbc 100644 --- a/ait/commons/util/cmd.py +++ b/ait/commons/util/cmd.py @@ -11,6 +11,7 @@ from ait.commons.util.command.list import CmdList from ait.commons.util.command.select import CmdSelect from ait.commons.util.command.submit import CmdSubmit +from ait.commons.util.command.submit_file import CmdSubmitFile from ait.commons.util.command.sync import CmdSync from ait.commons.util.command.upload import CmdUpload from ait.commons.util.local_state import get_bucket, set_attr, get_attr @@ -42,6 +43,10 @@ def __init__(self, args): success, msg = CmdSubmit(args).run() print(msg) + elif args.command == 'submit-file': + CmdSubmitFile(args).run() + # print(msg) + else: if profile_exists(args.profile): self.user_profile = get_profile(args.profile) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 4fb59aa..453001d 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -3,7 +3,6 @@ import json import pandas as pd from urllib.parse import urlparse - from 
ait.commons.util.user_profile import get_profile @@ -22,24 +21,6 @@ def get_id_from_url(url): return path_parts[2] -def get_id(url): - """ - Extracts and returns the ID from a URL. - - Parameters: - url (str): The URL string. - - Returns: - str: The extracted ID or None if an error occurs. - """ - try: - id = url.split('/')[-1] - return id - except Exception as e: - print(f"Error encountered: {e}") - return None - - class CmdSubmit: """ A class to handle submission of studies, datasets, and biomaterials to a server. @@ -54,15 +35,18 @@ class CmdSubmit: Methods: run(): Executes the submission process based on the type. - create_dataset(): Creates a dataset and returns its ID. - create_biomaterial(): Creates a biomaterial and returns its ID. - create_study(): Creates a study and returns its ID. - link_dataset_study(dataset_id, study_id): Links a dataset to a study. - link_biomaterial_dataset(biomaterial_id, dataset_id): Links a biomaterial to a dataset. - get_id(url): Extracts and returns the ID from a URL. - post(url, data_type_in_hal_link): Sends a POST request to the specified URL. - transform(): Transforms the input file to a JSON object. - put(url): Sends a PUT request to the specified URL. + multi_type_submission(cell_lines, submission_envelope_id, access_token): Submits multiple cell lines. + typed_submission(type, file, access_token): Submits a single entity based on its type. + create_new_envelope_and_submit_entity(input_entity_type, data, access_token): Creates and submits a new entity. + use_existing_envelope_and_submit_entity(input_entity_type, data, submission_envelope_id, access_token): Submits an entity using an existing envelope. + link_dataset_to_study(dataset_id, study_id, access_token): Links a dataset to a study. + link_biomaterial_to_dataset(biomaterial_id, dataset_id, access_token): Links a biomaterial to a dataset. + link_biomaterial_to_process(biomaterial_id, process_id, access_token): Links a biomaterial to a process. 
+ post_to_provider_api(url, data_type_in_hal_link, data, access_token): Sends a POST request to the provider API. + create_new_submission_envelope(url, access_token): Creates a new submission envelope. + perform_hal_linkage(url, input_id, link_this, link_to, access_token): Performs HAL linkage. + transform(file): Transforms the input file to a JSON object. + put_to_provider_api(url, access_token): Sends a PUT request to the provider API. """ base_url = 'http://localhost:8080' submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" @@ -77,7 +61,8 @@ def __init__(self, args): """ self.args = args self.access_token = get_profile('morphic-util').access_token - self.type = self.args.type + self.type = getattr(self.args, 'type', None) + self.file = getattr(self.args, 'file', None) def run(self): """ @@ -86,182 +71,272 @@ def run(self): Returns: tuple: A tuple containing a boolean indicating success and the ID of the created entity. """ - if self.type in ['study', 'dataset', 'biomaterial', 'process']: - entity_id = self.create_entity(self.type) + return self.typed_submission(self.type, self.file, self.access_token) + + def multi_type_submission(self, cell_lines, submission_envelope_id, access_token): + """ + Submits multiple cell lines. + + Parameters: + cell_lines (list): List of cell line objects to be submitted. + submission_envelope_id (str): ID of the submission envelope. + access_token (str): Access token for authorization. 
+ """ + for cell_line in cell_lines: + print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") + + # Create cell line biomaterial + cell_line_entity_id = self.use_existing_envelope_and_submit_entity('biomaterial', + cell_line.to_dict(), + submission_envelope_id, + access_token) + + if len(cell_line.differentiated_cell_lines) > 0: + # Create differentiation process + print("Cell line has differentiated cell lines, creating process to link them") + + differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity('process', + cell_line.to_dict(), + submission_envelope_id, + access_token) + + # Create a dictionary to store biomaterial_id to entity_id mappings for differentiated cell lines + differentiated_biomaterial_to_entity_id_map = {} + + for differentiated_cell_line in cell_line.differentiated_cell_lines: + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id}") + + # Create differentiated cell line biomaterial + differentiated_entity_id = self.use_existing_envelope_and_submit_entity('biomaterial', + differentiated_cell_line.to_dict(), + submission_envelope_id, + access_token) + + # Update the mapping dictionary + differentiated_biomaterial_to_entity_id_map[ + differentiated_cell_line.biomaterial_id] = differentiated_entity_id + + def typed_submission(self, type, file, access_token): + """ + Submits a single entity based on its type. + + Parameters: + type (str): The type of entity to be submitted ('study', 'dataset', 'biomaterial', 'process'). + file (str): The file containing the data to be submitted. + access_token (str): Access token for authorization. + + Returns: + tuple: A tuple containing a boolean indicating success and the ID of the created entity. 
+ """ + if type in ['study', 'dataset', 'biomaterial', 'process']: + if file is not None: + data = self.transform(file) + else: + data = {} + + entity_id = self.create_new_envelope_and_submit_entity(type, data, access_token) + if entity_id is not None: - if self.type == 'dataset': + if type == 'dataset': if self.args.study is not None: study_id = self.args.study - self.link_dataset_study(entity_id, study_id) + self.link_dataset_to_study(entity_id, study_id, access_token) else: link_to_study = input("Do you want to link this dataset to a study? (yes/no): ").lower() if link_to_study == 'yes': study_id = input("Input study id: ").lower() - self.link_dataset_study(entity_id, study_id) - elif self.type == 'biomaterial': + self.link_dataset_to_study(entity_id, study_id, access_token) + + elif type == 'biomaterial': if self.args.dataset is not None: dataset_id = self.args.dataset - self.link_biomaterial_dataset(entity_id, dataset_id) + self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) else: link_to_dataset = input("Do you want to link this biomaterial to a dataset? (yes/no): ").lower() if link_to_dataset == 'yes': dataset_id = input("Input dataset id: ").lower() - self.link_biomaterial_dataset(entity_id, dataset_id) + self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) # Linking biomaterial to process if self.args.process is not None: process_id = self.args.process - self.link_biomaterial_process(entity_id, process_id) + self.link_biomaterial_to_process(entity_id, process_id, access_token) return True, entity_id else: print("Unsupported type") return False, "Unsupported type" - def create_entity(self, input_entity_type): + def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_token): """ - Creates an entity (study, dataset, biomaterial, or process) and returns its ID. + Creates and submits a new entity (study, dataset, biomaterial, or process) and returns its ID. 
Parameters: input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). + data (dict): The data to be submitted. + access_token (str): Access token for authorization. Returns: str: The ID of the created entity. """ if input_entity_type == 'study': - entity = 'studies' + halEntity = 'studies' elif input_entity_type == 'dataset': - entity = 'datasets' + halEntity = 'datasets' elif input_entity_type == 'biomaterial': - entity = 'biomaterials' + halEntity = 'biomaterials' elif input_entity_type == 'process': - entity = 'processes' + halEntity = 'processes' - entity_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, - entity) - entity_self_hal_link = self.post(entity_create_url_from_sub_env_hal_links, 'self') + entity_create_url_from_sub_env_hal_links = self.post_to_provider_api(self.submission_envelope_create_url, + halEntity, None, access_token) + entity_self_hal_link = self.post_to_provider_api(entity_create_url_from_sub_env_hal_links, + 'self', data, access_token) entity_id = get_id_from_url(entity_self_hal_link) + print(f"{input_entity_type.capitalize()} created successfully: " + entity_id) - return entity_id - def create_dataset(self): - """ - Creates a dataset and returns its ID. + return entity_id - Returns: - str: The ID of the created dataset. + def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submission_envelope_id, access_token): """ - dataset_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'datasets') - dataset_self_hal_link = self.post(dataset_create_url_from_sub_env_hal_links, 'self') - dataset_id = get_id(dataset_self_hal_link) - print("Dataset created successfully: " + dataset_id) - return dataset_id + Submits an entity using an existing submission envelope and returns its ID. - def create_process(self): - """ - Creates a process and returns its ID. 
+ Parameters: + input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). + data (dict): The data to be submitted. + submission_envelope_id (str): ID of the submission envelope. + access_token (str): Access token for authorization. Returns: - str: The ID of the created process. - """ - process_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'processes') - process_self_hal_link = self.post(process_create_url_from_sub_env_hal_links, 'self') - process_id = get_id(process_self_hal_link) - print("Process created successfully: " + process_id) - return process_id - - def create_biomaterial(self): + str: The ID of the created entity. """ - Creates a biomaterial and returns its ID. + if input_entity_type == 'study': + halEntity = 'studies' + elif input_entity_type == 'dataset': + halEntity = 'datasets' + elif input_entity_type == 'biomaterial': + halEntity = 'biomaterials' + elif input_entity_type == 'process': + halEntity = 'processes' - Returns: - str: The ID of the created biomaterial. - """ - biomaterial_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'biomaterials') - biomaterial_self_hal_link = self.post(biomaterial_create_url_from_sub_env_hal_links, 'self') - biomaterial_id = get_id(biomaterial_self_hal_link) - print("Biomaterial created successfully: " + biomaterial_id) - return biomaterial_id + entity_create_url_from_sub_env_hal_links = (self.submission_envelope_base_url + + "/" + submission_envelope_id + + "/" + halEntity) + entity_self_hal_link = self.post_to_provider_api(entity_create_url_from_sub_env_hal_links, + 'self', data, access_token) + entity_id = get_id_from_url(entity_self_hal_link) - def create_study(self): - """ - Creates a study and returns its ID. + print(f"{input_entity_type.capitalize()} created successfully: " + entity_id) - Returns: - str: The ID of the created study. 
- """ - study_create_url_from_sub_env_hal_links = self.post(self.submission_envelope_create_url, 'studies') - study_self_hal_link = self.post(study_create_url_from_sub_env_hal_links, 'self') - study_id = get_id(study_self_hal_link) - print("Study created successfully: " + study_id) - return study_id + return entity_id - def link_dataset_study(self, dataset_id, study_id): + def link_dataset_to_study(self, dataset_id, study_id, access_token): """ Links a dataset to a study. Parameters: dataset_id (str): The ID of the dataset. study_id (str): The ID of the study. + access_token (str): Access token for authorization. """ print("Linking dataset " + dataset_id + " to study " + study_id) - self.put(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}") + + self.put_to_provider_api(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", access_token) + print("Dataset linked successfully to study: " + study_id) - def link_biomaterial_dataset(self, biomaterial_id, dataset_id): + def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): """ Links a biomaterial to a dataset. Parameters: biomaterial_id (str): The ID of the biomaterial. dataset_id (str): The ID of the dataset. + access_token (str): Access token for authorization. """ print("Linking biomaterial " + biomaterial_id + " to dataset " + dataset_id) - self.put(f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}") - print("Biomaterial linked successfully to dataset: " + dataset_id) - def link_biomaterial_process(self, biomaterial_id, process_id): + self.put_to_provider_api(f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}", access_token) + + print("Biosmaterial linked successfully to dataset: " + dataset_id) + + def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): + """ + Links a biomaterial to a process. + + Parameters: + biomaterial_id (str): The ID of the biomaterial. + process_id (str): The ID of the process. 
+ access_token (str): Access token for authorization. + """ print("Linking biomaterial " + biomaterial_id + " to process " + process_id) - self.post_to_link(f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses", - process_id, 'biomaterials', 'processes') - def post(self, url, data_type_in_hal_link): + self.perform_hal_linkage(f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses", + process_id, 'biomaterials', 'processes', access_token) + + def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): """ Sends a POST request to the specified URL. Parameters: url (str): The URL to send the request to. data_type_in_hal_link (str): The data type in the HAL link. + data (dict): The data to be sent in the POST request. + access_token (str): Access token for authorization. Returns: str: The URL from the response. """ - if self.args.file: - data = self.transform() - else: - data = {} - headers = { 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.access_token}' + 'Authorization': f'Bearer {access_token}' } response = requests.post(url, headers=headers, json=data) response_data = response.json() url = response_data['_links'][data_type_in_hal_link]['href'] + return url - def post_to_link(self, url, input_id, link_this, link_to): + def create_new_submission_envelope(self, url, access_token): """ - Sends a POST request to the specified URL. + Creates a new submission envelope. Parameters: + url (str): The URL to send the request to. + access_token (str): Access token for authorization. Returns: + dict: The response data. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json={}) + response_data = response.json() + + return response_data + + def perform_hal_linkage(self, url, input_id, link_this, link_to, access_token): + """ + Performs HAL linkage. + + Parameters: + url (str): The URL to send the request to. 
+ input_id (str): The ID of the input entity. + link_this (str): The entity to link. + link_to (str): The entity to link to. + access_token (str): Access token for authorization. + + Returns: + dict: The response data. """ headers = { 'Content-Type': 'text/uri-list', - 'Authorization': f'Bearer {self.access_token}' + 'Authorization': f'Bearer {access_token}' } response = requests.post(url, headers=headers, @@ -269,43 +344,49 @@ def post_to_link(self, url, input_id, link_this, link_to): return response.json() - def transform(self): + def transform(self, file): """ Transforms the input file to a JSON object. + Parameters: + file (str): The file path. + Returns: dict: The JSON object. """ if self.args.file.endswith('.tsv'): json_data = [] - with open(self.args.file, 'r', newline='') as file: + with open(file, 'r', newline='') as file: reader = csv.DictReader(file, delimiter='\t') for row in reader: json_data.append(row) json_data_formatted = {'content': json_data} data = json_data_formatted - elif self.args.file.endswith('.csv'): - df = pd.read_csv(self.args.file) + elif file.endswith('.csv'): + df = pd.read_csv(file) data = {'content': df.to_dict(orient='records')} else: - with open(self.args.file, 'r') as file: + with open(file, 'r') as file: data = json.load(file) return data - def put(self, url): + def put_to_provider_api(self, url, access_token): """ Sends a PUT request to the specified URL. Parameters: url (str): The URL to send the request to. + access_token (str): Access token for authorization. Returns: dict: The response data. 
""" headers = { 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.access_token}' + 'Authorization': f'Bearer {access_token}' } + response = requests.put(url, headers=headers) response_data = response.json() + return response_data diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py new file mode 100644 index 0000000..2f7a2a0 --- /dev/null +++ b/ait/commons/util/command/submit_file.py @@ -0,0 +1,112 @@ +# Import necessary modules/classes from ait.commons.util package +from ait.commons.util.command.submit import CmdSubmit, get_id_from_url +from ait.commons.util.user_profile import get_profile +from ait.commons.util.util.spreadsheet_util import SpreadsheetSubmitter + + +# Define a class for handling submission of a command file +class CmdSubmitFile: + # Column mappings for parsing different sections of the spreadsheet + cellline_column_mapping = { + "CELL LINE ID (Required)": "cell_line.biomaterial_core.biomaterial_id", + "CELL LINE DESCRIPTION": "cell_line.biomaterial_core.biomaterial_description", + "DERIVED FROM CELL LINE NAME (Required)": "cell_line.derived_cell_line_accession", + "CLONE ID": "cell_line.clone_id", + "GENE EXPRESSION ALTERATION PROTOCOL ID": "gene_expression_alteration_protocol.protocol_core.protocol_id", + "ZYGOSITY": "cell_line.zygosity", + "CELL LINE TYPE (Required)": "cell_line.type", + "Unnamed: 7": None, + "Unnamed: 8": None + } + + differentiated_cellline_column_mapping = { + "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", + "DIFFERENTIATED CELL LINE DESCRIPTION": "differentiated_cell_line.biomaterial_core.biomaterial_description", + "INPUT CELL LINE ID (Required)": "cell_line.biomaterial_core.biomaterial_id", + "DIFFERENTIATION PROTOCOL ID (Required)": "differentiation_protocol.protocol_core.protocol_id", + "TIMEPOINT VALUE": "differentiated_cell_line.timepoint_value", + "TIMEPOINT UNIT": 
"differentiated_cell_line.timepoint_unit.text", + "TERMINALLY DIFFERENTIATED": "differentiated_cell_line.terminally_differentiated", + "FINAL LINEAGE STAGE": "differentiated_cell_line.terminally_differentiated", + "Model System": "cell_line.model_organ.text", + "MODEL SYSTEM": "cell_line.model_organ.text", + "Unnamed: 8": None + } + + library_preparation_column_mapping = { + "LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", + "LIBRARY PREPARATION PROTOCOL ID (Required)": "library_preparation_protocol.protocol_core.protocol_id", + "DISSOCIATION PROTOCOL ID (Required)": "dissociation_protocol.protocol_core.protocol_id", + "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", + "LIBRARY AVERAGE FRAGMENT SIZE": "library_preparation.average_fragment_size", + "LIBRARY INPUT AMOUNT VALUE": "library_preparation.input_amount_value", + "LIBRARY INPUT AMOUNT UNIT": "library_preparation.input_amount_unit", + "LIBRARY FINAL YIELD VALUE": "library_preparation.final_yield_value", + "LIBRARY FINAL YIELD UNIT": "library_preparation.final_yield_unit", + "LIBRARY CONCENTRATION VALUE": "library_preparation.concentration_value", + "LIBRARY CONCENTRATION UNIT": "library_preparation.concentration_unit", + "LIBRARY PCR CYCLES": "library_preparation.pcr_cycles", + "LIBRARY PCR CYCLES FOR SAMPLE INDEX": "library_preparation.pcr_cycles_for_sample_index", + "Unnamed: 14": None # Adjust index based on your actual column count + } + + sequencing_file_column_mapping = { + "FILE NAME (Required)": "sequence_file.file_core.file_name", + "INPUT LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", + "SEQUENCING PROTOCOL ID (Required)": "sequencing_protocol.protocol_core.protocol_id", + "READ INDEX (Required)": "sequence_file.read_index", + "RUN ID": "sequence_file.run_id", + "Unnamed: 5": None # Adjust index based on your actual column count + } + + base_url = 
'http://localhost:8080' + submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" + submission_envelope_base_url = f"{base_url}/submissionEnvelopes" + + def __init__(self, args): + """ + Initialize CmdSubmitFile instance. + + Args: + args: Command-line arguments passed to the script. + """ + self.args = args + self.access_token = get_profile('morphic-util').access_token + + if hasattr(self.args, 'file') and self.args.file is not None: + self.file = self.args.file + else: + self.file = None + + def run(self): + """ + Execute the command file submission process. + """ + submission_instance = CmdSubmit(self) + + if self.file: + # Initialize SpreadsheetParser with the provided file path + parser = SpreadsheetSubmitter(self.file) + + # Parse different sections of the spreadsheet using defined column mappings + cell_lines = parser.get_cell_lines('Cell line ', self.cellline_column_mapping) + differentiated_cell_lines = parser.get_differentiated_cell_lines('Differentiated cell line', + self.differentiated_cellline_column_mapping) + parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) + library_preparations = parser.get_library_preparations('Library preparation', + self.library_preparation_column_mapping) + parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, + library_preparations) + sequencing_files = parser.get_sequencing_files('Sequence file', self.sequencing_file_column_mapping) + parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) + + # Print each CellLine object in CellLineMaster + submission_envelope_response = submission_instance.create_new_submission_envelope( + self.submission_envelope_create_url, + access_token=self.access_token) + self_url = submission_envelope_response['_links']['self']['href'] + submission_envelope_id = get_id_from_url(self_url) + + print("Submission envelope for this submission is: " + 
submission_envelope_id) + + submission_instance.multi_type_submission(cell_lines, submission_envelope_id, self.access_token) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 47b6a2c..07a226a 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -36,7 +36,7 @@ COGNITO_CLIENT_ID = '178j951qnfuheicm2m5rqqvg6q' COGNITO_IDENTITY_POOL_ID = 'eu-west-2:d6531e9c-020d-4ee8-bf3b-255393c500e9' COGNITO_USER_POOL_ID = 'eu-west-2_b4EyaLNCM' -IAM_USER = 'morphic-admin' +IAM_USER = 'morphic-dev-admin' AWS_SECRET_NAME_AK_BUCKET = 'AK-bucket' AWS_SECRET_NAME_SK_BUCKET = 'SK-bucket' diff --git a/ait/commons/util/util/__init__.py b/ait/commons/util/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py new file mode 100644 index 0000000..13f1a3c --- /dev/null +++ b/ait/commons/util/util/spreadsheet_util.py @@ -0,0 +1,667 @@ +import pandas as pd +import json + + +class MissingMandatoryFieldError(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + +class MissingEntityError(Exception): + """Custom exception raised when an expected entity is missing.""" + + def __init__(self, missing_type, entity_type, missing_id): + super().__init__(f"Missing {missing_type} for {entity_type} and Id is {missing_id}") + self.entity_type = entity_type + self.missing_type = missing_type + self.missing_id = missing_id + + +class CellLine: + def __init__(self, biomaterial_id, description, derived_accession, clone_id, protocol_id, zygosity, cell_type): + self.biomaterial_id = biomaterial_id + self.description = description + self.derived_accession = derived_accession + self.clone_id = clone_id + self.protocol_id = protocol_id + self.zygosity = zygosity + self.cell_type = cell_type + self.differentiated_cell_lines = [] + + def 
add_differentiated_cell_line(self, differentiated_cell_line): + self.differentiated_cell_lines.append(differentiated_cell_line) + + def __repr__(self): + return json.dumps(self.to_dict(), indent=2) + + def to_dict(self): + return { + "content": { + "biomaterial_id": self.biomaterial_id, + "description": self.description, + "derived_accession": self.derived_accession, + "clone_id": self.clone_id, + "protocol_id": self.protocol_id, + "zygosity": self.zygosity, + "cell_type": self.cell_type + } + } + + +class DifferentiatedCellLine: + def __init__(self, biomaterial_id, description, input_biomaterial_id, protocol_id, timepoint_value, timepoint_unit, + terminally_differentiated, model_system): + self.biomaterial_id = biomaterial_id + self.description = description + self.input_biomaterial_id = input_biomaterial_id + self.protocol_id = protocol_id + self.timepoint_value = timepoint_value + self.timepoint_unit = timepoint_unit + self.terminally_differentiated = terminally_differentiated + self.model_system = model_system + self.library_preparations = [] + + def add_library_preparation(self, library_preparation): + self.library_preparations.append(library_preparation) + + def __repr__(self): + return json.dumps(self.to_dict(), indent=2) + + def to_dict(self): + return { + "content": { + "biomaterial_id": self.biomaterial_id, + "description": self.description, + "input_biomaterial_id": self.input_biomaterial_id, + "protocol_id": self.protocol_id, + "timepoint_value": self.timepoint_value, + "timepoint_unit": self.timepoint_unit, + "terminally_differentiated": self.terminally_differentiated, + "model_system": self.model_system + } + } + + +class LibraryPreparation: + def __init__(self, biomaterial_id, protocol_id, dissociation_protocol_id, differentiated_biomaterial_id, + average_fragment_size, input_amount_value, input_amount_unit, + final_yield_value, final_yield_unit, concentration_value, concentration_unit, + pcr_cycles, pcr_cycles_for_sample_index): + 
self.biomaterial_id = biomaterial_id + self.protocol_id = protocol_id + self.dissociation_protocol_id = dissociation_protocol_id + self.differentiated_biomaterial_id = differentiated_biomaterial_id + self.average_fragment_size = average_fragment_size + self.input_amount_value = input_amount_value + self.input_amount_unit = input_amount_unit + self.final_yield_value = final_yield_value + self.final_yield_unit = final_yield_unit + self.concentration_value = concentration_value + self.concentration_unit = concentration_unit + self.pcr_cycles = pcr_cycles + self.pcr_cycles_for_sample_index = pcr_cycles_for_sample_index + self.sequencing_files = [] + + def add_sequencing_file(self, sequencing_file): + self.sequencing_files.append(sequencing_file) + + def __repr__(self): + return json.dumps(self.to_dict(), indent=2) + + def to_dict(self): + return { + "content": { + "biomaterial_id": self.biomaterial_id, + "protocol_id": self.protocol_id, + "dissociation_protocol_id": self.dissociation_protocol_id, + "differentiated_biomaterial_id": self.differentiated_biomaterial_id, + "average_fragment_size": self.average_fragment_size, + "input_amount_value": self.input_amount_value, + "input_amount_unit": self.input_amount_unit, + "final_yield_value": self.final_yield_value, + "final_yield_unit": self.final_yield_unit, + "concentration_value": self.concentration_value, + "concentration_unit": self.concentration_unit, + "pcr_cycles": self.pcr_cycles, + "pcr_cycles_for_sample_index": self.pcr_cycles_for_sample_index + } + } + + +class SequencingFile: + def __init__(self, file_name, library_preparation_id, sequencing_protocol_id, read_index, run_id): + self.file_name = file_name + self.library_preparation_id = library_preparation_id + self.sequencing_protocol_id = sequencing_protocol_id + self.read_index = read_index + self.run_id = run_id + + def __repr__(self): + return json.dumps(self.to_dict(), indent=2) + + def to_dict(self): + return { + "content": { + "file_name": self.file_name, 
+ "library_preparation_id": self.library_preparation_id, + "sequencing_protocol_id": self.sequencing_protocol_id, + "read_index": self.read_index, + "run_id": self.run_id + } + } + + +class SpreadsheetSubmitter: + """ + A class for parsing and processing data from an Excel spreadsheet containing information about + cell lines, differentiated cell lines, library preparations, and sequencing files. + + Attributes: + ---------- + file_path : str + The file path to the Excel spreadsheet. + + Methods: + ------- + list_sheets() + Retrieves the names of all sheets present in the Excel file. + + parse_cell_lines(sheet_name, column_mapping) + Parses data related to cell lines from a specified sheet in the Excel file. + + parse_differentiated_cell_lines(sheet_name, column_mapping) + Parses data related to differentiated cell lines from a specified sheet in the Excel file. + + parse_library_preparations(sheet_name, column_mapping) + Parses data related to library preparations from a specified sheet in the Excel file. + + parse_sequencing_files(sheet_name, column_mapping) + Parses data related to sequencing files from a specified sheet in the Excel file. + + get_cell_lines(sheet_name, column_mapping) + Retrieves parsed cell lines data from a specified sheet in the Excel file. + + get_differentiated_cell_lines(sheet_name, column_mapping) + Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. + + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) + Merges cell lines and differentiated cell lines based on their biomaterial IDs. + + merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) + Merges differentiated cell lines and library preparations based on their biomaterial IDs. + + merge_library_preparation_sequencing_file(library_preparations, sequencing_files) + Merges library preparations and sequencing files based on their IDs. 
+ + get_library_preparations(sheet_name, column_mapping) + Retrieves parsed library preparations data from a specified sheet in the Excel file. + + get_sequencing_files(sheet_name, column_mapping) + Retrieves parsed sequencing files data from a specified sheet in the Excel file. + """ + + def __init__(self, file_path): + """ + Initializes a SpreadsheetSubmitter instance with the given file path. + + Parameters: + ----------- + file_path : str + The file path to the Excel spreadsheet. + """ + self.file_path = file_path + + def list_sheets(self): + """ + Retrieves the names of all sheets present in the Excel file. + + Returns: + -------- + list + A list of sheet names present in the Excel file. + """ + xls = pd.ExcelFile(self.file_path, engine='openpyxl') + return xls.sheet_names + + def parse_cell_lines(self, sheet_name, column_mapping): + """ + Parses data related to cell lines from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of CellLine objects parsed from the specified sheet. 
+ """ + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) + + # Check if the required column exists + if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: + raise KeyError("The column 'cell_line.biomaterial_core.biomaterial_id' does not exist.") + + # Filter rows where biomaterial_id is not null + df = df[df['cell_line.biomaterial_core.biomaterial_id'].notna()] + + # Filter column_mapping to include only keys that exist in df.columns + columns_to_select = [col_mapping_val for col_mapping_key, col_mapping_val in column_mapping.items() if + col_mapping_val in df.columns] + + if not columns_to_select: + raise ValueError("No valid columns found in the column_mapping that exist in the DataFrame.") + + # Select only columns that are present in df + df = df[columns_to_select] + + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['cell_line.biomaterial_core.biomaterial_id'] + + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) + + # Apply the mask to filter out rows + df_filtered = df[mask] + + # Check for mandatory fields and create CellLine objects + cell_lines = [] + for _, row in df_filtered.iterrows(): + biomaterial_id = row['cell_line.biomaterial_core.biomaterial_id'] + derived_accession = row.get('cell_line.derived_cell_line_accession') + cell_type = row.get('cell_line.type') + + # Check if biomaterial_id is null + if pd.isnull(biomaterial_id): + raise MissingMandatoryFieldError("Biomaterial ID cannot be null.") + + # Check if derived_accession and cell_type are present + if pd.isnull(derived_accession) or pd.isnull(cell_type): + raise MissingMandatoryFieldError( + 
"Mandatory fields (derived_accession, cell_type) are required. " + biomaterial_id) + + cell_lines.append( + CellLine( + biomaterial_id=biomaterial_id, + description=row.get('cell_line.biomaterial_core.biomaterial_description'), + derived_accession=derived_accession, + clone_id=row.get('cell_line.clone_id'), + protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), + zygosity=row.get('cell_line.zygosity'), + cell_type=cell_type + ) + ) + + return cell_lines + + def parse_differentiated_cell_lines(self, sheet_name, column_mapping): + """ + Parses data related to differentiated cell lines from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. + """ + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) + + # Check if the required column exists + if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: + raise KeyError("The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not exist.") + + # Filter rows where biomaterial_id is not null + df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] + + # Filter column_mapping to include only keys that exist in df.columns + columns_to_select = [col_mapping_val for col_mapping_key, col_mapping_val in column_mapping.items() if + col_mapping_val in df.columns] + + if not columns_to_select: + raise ValueError("No valid columns found in the column_mapping that exist in the DataFrame.") + + # Select only columns that are present in df + df = df[columns_to_select] + + # Define columns to check for values starting with 'ABC' or 
'XYZ' + cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] + + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'differentiated_cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) + + # Apply the mask to filter out rows + df_filtered = df[mask] + + # Create DifferentiatedCellLine objects from filtered DataFrame rows + differentiated_cell_lines = [ + DifferentiatedCellLine( + biomaterial_id=row['differentiated_cell_line.biomaterial_core.biomaterial_id'], + description=row.get('differentiated_cell_line.biomaterial_core.biomaterial_description'), + input_biomaterial_id=row.get('cell_line.biomaterial_core.biomaterial_id'), + protocol_id=row.get('differentiation_protocol.protocol_core.protocol_id'), + timepoint_value=row.get('differentiated_cell_line.timepoint_value'), + timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), + terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), + model_system=row.get('differentiated_cell_line.model_organ.text') + ) + for _, row in df_filtered.iterrows() + ] + + return differentiated_cell_lines + + def parse_library_preparations(self, sheet_name, column_mapping): + """ + Parses data related to library preparations from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing library preparation data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of LibraryPreparation objects parsed from the specified sheet. 
+ """ + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) + + # Check if the required column exists + if 'library_preparation.biomaterial_core.biomaterial_id' not in df.columns: + raise KeyError("The column 'library_preparation.biomaterial_core.biomaterial_id' " + "does not exist.") + + # Filter rows where biomaterial_id is not null + df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] + + # Filter column_mapping to include only keys that exist in df.columns + columns_to_select = [col_mapping_val for col_mapping_key, col_mapping_val in column_mapping.items() if + col_mapping_val in df.columns] + + if not columns_to_select: + raise ValueError("No valid columns found in the column_mapping that exist in the DataFrame.") + + # Select only columns that are present in df + df = df[columns_to_select] + + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['library_preparation.biomaterial_core.biomaterial_id'] + + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'library_preparation.biomaterial_core.biomaterial_id'))).all(axis=1) + + # Apply the mask to filter out rows + df_filtered = df[mask] + + # Create LibraryPreparation objects from filtered DataFrame rows + library_preparations = [ + LibraryPreparation( + biomaterial_id=row['library_preparation.biomaterial_core.biomaterial_id'], + protocol_id=row.get('library_preparation_protocol.protocol_core.protocol_id'), + dissociation_protocol_id=row.get('dissociation_protocol.protocol_core.protocol_id'), + differentiated_biomaterial_id=row.get('differentiated_cell_line.biomaterial_core.biomaterial_id'), + 
average_fragment_size=row.get('library_preparation.average_fragment_size'), + input_amount_value=row.get('library_preparation.input_amount_value'), + input_amount_unit=row.get('library_preparation.input_amount_unit'), + final_yield_value=row.get('library_preparation.final_yield_value'), + final_yield_unit=row.get('library_preparation.final_yield_unit'), + concentration_value=row.get('library_preparation.concentration_value'), + concentration_unit=row.get('library_preparation.concentration_unit'), + pcr_cycles=row.get('library_preparation.pcr_cycles'), + pcr_cycles_for_sample_index=row.get('library_preparation.pcr_cycles_for_sample_index') + ) + for _, row in df_filtered.iterrows() + ] + + return library_preparations + + def parse_sequencing_files(self, sheet_name, column_mapping): + """ + Parses data related to sequencing files from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing sequencing file data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of SequencingFile objects parsed from the specified sheet. 
+ """ + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) + + # Check if the required column exists + if 'sequence_file.file_core.file_name' not in df.columns: + raise KeyError("The column 'sequence_file.file_core.file_name' does not exist.") + + # Filter rows where file_name is not null + df = df[df['sequence_file.file_core.file_name'].notna()] + + # Select only columns that are not None in the column_mapping + df = df[[col for col in column_mapping.values() if col is not None]] + + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['sequence_file.file_core.file_name'] + + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'The name of the file.', + 'Include the file extension in the file name. For example: R1.fastq.gz; codebook.json', + 'sequence_file.file_core.file_name'))).all(axis=1) + + # Apply the mask to filter out rows + df_filtered = df[mask] + + # Create SequencingFile objects from filtered DataFrame rows + sequencing_files = [ + SequencingFile( + file_name=row['sequence_file.file_core.file_name'], + library_preparation_id=row.get('library_preparation.biomaterial_core.biomaterial_id'), + sequencing_protocol_id=row.get('sequencing_protocol.protocol_core.protocol_id'), + read_index=row.get('sequence_file.read_index'), + run_id=row.get('sequence_file.run_id') + ) + for _, row in df_filtered.iterrows() + ] + + return sequencing_files + + def get_cell_lines(self, sheet_name, column_mapping): + """ + Retrieves parsed cell lines data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing cell line data. 
+ column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of CellLine objects parsed from the specified sheet. + """ + cell_lines = self.parse_cell_lines(sheet_name, column_mapping) + return cell_lines + + def get_differentiated_cell_lines(self, sheet_name, column_mapping): + """ + Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. + """ + differentiated_cell_lines = self.parse_differentiated_cell_lines(sheet_name, column_mapping) + return differentiated_cell_lines + + def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiated_cell_lines): + """ + Merges cell lines and differentiated cell lines based on their biomaterial IDs. + + Parameters: + ----------- + cell_lines : list + A list of CellLine objects to be merged. + differentiated_cell_lines : list + A list of DifferentiatedCellLine objects to be merged. + + Returns: + -------- + None + + Raises: + ------ + MissingEntityError: + If a differentiated cell line does not have a corresponding cell line. 
+ """ + cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} + for differentiated_cell_line in differentiated_cell_lines: + if differentiated_cell_line.input_biomaterial_id not in cell_line_ids: + raise MissingEntityError("Cell Line", + "Differentiated cell line", + differentiated_cell_line.biomaterial_id) + + for cell_line in cell_lines: + for differentiated_cell_line in differentiated_cell_lines: + if differentiated_cell_line.input_biomaterial_id == cell_line.biomaterial_id: + cell_line.add_differentiated_cell_line(differentiated_cell_line) + + def merge_differentiated_cell_line_and_library_preparation(self, differentiated_cell_lines, library_preparations): + """ + Merges differentiated cell lines and library preparations based on their biomaterial IDs. + + Parameters: + ----------- + differentiated_cell_lines : list + A list of DifferentiatedCellLine objects to be merged. + library_preparations : list + A list of LibraryPreparation objects to be merged. + + Returns: + -------- + None + + Raises: + ------ + MissingEntityError: + If a library preparation does not have a corresponding differentiated cell line. + """ + differentiated_ids = {diff_cell.biomaterial_id for diff_cell in differentiated_cell_lines} + for library_preparation in library_preparations: + if library_preparation.differentiated_biomaterial_id not in differentiated_ids: + raise MissingEntityError("Differentiated Cell Line", + "Library preparation", + library_preparation.biomaterial_id) + + for differentiated_cell_line in differentiated_cell_lines: + for library_preparation in library_preparations: + if library_preparation.differentiated_biomaterial_id == differentiated_cell_line.biomaterial_id: + differentiated_cell_line.add_library_preparation(library_preparation) + + def merge_library_preparation_sequencing_file(self, library_preparations, sequencing_files): + """ + Merges library preparations and sequencing files based on their IDs. 
+ + Parameters: + ----------- + library_preparations : list + A list of LibraryPreparation objects to be merged. + sequencing_files : list + A list of SequencingFile objects to be merged. + + Returns: + -------- + None + + Raises: + ------ + MissingEntityError: + If a sequencing file does not have a corresponding library preparation. + """ + library_ids = {lib_prep.biomaterial_id for lib_prep in library_preparations} + for sequencing_file in sequencing_files: + if sequencing_file.library_preparation_id not in library_ids: + raise MissingEntityError("Library preparation", + "Sequencing file", + sequencing_file.file_name) + + for library_preparation in library_preparations: + for sequencing_file in sequencing_files: + if sequencing_file.library_preparation_id == library_preparation.biomaterial_id: + library_preparation.add_sequencing_file(sequencing_file) + + def get_library_preparations(self, sheet_name, column_mapping): + """ + Retrieves parsed library preparations data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing library preparation data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of LibraryPreparation objects parsed from the specified sheet. + """ + library_preparations = self.parse_library_preparations(sheet_name, column_mapping) + return library_preparations + + def get_sequencing_files(self, sheet_name, column_mapping): + """ + Retrieves parsed sequencing files data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing sequencing file data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of SequencingFile objects parsed from the specified sheet. 
+ """ + sequencing_files = self.parse_sequencing_files(sheet_name, column_mapping) + return sequencing_files diff --git a/requirements.txt b/requirements.txt index c4c9135..0128bf0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ requests>=2.20.0, <3 urllib3<1.27, >=1.25.4 tqdm~=4.64.1 pandas~=1.1.5 -setuptools~=59.6.0 \ No newline at end of file +setuptools~=59.6.0 +openpyxl==3.1.3 \ No newline at end of file From 1c927db716d19653896c03c7d12feb1fd0cc1283 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 19 Jun 2024 16:26:08 +0100 Subject: [PATCH 13/55] metadata spreadsheet submission support added-added library prep support --- ait/commons/util/command/submit.py | 29 ++++++++++++++++++++++- ait/commons/util/util/spreadsheet_util.py | 19 ++++++++++----- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 453001d..f19c591 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -96,7 +96,7 @@ def multi_type_submission(self, cell_lines, submission_envelope_id, access_token print("Cell line has differentiated cell lines, creating process to link them") differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity('process', - cell_line.to_dict(), + {}, submission_envelope_id, access_token) @@ -116,6 +116,33 @@ def multi_type_submission(self, cell_lines, submission_envelope_id, access_token differentiated_biomaterial_to_entity_id_map[ differentiated_cell_line.biomaterial_id] = differentiated_entity_id + if len(differentiated_cell_line.library_preparations) > 0: + # Create differentiation process + print("Differentiated cell line has library preparation biomaterials, creating process to " + "link them") + + library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity('process', + cell_line.to_dict(), + submission_envelope_id, + access_token) + + # Create a dictionary to store 
biomaterial_id to entity_id mappings for library preparations + library_preparation_biomaterial_to_entity_id_map = {} + + for library_preparation in differentiated_cell_line.library_preparations: + print( + f"Creating Library preparation Biomaterial: {library_preparation.biomaterial_id}") + + # Create library_prep biomaterial + library_preparation_entity_id = self.use_existing_envelope_and_submit_entity('biomaterial', + library_preparation.to_dict(), + submission_envelope_id, + access_token) + + # Update the mapping dictionary + library_preparation_biomaterial_to_entity_id_map[ + library_preparation.biomaterial_id] = library_preparation_entity_id + def typed_submission(self, type, file, access_token): """ Submits a single entity based on its type. diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py index 13f1a3c..1334a00 100644 --- a/ait/commons/util/util/spreadsheet_util.py +++ b/ait/commons/util/util/spreadsheet_util.py @@ -1,5 +1,6 @@ import pandas as pd import json +import numpy as np class MissingMandatoryFieldError(Exception): @@ -12,7 +13,7 @@ class MissingEntityError(Exception): """Custom exception raised when an expected entity is missing.""" def __init__(self, missing_type, entity_type, missing_id): - super().__init__(f"Missing {missing_type} for {entity_type} and Id is {missing_id}") + super().__init__(f"Missing {missing_type} for {entity_type} and ID is {missing_id}") self.entity_type = entity_type self.missing_type = missing_type self.missing_id = missing_id @@ -110,21 +111,27 @@ def __repr__(self): return json.dumps(self.to_dict(), indent=2) def to_dict(self): + # Replace NaN values with None + def convert_nan_to_none(obj): + if isinstance(obj, float) and np.isnan(obj): + return None + return obj + return { "content": { "biomaterial_id": self.biomaterial_id, "protocol_id": self.protocol_id, "dissociation_protocol_id": self.dissociation_protocol_id, "differentiated_biomaterial_id": 
self.differentiated_biomaterial_id, - "average_fragment_size": self.average_fragment_size, - "input_amount_value": self.input_amount_value, + "average_fragment_size": convert_nan_to_none(self.average_fragment_size), + "input_amount_value": convert_nan_to_none(self.input_amount_value), "input_amount_unit": self.input_amount_unit, - "final_yield_value": self.final_yield_value, + "final_yield_value": convert_nan_to_none(self.final_yield_value), "final_yield_unit": self.final_yield_unit, - "concentration_value": self.concentration_value, + "concentration_value": convert_nan_to_none(self.concentration_value), "concentration_unit": self.concentration_unit, "pcr_cycles": self.pcr_cycles, - "pcr_cycles_for_sample_index": self.pcr_cycles_for_sample_index + "pcr_cycles_for_sample_index": convert_nan_to_none(self.pcr_cycles_for_sample_index) } } From 34657ee9910a7833ef75dd278b1f02041642e130 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 20 Jun 2024 13:30:06 +0100 Subject: [PATCH 14/55] metadata spreadsheet submission support - linkage and output file writing support --- ait/commons/util/command/submit.py | 280 ++++++++++++++++------ ait/commons/util/command/submit_file.py | 33 ++- ait/commons/util/util/spreadsheet_util.py | 46 ++-- 3 files changed, 266 insertions(+), 93 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index f19c591..f54422f 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -2,6 +2,7 @@ import requests import json import pandas as pd +import numpy as np from urllib.parse import urlparse from ait.commons.util.user_profile import get_profile @@ -73,75 +74,179 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def multi_type_submission(self, cell_lines, submission_envelope_id, access_token): - """ - Submits multiple cell lines. 
+ def submit_cell_line(self, cell_line, submission_envelope_id, access_token): + cell_line_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + cell_line.to_dict(), + submission_envelope_id, + access_token + ) + return cell_line_entity_id + + def handle_differentiated_cell_lines(self, cell_line, cell_line_entity_id, differentiated_cell_lines_df, + library_preparations_df, + submission_envelope_id, access_token): + if len(cell_line.differentiated_cell_lines) > 0: + print("Cell line has differentiated cell lines, creating process to link them") + differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', {}, submission_envelope_id, access_token + ) + + differentiated_biomaterial_to_entity_id_map = {} + differentiated_cell_line_entity_id_column_name = "differentiated_cell_line_entity_id" + + if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: + differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan + + for differentiated_cell_line in cell_line.differentiated_cell_lines: + print(f"Creating Differentiated Cell Line Biomaterial: " + f"{differentiated_cell_line.biomaterial_id} as a child of Cell line: " + f"{cell_line_entity_id}") + + differentiated_entity_id = self.create_child_biomaterial( + cell_line_entity_id, + differentiated_cell_line.to_dict(), + access_token + ) + + print(f"Created Differentiated Cell Line Biomaterial: " + f"{differentiated_entity_id}") + + print(f"Linking Differentiated Cell Line Biomaterial: " + f"{differentiated_entity_id} to envelope: " + f"{submission_envelope_id}") + + self.link_entity_to_envelope( + 'biomaterial', + differentiated_entity_id, + submission_envelope_id, + access_token + ) + + print(f"Linking Cell Line Biomaterial: " + f"{cell_line_entity_id} as input to process : " + f"{differentiation_process_entity_id}") + + self.perform_hal_linkage( + 
f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + print(f"Linking Differentiated cell line Biomaterial: " + f"{differentiated_entity_id} as derived by process : " + f"{differentiation_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + differentiated_biomaterial_to_entity_id_map[ + differentiated_cell_line.biomaterial_id] = differentiated_entity_id + + differentiated_cell_lines_df.loc[ + differentiated_cell_lines_df[ + 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == differentiated_cell_line.biomaterial_id, + differentiated_cell_line_entity_id_column_name + ] = differentiated_entity_id + + self.handle_library_preparations(differentiated_cell_line, + differentiated_entity_id, + library_preparations_df, + submission_envelope_id, + access_token) + + def handle_library_preparations(self, differentiated_cell_line, differentiated_entity_id, + library_preparations_df, + submission_envelope_id, access_token): + if len(differentiated_cell_line.library_preparations) > 0: + print("Differentiated cell line has library preparation biomaterials, creating process to link them") + library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', {}, submission_envelope_id, access_token + ) + + library_preparation_biomaterial_to_entity_id_map = {} + library_preparation_biomaterial_entity_id_column_name = " library_preparation_biomaterial_entity_id" + + if library_preparation_biomaterial_entity_id_column_name not in library_preparations_df.columns: + library_preparations_df[library_preparation_biomaterial_entity_id_column_name] = np.nan + + for library_preparation in differentiated_cell_line.library_preparations: + print(f"Creating Library Preparation Biomaterial: " + 
f"{library_preparation.biomaterial_id} as a child of Differentiated Cell line: " + f"{differentiated_entity_id}") + + library_preparation_entity_id = self.create_child_biomaterial( + differentiated_entity_id, + library_preparation.to_dict(), + access_token + ) + + print(f"Created Library preparation Biomaterial: " + f"{library_preparation_entity_id}") + + print(f"Linking Library preparation Biomaterial: " + f"{library_preparation_entity_id} to envelope: " + f"{submission_envelope_id}") + + self.link_entity_to_envelope( + 'biomaterial', + library_preparation_entity_id, + submission_envelope_id, + access_token + ) + + print(f"Linking Differentiation cell line Biomaterial: " + f"{differentiated_entity_id} as input to process : " + f"{library_preparation_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + print(f"Linking Library preparation Biomaterial: " + f"{library_preparation_entity_id} as derived by process : " + f"{library_preparation_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + library_preparation_biomaterial_to_entity_id_map[ + library_preparation.biomaterial_id] = library_preparation_entity_id + + library_preparations_df.loc[ + library_preparations_df[ + 'library_preparation.biomaterial_core.biomaterial_id'] == library_preparation.biomaterial_id, + library_preparation_biomaterial_entity_id_column_name + ] = library_preparation_entity_id + + def multi_type_submission(self, cell_lines, cell_lines_df, + differentiated_cell_lines_df, + library_preparations_df, + submission_envelope_id, + access_token): + cell_line_entity_id_column_name = "cell_line_entity_id" + + if cell_line_entity_id_column_name not in cell_lines_df.columns: + 
cell_lines_df[cell_line_entity_id_column_name] = np.nan - Parameters: - cell_lines (list): List of cell line objects to be submitted. - submission_envelope_id (str): ID of the submission envelope. - access_token (str): Access token for authorization. - """ for cell_line in cell_lines: print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") - # Create cell line biomaterial - cell_line_entity_id = self.use_existing_envelope_and_submit_entity('biomaterial', - cell_line.to_dict(), - submission_envelope_id, - access_token) - - if len(cell_line.differentiated_cell_lines) > 0: - # Create differentiation process - print("Cell line has differentiated cell lines, creating process to link them") - - differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity('process', - {}, - submission_envelope_id, - access_token) - - # Create a dictionary to store biomaterial_id to entity_id mappings for differentiated cell lines - differentiated_biomaterial_to_entity_id_map = {} - - for differentiated_cell_line in cell_line.differentiated_cell_lines: - print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id}") - - # Create differentiated cell line biomaterial - differentiated_entity_id = self.use_existing_envelope_and_submit_entity('biomaterial', - differentiated_cell_line.to_dict(), - submission_envelope_id, - access_token) - - # Update the mapping dictionary - differentiated_biomaterial_to_entity_id_map[ - differentiated_cell_line.biomaterial_id] = differentiated_entity_id + cell_line_entity_id = self.submit_cell_line(cell_line, submission_envelope_id, access_token) + cell_lines_df.loc[ + cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == cell_line.biomaterial_id, + cell_line_entity_id_column_name + ] = cell_line_entity_id - if len(differentiated_cell_line.library_preparations) > 0: - # Create differentiation process - print("Differentiated cell line has library preparation biomaterials, creating 
process to " - "link them") + self.handle_differentiated_cell_lines(cell_line, cell_line_entity_id, differentiated_cell_lines_df, + library_preparations_df, + submission_envelope_id, access_token) - library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity('process', - cell_line.to_dict(), - submission_envelope_id, - access_token) - - # Create a dictionary to store biomaterial_id to entity_id mappings for library preparations - library_preparation_biomaterial_to_entity_id_map = {} - - for library_preparation in differentiated_cell_line.library_preparations: - print( - f"Creating Library preparation Biomaterial: {library_preparation.biomaterial_id}") - - # Create library_prep biomaterial - library_preparation_entity_id = self.use_existing_envelope_and_submit_entity('biomaterial', - library_preparation.to_dict(), - submission_envelope_id, - access_token) - - # Update the mapping dictionary - library_preparation_biomaterial_to_entity_id_map[ - library_preparation.biomaterial_id] = library_preparation_entity_id + return cell_lines_df, differentiated_cell_lines_df, library_preparations_df def typed_submission(self, type, file, access_token): """ @@ -300,7 +405,7 @@ def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): print("Linking biomaterial " + biomaterial_id + " to process " + process_id) self.perform_hal_linkage(f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses", - process_id, 'biomaterials', 'processes', access_token) + process_id, 'processes', access_token) def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): """ @@ -326,6 +431,30 @@ def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): return url + def post_to_provider_api_and_get_entity_id(self, url, data, access_token): + """ + Sends a POST request to the specified URL. + + Parameters: + url (str): The URL to send the request to. 
+ data_type_in_hal_link (str): The data type in the HAL link. + data (dict): The data to be sent in the POST request. + access_token (str): Access token for authorization. + + Returns: + str: The URL from the response. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json=data) + response_data = response.json() + url = response_data['_links']['self']['href'] + + return get_id_from_url(url) + def create_new_submission_envelope(self, url, access_token): """ Creates a new submission envelope. @@ -347,7 +476,7 @@ def create_new_submission_envelope(self, url, access_token): return response_data - def perform_hal_linkage(self, url, input_id, link_this, link_to, access_token): + def perform_hal_linkage(self, url, input_id, link_to, access_token): """ Performs HAL linkage. @@ -367,9 +496,13 @@ def perform_hal_linkage(self, url, input_id, link_this, link_to, access_token): } response = requests.post(url, headers=headers, - data=f"{self.base_url}/{link_to}/{input_id}/{link_this}") + data=f"{self.base_url}/{link_to}/{input_id}") - return response.json() + if response.status_code != 200: + raise Exception(f"Failed to link biomaterial to process {input_id}. 
" + f"Status code: {response.status_code}, Response: {response.text}") + else: + print("linkage successful") def transform(self, file): """ @@ -417,3 +550,16 @@ def put_to_provider_api(self, url, access_token): response_data = response.json() return response_data + + def create_child_biomaterial(self, cell_line_entity_id, body, access_token): + url = self.base_url + '/' + 'biomaterials' + '/' + cell_line_entity_id + '/' + 'childBiomaterials' + + entity_id = self.post_to_provider_api_and_get_entity_id(url, body, access_token) + return entity_id + + def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, access_token): + # TODO: handle other types + if type == 'biomaterial': + url = self.submission_envelope_base_url + '/' + submission_envelope_id + '/' + 'biomaterials' + '/' + entity_id + + self.put_to_provider_api(url, access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 2f7a2a0..bb24e4b 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -1,4 +1,6 @@ # Import necessary modules/classes from ait.commons.util package +import pandas as pd + from ait.commons.util.command.submit import CmdSubmit, get_id_from_url from ait.commons.util.user_profile import get_profile from ait.commons.util.util.spreadsheet_util import SpreadsheetSubmitter @@ -88,13 +90,18 @@ def run(self): # Initialize SpreadsheetParser with the provided file path parser = SpreadsheetSubmitter(self.file) + # print(parser.list_sheets()) + # Parse different sections of the spreadsheet using defined column mappings - cell_lines = parser.get_cell_lines('Cell line ', self.cellline_column_mapping) - differentiated_cell_lines = parser.get_differentiated_cell_lines('Differentiated cell line', - self.differentiated_cellline_column_mapping) + cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', + self.cellline_column_mapping) + + differentiated_cell_lines, 
differentiated_cell_lines_df = parser.get_differentiated_cell_lines( + 'Differentiated cell line', + self.differentiated_cellline_column_mapping) parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) - library_preparations = parser.get_library_preparations('Library preparation', - self.library_preparation_column_mapping) + library_preparations, library_preparations_df = parser.get_library_preparations('Library preparation', + self.library_preparation_column_mapping) parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) sequencing_files = parser.get_sequencing_files('Sequence file', self.sequencing_file_column_mapping) @@ -109,4 +116,18 @@ def run(self): print("Submission envelope for this submission is: " + submission_envelope_id) - submission_instance.multi_type_submission(cell_lines, submission_envelope_id, self.access_token) + # Perform the submission and get the updated dataframes + updated_cell_lines_df, updated_differentiated_cell_lines_df, updated_library_preparations_df = submission_instance.multi_type_submission( + cell_lines, + cell_lines_df, + differentiated_cell_lines_df, + library_preparations_df, + submission_envelope_id, + self.access_token + ) + + # Save both dataframes to a single Excel file with multiple sheets + with pd.ExcelWriter("updated_cell_lines.xlsx") as writer: + updated_cell_lines_df.to_excel(writer, sheet_name='CellLines', index=False) + updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='DifferentiatedCellLines', index=False) + updated_library_preparations_df.to_excel(writer, sheet_name='Library Preparations', index=False) diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py index 1334a00..d07235b 100644 --- a/ait/commons/util/util/spreadsheet_util.py +++ b/ait/commons/util/util/spreadsheet_util.py @@ -111,10 +111,11 @@ def __repr__(self): return json.dumps(self.to_dict(), indent=2) 
def to_dict(self): - # Replace NaN values with None - def convert_nan_to_none(obj): - if isinstance(obj, float) and np.isnan(obj): - return None + # Replace NaN values and out-of-range float values with None + def convert_to_valid_json_value(obj): + if isinstance(obj, float): + if np.isnan(obj) or not np.isfinite(obj): + return None return obj return { @@ -123,15 +124,15 @@ def convert_nan_to_none(obj): "protocol_id": self.protocol_id, "dissociation_protocol_id": self.dissociation_protocol_id, "differentiated_biomaterial_id": self.differentiated_biomaterial_id, - "average_fragment_size": convert_nan_to_none(self.average_fragment_size), - "input_amount_value": convert_nan_to_none(self.input_amount_value), + "average_fragment_size": convert_to_valid_json_value(self.average_fragment_size), + "input_amount_value": convert_to_valid_json_value(self.input_amount_value), "input_amount_unit": self.input_amount_unit, - "final_yield_value": convert_nan_to_none(self.final_yield_value), + "final_yield_value": convert_to_valid_json_value(self.final_yield_value), "final_yield_unit": self.final_yield_unit, - "concentration_value": convert_nan_to_none(self.concentration_value), + "concentration_value": convert_to_valid_json_value(self.concentration_value), "concentration_unit": self.concentration_unit, "pcr_cycles": self.pcr_cycles, - "pcr_cycles_for_sample_index": convert_nan_to_none(self.pcr_cycles_for_sample_index) + "pcr_cycles_for_sample_index": convert_to_valid_json_value(self.pcr_cycles_for_sample_index) } } @@ -244,8 +245,10 @@ def parse_cell_lines(self, sheet_name, column_mapping): Returns: -------- - list - A list of CellLine objects parsed from the specified sheet. + tuple + A tuple containing: + - list of CellLine objects parsed from the specified sheet. + - pd.DataFrame with the parsed data. 
""" df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') df.columns = df.columns.str.strip() @@ -280,6 +283,7 @@ def parse_cell_lines(self, sheet_name, column_mapping): df_filtered = df[mask] # Check for mandatory fields and create CellLine objects + # TODO: for all cell_lines = [] for _, row in df_filtered.iterrows(): biomaterial_id = row['cell_line.biomaterial_core.biomaterial_id'] @@ -307,7 +311,7 @@ def parse_cell_lines(self, sheet_name, column_mapping): ) ) - return cell_lines + return cell_lines, df_filtered def parse_differentiated_cell_lines(self, sheet_name, column_mapping): """ @@ -372,7 +376,7 @@ def parse_differentiated_cell_lines(self, sheet_name, column_mapping): for _, row in df_filtered.iterrows() ] - return differentiated_cell_lines + return differentiated_cell_lines, df_filtered def parse_library_preparations(self, sheet_name, column_mapping): """ @@ -401,6 +405,7 @@ def parse_library_preparations(self, sheet_name, column_mapping): # Filter rows where biomaterial_id is not null df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] + df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Filter column_mapping to include only keys that exist in df.columns columns_to_select = [col_mapping_val for col_mapping_key, col_mapping_val in column_mapping.items() if @@ -443,7 +448,7 @@ def parse_library_preparations(self, sheet_name, column_mapping): for _, row in df_filtered.iterrows() ] - return library_preparations + return library_preparations, df_filtered def parse_sequencing_files(self, sheet_name, column_mapping): """ @@ -517,8 +522,8 @@ def get_cell_lines(self, sheet_name, column_mapping): list A list of CellLine objects parsed from the specified sheet. 
""" - cell_lines = self.parse_cell_lines(sheet_name, column_mapping) - return cell_lines + cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, column_mapping) + return cell_lines, cell_lines_df def get_differentiated_cell_lines(self, sheet_name, column_mapping): """ @@ -536,8 +541,9 @@ def get_differentiated_cell_lines(self, sheet_name, column_mapping): list A list of DifferentiatedCellLine objects parsed from the specified sheet. """ - differentiated_cell_lines = self.parse_differentiated_cell_lines(sheet_name, column_mapping) - return differentiated_cell_lines + differentiated_cell_lines, differentiated_cell_lines_df = self.parse_differentiated_cell_lines(sheet_name, + column_mapping) + return differentiated_cell_lines, differentiated_cell_lines_df def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiated_cell_lines): """ @@ -651,8 +657,8 @@ def get_library_preparations(self, sheet_name, column_mapping): list A list of LibraryPreparation objects parsed from the specified sheet. 
""" - library_preparations = self.parse_library_preparations(sheet_name, column_mapping) - return library_preparations + library_preparations, df_filtered = self.parse_library_preparations(sheet_name, column_mapping) + return library_preparations, df_filtered def get_sequencing_files(self, sheet_name, column_mapping): """ From 542b3de1fa8c60687fc9d1ba3445d53c617c1eaa Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 20 Jun 2024 16:35:54 +0100 Subject: [PATCH 15/55] metadata spreadsheet submission support - linkage and sequencing file support --- ait/commons/util/command/submit.py | 116 ++++++++++++++++++++-- ait/commons/util/util/spreadsheet_util.py | 30 ++++-- 2 files changed, 132 insertions(+), 14 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index f54422f..e903ecf 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -75,6 +75,17 @@ def run(self): return self.typed_submission(self.type, self.file, self.access_token) def submit_cell_line(self, cell_line, submission_envelope_id, access_token): + """ + Submits a cell line as a biomaterial entity to a specified submission envelope. + + Parameters: + - cell_line: The cell line object to be submitted. + - submission_envelope_id: ID of the submission envelope where the entity will be submitted. + - access_token: Access token for authentication and authorization. + + Returns: + - cell_line_entity_id: Entity ID of the submitted cell line biomaterial. 
+ """ cell_line_entity_id = self.use_existing_envelope_and_submit_entity( 'biomaterial', cell_line.to_dict(), @@ -84,8 +95,18 @@ def submit_cell_line(self, cell_line, submission_envelope_id, access_token): return cell_line_entity_id def handle_differentiated_cell_lines(self, cell_line, cell_line_entity_id, differentiated_cell_lines_df, - library_preparations_df, - submission_envelope_id, access_token): + library_preparations_df, submission_envelope_id, access_token): + """ + Handles differentiated cell lines associated with a given cell line. + + Parameters: + - cell_line: The main cell line object. + - cell_line_entity_id: Entity ID of the main cell line already submitted. + - differentiated_cell_lines_df: DataFrame containing information about differentiated cell lines. + - library_preparations_df: DataFrame containing information about library preparations. + - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. + """ if len(cell_line.differentiated_cell_lines) > 0: print("Cell line has differentiated cell lines, creating process to link them") differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity( @@ -157,8 +178,17 @@ def handle_differentiated_cell_lines(self, cell_line, cell_line_entity_id, diffe access_token) def handle_library_preparations(self, differentiated_cell_line, differentiated_entity_id, - library_preparations_df, - submission_envelope_id, access_token): + library_preparations_df, submission_envelope_id, access_token): + """ + Handles library preparations associated with a differentiated cell line. + + Parameters: + - differentiated_cell_line: The differentiated cell line object. + - differentiated_entity_id: Entity ID of the differentiated cell line. + - library_preparations_df: DataFrame containing information about library preparations. 
+ - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. + """ if len(differentiated_cell_line.library_preparations) > 0: print("Differentiated cell line has library preparation biomaterials, creating process to link them") library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity( @@ -223,11 +253,78 @@ def handle_library_preparations(self, differentiated_cell_line, differentiated_e library_preparation_biomaterial_entity_id_column_name ] = library_preparation_entity_id + self.handle_sequence_files(library_preparation, + library_preparation_entity_id, + submission_envelope_id, + access_token) + + def handle_sequence_files(self, library_preparation, library_preparation_entity_id, submission_envelope_id, + access_token): + if len(library_preparation.sequencing_files) > 0: + print("library preparation has sequencing files, creating process to link them") + sequencing_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', {}, submission_envelope_id, access_token + ) + + sequencing_file_to_entity_id_map = {} + + for sequencing_file in library_preparation.sequencing_files: + print(f"Creating Sequencing file: " + f"{sequencing_file.file_name} as a result of sequencing the " + f"Library preparation biomaterial: " + f"{library_preparation_entity_id}") + + sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( + 'file', + sequencing_file.to_dict(), + submission_envelope_id, + access_token + ) + + print(f"Created Sequencing file: " + f"{sequencing_file_entity_id}") + + print(f"Linking Library preparation Biomaterial: " + f"{library_preparation_entity_id} as input to process : " + f"{sequencing_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + print(f"Linking 
Sequencing file: " + f"{sequencing_file_entity_id} as derived by process : " + f"{sequencing_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + sequencing_file_to_entity_id_map[ + sequencing_file.file_name] = sequencing_file_entity_id + def multi_type_submission(self, cell_lines, cell_lines_df, differentiated_cell_lines_df, library_preparations_df, submission_envelope_id, access_token): + """ + Handles the submission of multiple types of biomaterials (cell lines, differentiated cell lines, library preparations) + to a specified submission envelope. + + Parameters: + - cell_lines: List of cell line objects to be submitted. + - cell_lines_df: DataFrame for tracking cell line entity IDs. + - differentiated_cell_lines_df: DataFrame for tracking differentiated cell line entity IDs. + - library_preparations_df: DataFrame for tracking library preparation entity IDs. + - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. + + Returns: + - Updated versions of cell_lines_df, differentiated_cell_lines_df, and library_preparations_df after submission. 
+ """ cell_line_entity_id_column_name = "cell_line_entity_id" if cell_line_entity_id_column_name not in cell_lines_df.columns: @@ -243,8 +340,7 @@ def multi_type_submission(self, cell_lines, cell_lines_df, ] = cell_line_entity_id self.handle_differentiated_cell_lines(cell_line, cell_line_entity_id, differentiated_cell_lines_df, - library_preparations_df, - submission_envelope_id, access_token) + library_preparations_df, submission_envelope_id, access_token) return cell_lines_df, differentiated_cell_lines_df, library_preparations_df @@ -351,6 +447,8 @@ def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submi halEntity = 'biomaterials' elif input_entity_type == 'process': halEntity = 'processes' + elif input_entity_type == 'file': + halEntity = 'files' entity_create_url_from_sub_env_hal_links = (self.submission_envelope_base_url + "/" + submission_envelope_id @@ -559,7 +657,11 @@ def create_child_biomaterial(self, cell_line_entity_id, body, access_token): def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, access_token): # TODO: handle other types + global url + if type == 'biomaterial': url = self.submission_envelope_base_url + '/' + submission_envelope_id + '/' + 'biomaterials' + '/' + entity_id + elif type == 'file': + url = self.submission_envelope_base_url + '/' + submission_envelope_id + '/' + 'files' + '/' + entity_id - self.put_to_provider_api(url, access_token) + self.put_to_provider_api(url, access_token) diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py index d07235b..6861579 100644 --- a/ait/commons/util/util/spreadsheet_util.py +++ b/ait/commons/util/util/spreadsheet_util.py @@ -137,6 +137,10 @@ def convert_to_valid_json_value(obj): } +class EntityType: + FILE = 'FILE' + + class SequencingFile: def __init__(self, file_name, library_preparation_id, sequencing_protocol_id, read_index, run_id): self.file_name = file_name @@ -144,21 +148,33 @@ def __init__(self, 
file_name, library_preparation_id, sequencing_protocol_id, re self.sequencing_protocol_id = sequencing_protocol_id self.read_index = read_index self.run_id = run_id + self.entity_type = EntityType.FILE + self.content = { + "file_name": self.file_name, + "library_preparation_id": self.library_preparation_id, + "sequencing_protocol_id": self.sequencing_protocol_id, + "read_index": self.read_index, + "run_id": self.run_id + } + self.set_file_name(file_name) + self.init_file() def __repr__(self): return json.dumps(self.to_dict(), indent=2) def to_dict(self): return { - "content": { - "file_name": self.file_name, - "library_preparation_id": self.library_preparation_id, - "sequencing_protocol_id": self.sequencing_protocol_id, - "read_index": self.read_index, - "run_id": self.run_id - } + "content": self.content, + "fileName": self.file_name } + def set_file_name(self, file_name): + self.file_name = file_name + + def init_file(self): + # Placeholder for any initialization logic required for the file + pass + class SpreadsheetSubmitter: """ From 47b27344e062e82501067757b0a6255fcbca3138 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 20 Jun 2024 22:32:44 +0100 Subject: [PATCH 16/55] metadata spreadsheet submission support - sequencing file sheet updated with IDs --- ait/commons/util/command/submit.py | 27 ++++++++++++++++++----- ait/commons/util/command/submit_file.py | 8 +++++-- ait/commons/util/util/spreadsheet_util.py | 7 +++--- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index e903ecf..d82c135 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -95,7 +95,8 @@ def submit_cell_line(self, cell_line, submission_envelope_id, access_token): return cell_line_entity_id def handle_differentiated_cell_lines(self, cell_line, cell_line_entity_id, differentiated_cell_lines_df, - library_preparations_df, submission_envelope_id, access_token): + 
library_preparations_df, sequencing_file_df, submission_envelope_id, + access_token): """ Handles differentiated cell lines associated with a given cell line. @@ -174,11 +175,12 @@ def handle_differentiated_cell_lines(self, cell_line, cell_line_entity_id, diffe self.handle_library_preparations(differentiated_cell_line, differentiated_entity_id, library_preparations_df, + sequencing_file_df, submission_envelope_id, access_token) def handle_library_preparations(self, differentiated_cell_line, differentiated_entity_id, - library_preparations_df, submission_envelope_id, access_token): + library_preparations_df, sequencing_file_df, submission_envelope_id, access_token): """ Handles library preparations associated with a differentiated cell line. @@ -255,10 +257,13 @@ def handle_library_preparations(self, differentiated_cell_line, differentiated_e self.handle_sequence_files(library_preparation, library_preparation_entity_id, + sequencing_file_df, submission_envelope_id, access_token) - def handle_sequence_files(self, library_preparation, library_preparation_entity_id, submission_envelope_id, + def handle_sequence_files(self, library_preparation, library_preparation_entity_id, + sequencing_file_df, + submission_envelope_id, access_token): if len(library_preparation.sequencing_files) > 0: print("library preparation has sequencing files, creating process to link them") @@ -267,6 +272,10 @@ def handle_sequence_files(self, library_preparation, library_preparation_entity_ ) sequencing_file_to_entity_id_map = {} + sequencing_file_entity_id_column_name = "sequencing_file_entity_id" + + if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: + sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan for sequencing_file in library_preparation.sequencing_files: print(f"Creating Sequencing file: " @@ -302,12 +311,19 @@ def handle_sequence_files(self, library_preparation, library_preparation_entity_ sequencing_process_entity_id, 'processes', access_token 
) + sequencing_file_df.loc[ + sequencing_file_df[ + 'sequence_file.file_core.file_name'] == sequencing_file.file_name, + sequencing_file_entity_id_column_name + ] = sequencing_file_entity_id + sequencing_file_to_entity_id_map[ sequencing_file.file_name] = sequencing_file_entity_id def multi_type_submission(self, cell_lines, cell_lines_df, differentiated_cell_lines_df, library_preparations_df, + sequencing_file_df, submission_envelope_id, access_token): """ @@ -340,9 +356,10 @@ def multi_type_submission(self, cell_lines, cell_lines_df, ] = cell_line_entity_id self.handle_differentiated_cell_lines(cell_line, cell_line_entity_id, differentiated_cell_lines_df, - library_preparations_df, submission_envelope_id, access_token) + library_preparations_df, sequencing_file_df, submission_envelope_id, + access_token) - return cell_lines_df, differentiated_cell_lines_df, library_preparations_df + return cell_lines_df, differentiated_cell_lines_df, library_preparations_df, sequencing_file_df def typed_submission(self, type, file, access_token): """ diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index bb24e4b..80c85e7 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -104,7 +104,8 @@ def run(self): self.library_preparation_column_mapping) parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) - sequencing_files = parser.get_sequencing_files('Sequence file', self.sequencing_file_column_mapping) + sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file', + self.sequencing_file_column_mapping) parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) # Print each CellLine object in CellLineMaster @@ -117,11 +118,13 @@ def run(self): print("Submission envelope for this submission is: " + submission_envelope_id) # Perform the submission and get the updated dataframes - 
updated_cell_lines_df, updated_differentiated_cell_lines_df, updated_library_preparations_df = submission_instance.multi_type_submission( + (updated_cell_lines_df, updated_differentiated_cell_lines_df, + updated_library_preparations_df, updated_sequencing_files_df) = submission_instance.multi_type_submission( cell_lines, cell_lines_df, differentiated_cell_lines_df, library_preparations_df, + sequencing_files_df, submission_envelope_id, self.access_token ) @@ -131,3 +134,4 @@ def run(self): updated_cell_lines_df.to_excel(writer, sheet_name='CellLines', index=False) updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='DifferentiatedCellLines', index=False) updated_library_preparations_df.to_excel(writer, sheet_name='Library Preparations', index=False) + updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence files', index=False) diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py index 6861579..3e61f3f 100644 --- a/ait/commons/util/util/spreadsheet_util.py +++ b/ait/commons/util/util/spreadsheet_util.py @@ -421,6 +421,7 @@ def parse_library_preparations(self, sheet_name, column_mapping): # Filter rows where biomaterial_id is not null df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] + # TODO: for all df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Filter column_mapping to include only keys that exist in df.columns @@ -520,7 +521,7 @@ def parse_sequencing_files(self, sheet_name, column_mapping): for _, row in df_filtered.iterrows() ] - return sequencing_files + return sequencing_files, df_filtered def get_cell_lines(self, sheet_name, column_mapping): """ @@ -692,5 +693,5 @@ def get_sequencing_files(self, sheet_name, column_mapping): list A list of SequencingFile objects parsed from the specified sheet. 
""" - sequencing_files = self.parse_sequencing_files(sheet_name, column_mapping) - return sequencing_files + sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, column_mapping) + return sequencing_files, df_filtered From 93f34a297b761b029ff93f639c4b402d3fe53418 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 25 Jun 2024 11:37:58 +0100 Subject: [PATCH 17/55] several improvements including file validation --- ait/commons/util/__main__.py | 6 +- ait/commons/util/cmd.py | 4 +- ait/commons/util/command/list.py | 33 ++ ait/commons/util/command/submit.py | 559 ++++++++++++---------- ait/commons/util/command/submit_file.py | 163 ++++--- ait/commons/util/util/spreadsheet_util.py | 85 ++-- 6 files changed, 469 insertions(+), 381 deletions(-) diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index 5e8a547..a065dec 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -86,8 +86,10 @@ def parse_args(args): parser_config.add_argument('--dataset', help='your dataset reference') parser_config.add_argument('--process', help='your process/analysis reference') - parser_config = cmd_parser.add_parser('submit-file', help='submit your file containing your consolidated metadata') - parser_config.add_argument('--file', help='your metadata') + parser_config = cmd_parser.add_parser('submit-file', help='submit your file containing your dataset metadata') + parser_config.add_argument('--file', help='spreadsheet containing your dataset metadata') + parser_config.add_argument('--action', help='action you want to perform (ADD/MODIFY/DELETE') + parser_config.add_argument('--dataset', help='your dataset reference') parser_create = cmd_parser.add_parser('create', help='create an upload area (authorised users only)') parser_create.add_argument('NAME', help='name for the new area', type=valid_project_name) diff --git a/ait/commons/util/cmd.py b/ait/commons/util/cmd.py index 2df7fbc..f96e603 100644 --- a/ait/commons/util/cmd.py 
+++ b/ait/commons/util/cmd.py @@ -44,8 +44,8 @@ def __init__(self, args): print(msg) elif args.command == 'submit-file': - CmdSubmitFile(args).run() - # print(msg) + success, msg = CmdSubmitFile(args).run() + print(msg) else: if profile_exists(args.profile): diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index c3d0e7e..c28c7a7 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -60,6 +60,39 @@ def list_bucket_contents(self, selected_area, prefix=''): k = f.get('Key') print_area(k, {'key': k, 'perms': 'file'}) + def list_bucket_contents_and_return(self, selected_area, prefix=''): + """ + Lists the contents of an S3 bucket and returns a list of file keys. + + Parameters: + - selected_area: The S3 bucket name. + - prefix: The prefix to filter objects by (default is empty string, which lists all objects). + + Returns: + - A list of file keys in the bucket. + """ + file_keys = [] + + def _list_bucket_contents(bucket, prefix): + result = self.s3_cli.list_objects_v2(Bucket=bucket, Delimiter='/', Prefix=prefix) + + # Folders + dirs = result.get('CommonPrefixes', []) + for d in dirs: + k = d.get('Prefix') + # print_area(k, {'key': k, 'perms': 'dir'}) + _list_bucket_contents(bucket, prefix=k) + + # Files + files = result.get('Contents', []) + for f in files: + k = f.get('Key') + # print_area(k, {'key': k, 'perms': 'file'}) + file_keys.append(k) + + _list_bucket_contents(selected_area, prefix) + return file_keys + def print_count(count): if count == 0: diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index d82c135..1610280 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -1,4 +1,6 @@ import csv +import traceback + import requests import json import pandas as pd @@ -22,6 +24,16 @@ def get_id_from_url(url): return path_parts[2] +def get_process_content(name): + process_data = { + "content": { + "type": name + } + } + + return 
process_data + + class CmdSubmit: """ A class to handle submission of studies, datasets, and biomaterials to a server. @@ -74,251 +86,253 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def submit_cell_line(self, cell_line, submission_envelope_id, access_token): + def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, access_token): """ Submits a cell line as a biomaterial entity to a specified submission envelope. Parameters: - cell_line: The cell line object to be submitted. + - cell_lines_df: DataFrame containing information about cell lines. - submission_envelope_id: ID of the submission envelope where the entity will be submitted. - access_token: Access token for authentication and authorization. Returns: - cell_line_entity_id: Entity ID of the submitted cell line biomaterial. """ + cell_line_entity_id_column_name = "Identifier" + + if cell_line_entity_id_column_name not in cell_lines_df.columns: + cell_lines_df[cell_line_entity_id_column_name] = np.nan + + print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") + cell_line_entity_id = self.use_existing_envelope_and_submit_entity( 'biomaterial', cell_line.to_dict(), submission_envelope_id, access_token ) + + cell_lines_df.loc[ + cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == + cell_line.biomaterial_id, + cell_line_entity_id_column_name + ] = cell_line_entity_id + return cell_line_entity_id - def handle_differentiated_cell_lines(self, cell_line, cell_line_entity_id, differentiated_cell_lines_df, - library_preparations_df, sequencing_file_df, submission_envelope_id, - access_token): + def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differentiated_cell_line, + differentiated_cell_lines_df, library_preparations_df, + sequencing_file_df, submission_envelope_id, access_token): """ - Handles differentiated cell lines associated with a given cell line. 
+ Handles a single differentiated cell line associated with a given cell line. Parameters: - cell_line: The main cell line object. - cell_line_entity_id: Entity ID of the main cell line already submitted. + - differentiated_cell_line: The differentiated cell line object. - differentiated_cell_lines_df: DataFrame containing information about differentiated cell lines. - library_preparations_df: DataFrame containing information about library preparations. + - sequencing_file_df: DataFrame containing information about Sequence files. - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. """ - if len(cell_line.differentiated_cell_lines) > 0: - print("Cell line has differentiated cell lines, creating process to link them") - differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', {}, submission_envelope_id, access_token - ) - - differentiated_biomaterial_to_entity_id_map = {} - differentiated_cell_line_entity_id_column_name = "differentiated_cell_line_entity_id" - - if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: - differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan - - for differentiated_cell_line in cell_line.differentiated_cell_lines: - print(f"Creating Differentiated Cell Line Biomaterial: " - f"{differentiated_cell_line.biomaterial_id} as a child of Cell line: " - f"{cell_line_entity_id}") - - differentiated_entity_id = self.create_child_biomaterial( - cell_line_entity_id, - differentiated_cell_line.to_dict(), - access_token - ) - - print(f"Created Differentiated Cell Line Biomaterial: " - f"{differentiated_entity_id}") - - print(f"Linking Differentiated Cell Line Biomaterial: " - f"{differentiated_entity_id} to envelope: " - f"{submission_envelope_id}") - - self.link_entity_to_envelope( - 'biomaterial', - differentiated_entity_id, - 
submission_envelope_id, - access_token - ) - - print(f"Linking Cell Line Biomaterial: " - f"{cell_line_entity_id} as input to process : " - f"{differentiation_process_entity_id}") - - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", - differentiation_process_entity_id, 'processes', access_token - ) - - print(f"Linking Differentiated cell line Biomaterial: " - f"{differentiated_entity_id} as derived by process : " - f"{differentiation_process_entity_id}") - - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", - differentiation_process_entity_id, 'processes', access_token - ) - - differentiated_biomaterial_to_entity_id_map[ - differentiated_cell_line.biomaterial_id] = differentiated_entity_id - - differentiated_cell_lines_df.loc[ - differentiated_cell_lines_df[ - 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == differentiated_cell_line.biomaterial_id, - differentiated_cell_line_entity_id_column_name - ] = differentiated_entity_id - - self.handle_library_preparations(differentiated_cell_line, - differentiated_entity_id, - library_preparations_df, - sequencing_file_df, - submission_envelope_id, - access_token) - - def handle_library_preparations(self, differentiated_cell_line, differentiated_entity_id, - library_preparations_df, sequencing_file_df, submission_envelope_id, access_token): - """ - Handles library preparations associated with a differentiated cell line. 
+ print("Cell line has differentiated cell lines, creating differentiation process to link them") + + differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', get_process_content('differentiation'), + submission_envelope_id, access_token + ) + + differentiated_biomaterial_to_entity_id_map = {} + differentiated_cell_line_entity_id_column_name = "Id" + + if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: + differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan + + print( + f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " + f"as a child of Cell line: {cell_line_entity_id}") + + differentiated_entity_id = self.create_child_biomaterial( + cell_line_entity_id, + differentiated_cell_line.to_dict(), + access_token + ) + + print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to envelope: {submission_envelope_id}") + + self.link_entity_to_envelope( + 'biomaterial', + differentiated_entity_id, + submission_envelope_id, + access_token + ) + + print( + f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " + f"input to process : {differentiation_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + differentiated_biomaterial_to_entity_id_map[ + differentiated_cell_line.biomaterial_id] = differentiated_entity_id + + 
differentiated_cell_lines_df.loc[ + differentiated_cell_lines_df[ + 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == + differentiated_cell_line.biomaterial_id, + differentiated_cell_line_entity_id_column_name + ] = differentiated_entity_id + + return differentiated_entity_id + + def handle_library_preparation(self, differentiated_entity_id, library_preparation, + library_preparations_df, submission_envelope_id, access_token): + """ + Handles a single library preparation associated with a given differentiated cell line. Parameters: - differentiated_cell_line: The differentiated cell line object. - - differentiated_entity_id: Entity ID of the differentiated cell line. + - differentiated_entity_id: Entity ID of the differentiated cell line already submitted. + - library_preparation: The library preparation object. - library_preparations_df: DataFrame containing information about library preparations. + - sequencing_file_df: DataFrame containing information about sequencing files. - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. 
""" - if len(differentiated_cell_line.library_preparations) > 0: - print("Differentiated cell line has library preparation biomaterials, creating process to link them") - library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', {}, submission_envelope_id, access_token - ) - - library_preparation_biomaterial_to_entity_id_map = {} - library_preparation_biomaterial_entity_id_column_name = " library_preparation_biomaterial_entity_id" - - if library_preparation_biomaterial_entity_id_column_name not in library_preparations_df.columns: - library_preparations_df[library_preparation_biomaterial_entity_id_column_name] = np.nan - - for library_preparation in differentiated_cell_line.library_preparations: - print(f"Creating Library Preparation Biomaterial: " - f"{library_preparation.biomaterial_id} as a child of Differentiated Cell line: " - f"{differentiated_entity_id}") - - library_preparation_entity_id = self.create_child_biomaterial( - differentiated_entity_id, - library_preparation.to_dict(), - access_token - ) - - print(f"Created Library preparation Biomaterial: " - f"{library_preparation_entity_id}") - - print(f"Linking Library preparation Biomaterial: " - f"{library_preparation_entity_id} to envelope: " - f"{submission_envelope_id}") - - self.link_entity_to_envelope( - 'biomaterial', - library_preparation_entity_id, - submission_envelope_id, - access_token - ) - - print(f"Linking Differentiation cell line Biomaterial: " - f"{differentiated_entity_id} as input to process : " - f"{library_preparation_process_entity_id}") - - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - print(f"Linking Library preparation Biomaterial: " - f"{library_preparation_entity_id} as derived by process : " - f"{library_preparation_process_entity_id}") - - self.perform_hal_linkage( - 
f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - library_preparation_biomaterial_to_entity_id_map[ - library_preparation.biomaterial_id] = library_preparation_entity_id - - library_preparations_df.loc[ - library_preparations_df[ - 'library_preparation.biomaterial_core.biomaterial_id'] == library_preparation.biomaterial_id, - library_preparation_biomaterial_entity_id_column_name - ] = library_preparation_entity_id - - self.handle_sequence_files(library_preparation, - library_preparation_entity_id, - sequencing_file_df, - submission_envelope_id, - access_token) - - def handle_sequence_files(self, library_preparation, library_preparation_entity_id, - sequencing_file_df, - submission_envelope_id, - access_token): - if len(library_preparation.sequencing_files) > 0: - print("library preparation has sequencing files, creating process to link them") - sequencing_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', {}, submission_envelope_id, access_token - ) - - sequencing_file_to_entity_id_map = {} - sequencing_file_entity_id_column_name = "sequencing_file_entity_id" - - if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: - sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan - - for sequencing_file in library_preparation.sequencing_files: - print(f"Creating Sequencing file: " - f"{sequencing_file.file_name} as a result of sequencing the " - f"Library preparation biomaterial: " - f"{library_preparation_entity_id}") - - sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( - 'file', - sequencing_file.to_dict(), - submission_envelope_id, - access_token - ) - - print(f"Created Sequencing file: " - f"{sequencing_file_entity_id}") - - print(f"Linking Library preparation Biomaterial: " - f"{library_preparation_entity_id} as input to process : " - f"{sequencing_process_entity_id}") - - 
self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", - sequencing_process_entity_id, 'processes', access_token - ) - - print(f"Linking Sequencing file: " - f"{sequencing_file_entity_id} as derived by process : " - f"{sequencing_process_entity_id}") - - self.perform_hal_linkage( - f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", - sequencing_process_entity_id, 'processes', access_token - ) - - sequencing_file_df.loc[ - sequencing_file_df[ - 'sequence_file.file_core.file_name'] == sequencing_file.file_name, - sequencing_file_entity_id_column_name - ] = sequencing_file_entity_id - - sequencing_file_to_entity_id_map[ - sequencing_file.file_name] = sequencing_file_entity_id + print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " + f"{differentiated_entity_id}") + + library_preparation_entity_id = self.create_child_biomaterial( + differentiated_entity_id, + library_preparation.to_dict(), + access_token + ) + + print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"to envelope: {submission_envelope_id}") + + self.link_entity_to_envelope( + 'biomaterial', + library_preparation_entity_id, + submission_envelope_id, + access_token + ) + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"as input to library preparation process") + + library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', get_process_content('library_preparation'), + submission_envelope_id, access_token + ) + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"as derived by library 
preparation process") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + library_preparations_df.loc[ + library_preparations_df[ + 'library_preparation.biomaterial_core.biomaterial_id'] == + library_preparation.biomaterial_id, + 'Id' + ] = library_preparation_entity_id + + return library_preparation_entity_id + + def handle_sequencing_file(self, library_preparation, library_preparation_entity_id, sequencing_file, + sequencing_file_df, submission_envelope_id, access_token): + """ + Handles a single sequencing file associated with a given library preparation. + + Parameters: + - library_preparation: The library preparation object. + - library_preparation_entity_id: Entity ID of the library preparation already submitted. + - sequencing_file: The sequencing file object. + - sequencing_file_df: DataFrame containing information about sequencing files. + - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. 
+ """ + print("Creating sequencing process to link the sequencing file") + + sequencing_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', get_process_content('sequencing'), + submission_envelope_id, + access_token + ) + + sequencing_file_entity_id_column_name = "Id" + + if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: + sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan + + print( + f"Creating Sequencing file: {sequencing_file.file_name} " + f"as a result of sequencing the Library preparation biomaterial: {library_preparation_entity_id}") + + sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( + 'file', + sequencing_file.to_dict(), + submission_envelope_id, + access_token + ) + + print(f"Created Sequencing file: {sequencing_file_entity_id}") + + print( + f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " + f"as input to process: {sequencing_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Sequencing file: {sequencing_file_entity_id} " + f"as derived by process: {sequencing_process_entity_id}") + + self.perform_hal_linkage( + f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + sequencing_file_df.loc[ + sequencing_file_df[ + 'sequence_file.file_core.file_name'] == sequencing_file.file_name, + sequencing_file_entity_id_column_name + ] = sequencing_file_entity_id def multi_type_submission(self, cell_lines, cell_lines_df, differentiated_cell_lines_df, @@ -327,7 +341,8 @@ def multi_type_submission(self, cell_lines, cell_lines_df, submission_envelope_id, access_token): """ - Handles the submission of multiple types of biomaterials (cell lines, differentiated cell lines, library preparations) + Handles 
the submission of multiple types of biomaterials (cell lines, + differentiated cell lines, library preparations) to a specified submission envelope. Parameters: @@ -335,31 +350,60 @@ def multi_type_submission(self, cell_lines, cell_lines_df, - cell_lines_df: DataFrame for tracking cell line entity IDs. - differentiated_cell_lines_df: DataFrame for tracking differentiated cell line entity IDs. - library_preparations_df: DataFrame for tracking library preparation entity IDs. + - sequencing_file_df: DataFrame for tracking sequencing file entity IDs. - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. Returns: - - Updated versions of cell_lines_df, differentiated_cell_lines_df, and library_preparations_df after submission. - """ - cell_line_entity_id_column_name = "cell_line_entity_id" - - if cell_line_entity_id_column_name not in cell_lines_df.columns: - cell_lines_df[cell_line_entity_id_column_name] = np.nan - - for cell_line in cell_lines: - print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") - - cell_line_entity_id = self.submit_cell_line(cell_line, submission_envelope_id, access_token) - cell_lines_df.loc[ - cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == cell_line.biomaterial_id, - cell_line_entity_id_column_name - ] = cell_line_entity_id - - self.handle_differentiated_cell_lines(cell_line, cell_line_entity_id, differentiated_cell_lines_df, - library_preparations_df, sequencing_file_df, submission_envelope_id, - access_token) - - return cell_lines_df, differentiated_cell_lines_df, library_preparations_df, sequencing_file_df + - Tuple containing updated DataFrames and a status message. 
+ """ + try: + for cell_line in cell_lines: + + cell_line_entity_id = self.handle_cell_line(cell_line, + cell_lines_df, + submission_envelope_id, + access_token) + + for differentiated_cell_line in cell_line.differentiated_cell_lines: + differentiated_entity_id = self.handle_differentiated_cell_line(cell_line, cell_line_entity_id, + differentiated_cell_line, + differentiated_cell_lines_df, + library_preparations_df, + sequencing_file_df, + submission_envelope_id, + access_token) + + for library_preparation in differentiated_cell_line.library_preparations: + library_preparation_entity_id = self.handle_library_preparation(differentiated_entity_id, + library_preparation, + library_preparations_df, + submission_envelope_id, + access_token) + + for sequencing_file in library_preparation.sequencing_files: + self.handle_sequencing_file(library_preparation, + library_preparation_entity_id, + sequencing_file, + sequencing_file_df, + submission_envelope_id, + access_token) + + message = 'SUCCESS' + except Exception as e: + message = f"An error occurred: {str(e)}" + traceback.print_exc() + # Set DataFrames to None in case of an error + cell_lines_df = None + differentiated_cell_lines_df = None + library_preparations_df = None + sequencing_file_df = None + + return (cell_lines_df, + differentiated_cell_lines_df, + library_preparations_df, + sequencing_file_df, + message) def typed_submission(self, type, file, access_token): """ @@ -387,7 +431,8 @@ def typed_submission(self, type, file, access_token): study_id = self.args.study self.link_dataset_to_study(entity_id, study_id, access_token) else: - link_to_study = input("Do you want to link this dataset to a study? (yes/no): ").lower() + link_to_study = input("Do you want to link this dataset to a study? 
" + "(yes/no): ").lower() if link_to_study == 'yes': study_id = input("Input study id: ").lower() self.link_dataset_to_study(entity_id, study_id, access_token) @@ -397,7 +442,8 @@ def typed_submission(self, type, file, access_token): dataset_id = self.args.dataset self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) else: - link_to_dataset = input("Do you want to link this biomaterial to a dataset? (yes/no): ").lower() + link_to_dataset = input("Do you want to link this biomaterial to a " + "dataset? (yes/no): ").lower() if link_to_dataset == 'yes': dataset_id = input("Input dataset id: ").lower() self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) @@ -417,7 +463,8 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ Creates and submits a new entity (study, dataset, biomaterial, or process) and returns its ID. Parameters: - input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). + input_entity_type (str): The type of entity to create ('study', 'dataset', + 'biomaterial', 'process'). data (dict): The data to be submitted. access_token (str): Access token for authorization. @@ -433,8 +480,10 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ elif input_entity_type == 'process': halEntity = 'processes' - entity_create_url_from_sub_env_hal_links = self.post_to_provider_api(self.submission_envelope_create_url, - halEntity, None, access_token) + entity_create_url_from_sub_env_hal_links = (self. 
+ post_to_provider_api(self.submission_envelope_create_url, + halEntity, None, + access_token)) entity_self_hal_link = self.post_to_provider_api(entity_create_url_from_sub_env_hal_links, 'self', data, access_token) entity_id = get_id_from_url(entity_self_hal_link) @@ -443,12 +492,14 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ return entity_id - def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submission_envelope_id, access_token): + def use_existing_envelope_and_submit_entity(self, input_entity_type, data, + submission_envelope_id, access_token): """ Submits an entity using an existing submission envelope and returns its ID. Parameters: - input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). + input_entity_type (str): The type of entity to create ('study', + 'dataset', 'biomaterial', 'process'). data (dict): The data to be submitted. submission_envelope_id (str): ID of the submission envelope. access_token (str): Access token for authorization. @@ -489,7 +540,8 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): """ print("Linking dataset " + dataset_id + " to study " + study_id) - self.put_to_provider_api(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", access_token) + self.put_to_provider_api(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", + access_token) print("Dataset linked successfully to study: " + study_id) @@ -546,13 +598,39 @@ def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): return url + def delete_submission(self, submission_envelope_id, access_token, force_delete=False): + """ + Sends a DELETE request to delete a submission envelope. + + Parameters: + - submission_envelope_id (str): ID of the submission envelope to delete. + - access_token (str): Access token for authorization. + - force_delete (bool): Whether to force delete the submission envelope (default: False). 
+ + Returns: + - str: The URL from the response. + """ + url = f"{self.submission_envelope_base_url}/{submission_envelope_id}" + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + params = { + 'force': 'true' if force_delete else 'false' + } + + response = requests.delete(url, headers=headers, params=params) + response_data = response.json() + + return response_data + def post_to_provider_api_and_get_entity_id(self, url, data, access_token): """ Sends a POST request to the specified URL. Parameters: url (str): The URL to send the request to. - data_type_in_hal_link (str): The data type in the HAL link. data (dict): The data to be sent in the POST request. access_token (str): Access token for authorization. @@ -598,7 +676,6 @@ def perform_hal_linkage(self, url, input_id, link_to, access_token): Parameters: url (str): The URL to send the request to. input_id (str): The ID of the input entity. - link_this (str): The entity to link. link_to (str): The entity to link to. access_token (str): Access token for authorization. 
diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 80c85e7..12cf890 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -1,66 +1,30 @@ # Import necessary modules/classes from ait.commons.util package + import pandas as pd +from ait.commons.util.aws_client import Aws +from ait.commons.util.command.list import CmdList from ait.commons.util.command.submit import CmdSubmit, get_id_from_url from ait.commons.util.user_profile import get_profile from ait.commons.util.util.spreadsheet_util import SpreadsheetSubmitter # Define a class for handling submission of a command file -class CmdSubmitFile: - # Column mappings for parsing different sections of the spreadsheet - cellline_column_mapping = { - "CELL LINE ID (Required)": "cell_line.biomaterial_core.biomaterial_id", - "CELL LINE DESCRIPTION": "cell_line.biomaterial_core.biomaterial_description", - "DERIVED FROM CELL LINE NAME (Required)": "cell_line.derived_cell_line_accession", - "CLONE ID": "cell_line.clone_id", - "GENE EXPRESSION ALTERATION PROTOCOL ID": "gene_expression_alteration_protocol.protocol_core.protocol_id", - "ZYGOSITY": "cell_line.zygosity", - "CELL LINE TYPE (Required)": "cell_line.type", - "Unnamed: 7": None, - "Unnamed: 8": None - } - - differentiated_cellline_column_mapping = { - "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", - "DIFFERENTIATED CELL LINE DESCRIPTION": "differentiated_cell_line.biomaterial_core.biomaterial_description", - "INPUT CELL LINE ID (Required)": "cell_line.biomaterial_core.biomaterial_id", - "DIFFERENTIATION PROTOCOL ID (Required)": "differentiation_protocol.protocol_core.protocol_id", - "TIMEPOINT VALUE": "differentiated_cell_line.timepoint_value", - "TIMEPOINT UNIT": "differentiated_cell_line.timepoint_unit.text", - "TERMINALLY DIFFERENTIATED": "differentiated_cell_line.terminally_differentiated", - "FINAL LINEAGE STAGE": 
"differentiated_cell_line.terminally_differentiated", - "Model System": "cell_line.model_organ.text", - "MODEL SYSTEM": "cell_line.model_organ.text", - "Unnamed: 8": None - } - - library_preparation_column_mapping = { - "LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", - "LIBRARY PREPARATION PROTOCOL ID (Required)": "library_preparation_protocol.protocol_core.protocol_id", - "DISSOCIATION PROTOCOL ID (Required)": "dissociation_protocol.protocol_core.protocol_id", - "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", - "LIBRARY AVERAGE FRAGMENT SIZE": "library_preparation.average_fragment_size", - "LIBRARY INPUT AMOUNT VALUE": "library_preparation.input_amount_value", - "LIBRARY INPUT AMOUNT UNIT": "library_preparation.input_amount_unit", - "LIBRARY FINAL YIELD VALUE": "library_preparation.final_yield_value", - "LIBRARY FINAL YIELD UNIT": "library_preparation.final_yield_unit", - "LIBRARY CONCENTRATION VALUE": "library_preparation.concentration_value", - "LIBRARY CONCENTRATION UNIT": "library_preparation.concentration_unit", - "LIBRARY PCR CYCLES": "library_preparation.pcr_cycles", - "LIBRARY PCR CYCLES FOR SAMPLE INDEX": "library_preparation.pcr_cycles_for_sample_index", - "Unnamed: 14": None # Adjust index based on your actual column count - } - - sequencing_file_column_mapping = { - "FILE NAME (Required)": "sequence_file.file_core.file_name", - "INPUT LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", - "SEQUENCING PROTOCOL ID (Required)": "sequencing_protocol.protocol_core.protocol_id", - "READ INDEX (Required)": "sequence_file.read_index", - "RUN ID": "sequence_file.run_id", - "Unnamed: 5": None # Adjust index based on your actual column count - } +def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, dataset): + for sequencing_file in sequencing_files: + match_found = False # Flag to indicate if a match is 
found + + for file_key in list_of_files_in_upload_area: + if sequencing_file.file_name == file_key: + match_found = True + break # Exit the inner loop if a match is found + + if not match_found: + raise Exception(f"No matching file found for sequencing file: {sequencing_file.file_name} in the " + f"upload area for the dataset: {dataset}") + +class CmdSubmitFile: base_url = 'http://localhost:8080' submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" submission_envelope_base_url = f"{base_url}/submissionEnvelopes" @@ -74,38 +38,58 @@ def __init__(self, args): """ self.args = args self.access_token = get_profile('morphic-util').access_token + self.user_profile = get_profile('morphic-util') + self.aws = Aws(self.user_profile) if hasattr(self.args, 'file') and self.args.file is not None: self.file = self.args.file else: - self.file = None + raise Exception("File is mandatory") + + if hasattr(self.args, 'action') and self.args.action is not None: + self.action = self.args.action + else: + raise Exception("Submission action (ADD, MODIFY or DELETE) is mandatory") + + if hasattr(self.args, 'dataset') and self.args.dataset is not None: + self.dataset = self.args.dataset + else: + raise Exception("Dataset is mandatory to be registered before submitting dataset metadata, " + "We request you to submit your study using the submit option, register your" + "dataset using the same option and link your dataset to your study" + "before proceeding with this submission.") def run(self): """ Execute the command file submission process. """ submission_instance = CmdSubmit(self) + list_instance = CmdList(self.aws, self.args) + + list_of_files_in_upload_area = (list_instance. 
+ list_bucket_contents_and_return(self.dataset, '')) + + print("Files in the upload area:") + + for file_key in list_of_files_in_upload_area: + print(file_key) if self.file: # Initialize SpreadsheetParser with the provided file path parser = SpreadsheetSubmitter(self.file) - # print(parser.list_sheets()) - # Parse different sections of the spreadsheet using defined column mappings - cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', - self.cellline_column_mapping) - + cell_lines, cell_lines_df = parser.get_cell_lines('Cell line') differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', - self.differentiated_cellline_column_mapping) + 'Differentiated cell line') parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) - library_preparations, library_preparations_df = parser.get_library_preparations('Library preparation', - self.library_preparation_column_mapping) + library_preparations, library_preparations_df = parser.get_library_preparations('Library preparation') parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) - sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file', - self.sequencing_file_column_mapping) + sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file') + + validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) + parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) # Print each CellLine object in CellLineMaster @@ -118,20 +102,35 @@ def run(self): print("Submission envelope for this submission is: " + submission_envelope_id) # Perform the submission and get the updated dataframes - (updated_cell_lines_df, updated_differentiated_cell_lines_df, - updated_library_preparations_df, updated_sequencing_files_df) = submission_instance.multi_type_submission( - 
cell_lines, - cell_lines_df, - differentiated_cell_lines_df, - library_preparations_df, - sequencing_files_df, - submission_envelope_id, - self.access_token - ) - - # Save both dataframes to a single Excel file with multiple sheets - with pd.ExcelWriter("updated_cell_lines.xlsx") as writer: - updated_cell_lines_df.to_excel(writer, sheet_name='CellLines', index=False) - updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='DifferentiatedCellLines', index=False) - updated_library_preparations_df.to_excel(writer, sheet_name='Library Preparations', index=False) - updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence files', index=False) + try: + (updated_cell_lines_df, updated_differentiated_cell_lines_df, + updated_library_preparations_df, + updated_sequencing_files_df, message) = submission_instance.multi_type_submission( + cell_lines, + cell_lines_df, + differentiated_cell_lines_df, + library_preparations_df, + sequencing_files_df, + submission_envelope_id, + self.access_token + ) + + # Save the updated dataframes to a single Excel file with multiple sheets + if message == 'SUCCESS': + output_file = "updated_cell_lines.xlsx" + with pd.ExcelWriter(output_file, engine='openpyxl') as writer: + updated_cell_lines_df.to_excel(writer, sheet_name='Cell line', index=False) + updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='Differentiated cell line', + index=False) + updated_library_preparations_df.to_excel(writer, sheet_name='Library preparation', index=False) + updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence file', index=False) + + return True, message + else: + print("Submission has failed") + # submission_instance.delete_submission(submission_envelope_id, self.access_token, True) + return False, "Submission has failed, rolled back" + except Exception as e: + print("Submission has failed") + # submission_instance.delete_submission(submission_envelope_id, self.access_token, True) + return False, f"An error 
occurred: {str(e)}" diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py index 3e61f3f..7c360e5 100644 --- a/ait/commons/util/util/spreadsheet_util.py +++ b/ait/commons/util/util/spreadsheet_util.py @@ -248,7 +248,7 @@ def list_sheets(self): xls = pd.ExcelFile(self.file_path, engine='openpyxl') return xls.sheet_names - def parse_cell_lines(self, sheet_name, column_mapping): + def parse_cell_lines(self, sheet_name): """ Parses data related to cell lines from a specified sheet in the Excel file. @@ -266,9 +266,11 @@ def parse_cell_lines(self, sheet_name, column_mapping): - list of CellLine objects parsed from the specified sheet. - pd.DataFrame with the parsed data. """ - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + + # Remove unnamed columns (columns without headers) + df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: @@ -277,16 +279,6 @@ def parse_cell_lines(self, sheet_name, column_mapping): # Filter rows where biomaterial_id is not null df = df[df['cell_line.biomaterial_core.biomaterial_id'].notna()] - # Filter column_mapping to include only keys that exist in df.columns - columns_to_select = [col_mapping_val for col_mapping_key, col_mapping_val in column_mapping.items() if - col_mapping_val in df.columns] - - if not columns_to_select: - raise ValueError("No valid columns found in the column_mapping that exist in the DataFrame.") - - # Select only columns that are present in df - df = df[columns_to_select] - # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['cell_line.biomaterial_core.biomaterial_id'] @@ -329,7 +321,7 @@ def parse_cell_lines(self, sheet_name, column_mapping): 
return cell_lines, df_filtered - def parse_differentiated_cell_lines(self, sheet_name, column_mapping): + def parse_differentiated_cell_lines(self, sheet_name): """ Parses data related to differentiated cell lines from a specified sheet in the Excel file. @@ -345,9 +337,11 @@ def parse_differentiated_cell_lines(self, sheet_name, column_mapping): list A list of DifferentiatedCellLine objects parsed from the specified sheet. """ - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + + # Remove unnamed columns (columns without headers) + df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: @@ -356,16 +350,6 @@ def parse_differentiated_cell_lines(self, sheet_name, column_mapping): # Filter rows where biomaterial_id is not null df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] - # Filter column_mapping to include only keys that exist in df.columns - columns_to_select = [col_mapping_val for col_mapping_key, col_mapping_val in column_mapping.items() if - col_mapping_val in df.columns] - - if not columns_to_select: - raise ValueError("No valid columns found in the column_mapping that exist in the DataFrame.") - - # Select only columns that are present in df - df = df[columns_to_select] - # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] @@ -394,7 +378,7 @@ def parse_differentiated_cell_lines(self, sheet_name, column_mapping): return differentiated_cell_lines, df_filtered - def parse_library_preparations(self, sheet_name, column_mapping): + def parse_library_preparations(self, sheet_name): """ Parses data related to library preparations from a 
specified sheet in the Excel file. @@ -410,9 +394,11 @@ def parse_library_preparations(self, sheet_name, column_mapping): list A list of LibraryPreparation objects parsed from the specified sheet. """ - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + + # Remove unnamed columns (columns without headers) + df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists if 'library_preparation.biomaterial_core.biomaterial_id' not in df.columns: @@ -424,16 +410,6 @@ def parse_library_preparations(self, sheet_name, column_mapping): # TODO: for all df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) - # Filter column_mapping to include only keys that exist in df.columns - columns_to_select = [col_mapping_val for col_mapping_key, col_mapping_val in column_mapping.items() if - col_mapping_val in df.columns] - - if not columns_to_select: - raise ValueError("No valid columns found in the column_mapping that exist in the DataFrame.") - - # Select only columns that are present in df - df = df[columns_to_select] - # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['library_preparation.biomaterial_core.biomaterial_id'] @@ -467,7 +443,7 @@ def parse_library_preparations(self, sheet_name, column_mapping): return library_preparations, df_filtered - def parse_sequencing_files(self, sheet_name, column_mapping): + def parse_sequencing_files(self, sheet_name): """ Parses data related to sequencing files from a specified sheet in the Excel file. @@ -483,9 +459,11 @@ def parse_sequencing_files(self, sheet_name, column_mapping): list A list of SequencingFile objects parsed from the specified sheet. 
""" - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl') + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + + # Remove unnamed columns (columns without headers) + df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists if 'sequence_file.file_core.file_name' not in df.columns: @@ -494,9 +472,6 @@ def parse_sequencing_files(self, sheet_name, column_mapping): # Filter rows where file_name is not null df = df[df['sequence_file.file_core.file_name'].notna()] - # Select only columns that are not None in the column_mapping - df = df[[col for col in column_mapping.values() if col is not None]] - # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['sequence_file.file_core.file_name'] @@ -523,7 +498,7 @@ def parse_sequencing_files(self, sheet_name, column_mapping): return sequencing_files, df_filtered - def get_cell_lines(self, sheet_name, column_mapping): + def get_cell_lines(self, sheet_name): """ Retrieves parsed cell lines data from a specified sheet in the Excel file. @@ -539,10 +514,10 @@ def get_cell_lines(self, sheet_name, column_mapping): list A list of CellLine objects parsed from the specified sheet. """ - cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, column_mapping) + cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name) return cell_lines, cell_lines_df - def get_differentiated_cell_lines(self, sheet_name, column_mapping): + def get_differentiated_cell_lines(self, sheet_name): """ Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. @@ -558,8 +533,7 @@ def get_differentiated_cell_lines(self, sheet_name, column_mapping): list A list of DifferentiatedCellLine objects parsed from the specified sheet. 
""" - differentiated_cell_lines, differentiated_cell_lines_df = self.parse_differentiated_cell_lines(sheet_name, - column_mapping) + differentiated_cell_lines, differentiated_cell_lines_df = self.parse_differentiated_cell_lines(sheet_name) return differentiated_cell_lines, differentiated_cell_lines_df def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiated_cell_lines): @@ -583,6 +557,7 @@ def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiate If a differentiated cell line does not have a corresponding cell line. """ cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} + for differentiated_cell_line in differentiated_cell_lines: if differentiated_cell_line.input_biomaterial_id not in cell_line_ids: raise MissingEntityError("Cell Line", @@ -615,6 +590,7 @@ def merge_differentiated_cell_line_and_library_preparation(self, differentiated_ If a library preparation does not have a corresponding differentiated cell line. """ differentiated_ids = {diff_cell.biomaterial_id for diff_cell in differentiated_cell_lines} + for library_preparation in library_preparations: if library_preparation.differentiated_biomaterial_id not in differentiated_ids: raise MissingEntityError("Differentiated Cell Line", @@ -647,6 +623,7 @@ def merge_library_preparation_sequencing_file(self, library_preparations, sequen If a sequencing file does not have a corresponding library preparation. 
""" library_ids = {lib_prep.biomaterial_id for lib_prep in library_preparations} + for sequencing_file in sequencing_files: if sequencing_file.library_preparation_id not in library_ids: raise MissingEntityError("Library preparation", @@ -658,7 +635,7 @@ def merge_library_preparation_sequencing_file(self, library_preparations, sequen if sequencing_file.library_preparation_id == library_preparation.biomaterial_id: library_preparation.add_sequencing_file(sequencing_file) - def get_library_preparations(self, sheet_name, column_mapping): + def get_library_preparations(self, sheet_name): """ Retrieves parsed library preparations data from a specified sheet in the Excel file. @@ -674,10 +651,10 @@ def get_library_preparations(self, sheet_name, column_mapping): list A list of LibraryPreparation objects parsed from the specified sheet. """ - library_preparations, df_filtered = self.parse_library_preparations(sheet_name, column_mapping) + library_preparations, df_filtered = self.parse_library_preparations(sheet_name) return library_preparations, df_filtered - def get_sequencing_files(self, sheet_name, column_mapping): + def get_sequencing_files(self, sheet_name): """ Retrieves parsed sequencing files data from a specified sheet in the Excel file. @@ -693,5 +670,5 @@ def get_sequencing_files(self, sheet_name, column_mapping): list A list of SequencingFile objects parsed from the specified sheet. 
""" - sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, column_mapping) + sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name) return sequencing_files, df_filtered From 7049ee627b53644f2073dc9cd80a1a6c0d613807 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 25 Jun 2024 15:34:15 +0100 Subject: [PATCH 18/55] update action --- ait/commons/util/command/submit.py | 407 +++++++++++++--------- ait/commons/util/command/submit_file.py | 54 +-- ait/commons/util/util/spreadsheet_util.py | 79 +++-- 3 files changed, 320 insertions(+), 220 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 1610280..6ab7946 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -86,7 +86,7 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, access_token): + def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, access_token, action): """ Submits a cell line as a biomaterial entity to a specified submission envelope. @@ -99,31 +99,37 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, acc Returns: - cell_line_entity_id: Entity ID of the submitted cell line biomaterial. 
""" - cell_line_entity_id_column_name = "Identifier" + if action == 'modify' or action == 'MODIFY': + success = self.patchEntity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) - if cell_line_entity_id_column_name not in cell_lines_df.columns: - cell_lines_df[cell_line_entity_id_column_name] = np.nan + if success: + print("Updated cell line: " + cell_line.id + " / " + cell_line.biomaterial_id) + else: + cell_line_entity_id_column_name = "Identifier" + + if cell_line_entity_id_column_name not in cell_lines_df.columns: + cell_lines_df[cell_line_entity_id_column_name] = np.nan - print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") + print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") - cell_line_entity_id = self.use_existing_envelope_and_submit_entity( - 'biomaterial', - cell_line.to_dict(), - submission_envelope_id, - access_token - ) + cell_line_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + cell_line.to_dict(), + submission_envelope_id, + access_token + ) - cell_lines_df.loc[ - cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == - cell_line.biomaterial_id, - cell_line_entity_id_column_name - ] = cell_line_entity_id + cell_lines_df.loc[ + cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == + cell_line.biomaterial_id, + cell_line_entity_id_column_name + ] = cell_line_entity_id - return cell_line_entity_id + return cell_line_entity_id def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differentiated_cell_line, - differentiated_cell_lines_df, library_preparations_df, - sequencing_file_df, submission_envelope_id, access_token): + differentiated_cell_lines_df, submission_envelope_id, access_token + , action): """ Handles a single differentiated cell line associated with a given cell line. 
@@ -137,74 +143,82 @@ def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differ - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. """ - print("Cell line has differentiated cell lines, creating differentiation process to link them") + if action == 'modify' or action == 'MODIFY': + success = self.patchEntity('biomaterial', differentiated_cell_line.id, differentiated_cell_line.to_dict(), + access_token) - differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', get_process_content('differentiation'), - submission_envelope_id, access_token - ) + if success: + print("Updated differentiated cell line: " + differentiated_cell_line.id + " / " + + differentiated_cell_line.biomaterial_id) + else: + print("Cell line has differentiated cell lines, creating differentiation process to link them") - differentiated_biomaterial_to_entity_id_map = {} - differentiated_cell_line_entity_id_column_name = "Id" + differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', get_process_content('differentiation'), + submission_envelope_id, access_token + ) - if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: - differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan + differentiated_biomaterial_to_entity_id_map = {} + differentiated_cell_line_entity_id_column_name = "Id" - print( - f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " - f"as a child of Cell line: {cell_line_entity_id}") + if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: + differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan - differentiated_entity_id = self.create_child_biomaterial( - cell_line_entity_id, - 
differentiated_cell_line.to_dict(), - access_token - ) + print( + f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " + f"as a child of Cell line: {cell_line_entity_id}") - print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") + differentiated_entity_id = self.create_child_biomaterial( + cell_line_entity_id, + differentiated_cell_line.to_dict(), + access_token + ) - print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"to envelope: {submission_envelope_id}") + print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") - self.link_entity_to_envelope( - 'biomaterial', - differentiated_entity_id, - submission_envelope_id, - access_token - ) + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to envelope: {submission_envelope_id}") - print( - f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " - f"input to process : {differentiation_process_entity_id}") + self.link_entity_to_envelope( + 'biomaterial', + differentiated_entity_id, + submission_envelope_id, + access_token + ) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + print( + f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " + f"input to process : {differentiation_process_entity_id}") - print( - f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + print( + f"Linking Differentiated 
cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") - differentiated_biomaterial_to_entity_id_map[ - differentiated_cell_line.biomaterial_id] = differentiated_entity_id + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) - differentiated_cell_lines_df.loc[ - differentiated_cell_lines_df[ - 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == - differentiated_cell_line.biomaterial_id, - differentiated_cell_line_entity_id_column_name - ] = differentiated_entity_id + differentiated_biomaterial_to_entity_id_map[ + differentiated_cell_line.biomaterial_id] = differentiated_entity_id - return differentiated_entity_id + differentiated_cell_lines_df.loc[ + differentiated_cell_lines_df[ + 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == + differentiated_cell_line.biomaterial_id, + differentiated_cell_line_entity_id_column_name + ] = differentiated_entity_id + + return differentiated_entity_id def handle_library_preparation(self, differentiated_entity_id, library_preparation, - library_preparations_df, submission_envelope_id, access_token): + library_preparations_df, submission_envelope_id, access_token, action): """ Handles a single library preparation associated with a given differentiated cell line. @@ -217,62 +231,71 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. 
""" - print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " - f"{differentiated_entity_id}") - - library_preparation_entity_id = self.create_child_biomaterial( - differentiated_entity_id, - library_preparation.to_dict(), - access_token - ) - - print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") - - print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to envelope: {submission_envelope_id}") - - self.link_entity_to_envelope( - 'biomaterial', - library_preparation_entity_id, - submission_envelope_id, - access_token - ) - - print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"as input to library preparation process") - - library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', get_process_content('library_preparation'), - submission_envelope_id, access_token - ) + if action == 'modify' or action == 'MODIFY': + success = self.patchEntity('biomaterial', library_preparation.id, + library_preparation.to_dict(), + access_token) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"as derived by library preparation process") - - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - library_preparations_df.loc[ - library_preparations_df[ - 'library_preparation.biomaterial_core.biomaterial_id'] == - library_preparation.biomaterial_id, - 'Id' - ] = library_preparation_entity_id - - return library_preparation_entity_id + if success: + print("Updated library preparation biomaterial: " + library_preparation.id + " / " + + library_preparation.biomaterial_id) + else: 
+ print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " + f"{differentiated_entity_id}") + + library_preparation_entity_id = self.create_child_biomaterial( + differentiated_entity_id, + library_preparation.to_dict(), + access_token + ) + + print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"to envelope: {submission_envelope_id}") + + self.link_entity_to_envelope( + 'biomaterial', + library_preparation_entity_id, + submission_envelope_id, + access_token + ) + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"as input to library preparation process") + + library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', get_process_content('library_preparation'), + submission_envelope_id, access_token + ) + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"as derived by library preparation process") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + library_preparations_df.loc[ + library_preparations_df[ + 'library_preparation.biomaterial_core.biomaterial_id'] == + library_preparation.biomaterial_id, + 'Id' + ] = library_preparation_entity_id + + return library_preparation_entity_id def handle_sequencing_file(self, library_preparation, library_preparation_entity_id, sequencing_file, - sequencing_file_df, submission_envelope_id, access_token): + sequencing_file_df, submission_envelope_id, access_token, action): """ Handles a single sequencing file associated with a given library preparation. 
@@ -284,62 +307,72 @@ def handle_sequencing_file(self, library_preparation, library_preparation_entity - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. """ - print("Creating sequencing process to link the sequencing file") + if action == 'modify' or action == 'MODIFY': + success = self.patchEntity('file', sequencing_file.id, + sequencing_file.to_dict(), + access_token) - sequencing_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', get_process_content('sequencing'), - submission_envelope_id, - access_token - ) + if success: + print("Updated sequencing file: " + sequencing_file.id + " / " + + sequencing_file.file_name) + else: + print("Creating sequencing process to link the sequencing file") + + sequencing_process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', get_process_content('sequencing'), + submission_envelope_id, + access_token + ) - sequencing_file_entity_id_column_name = "Id" + sequencing_file_entity_id_column_name = "Id" - if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: - sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan + if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: + sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan - print( - f"Creating Sequencing file: {sequencing_file.file_name} " - f"as a result of sequencing the Library preparation biomaterial: {library_preparation_entity_id}") + print( + f"Creating Sequencing file: {sequencing_file.file_name} " + f"as a result of sequencing the Library preparation biomaterial: {library_preparation_entity_id}") - sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( - 'file', - sequencing_file.to_dict(), - submission_envelope_id, - access_token - ) + sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( + 'file', + 
sequencing_file.to_dict(), + submission_envelope_id, + access_token + ) - print(f"Created Sequencing file: {sequencing_file_entity_id}") + print(f"Created Sequencing file: {sequencing_file_entity_id}") - print( - f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " - f"as input to process: {sequencing_process_entity_id}") + print( + f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " + f"as input to process: {sequencing_process_entity_id}") - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", - sequencing_process_entity_id, 'processes', access_token - ) + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", + sequencing_process_entity_id, 'processes', access_token + ) - print( - f"Linking Sequencing file: {sequencing_file_entity_id} " - f"as derived by process: {sequencing_process_entity_id}") + print( + f"Linking Sequencing file: {sequencing_file_entity_id} " + f"as derived by process: {sequencing_process_entity_id}") - self.perform_hal_linkage( - f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", - sequencing_process_entity_id, 'processes', access_token - ) + self.perform_hal_linkage( + f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", + sequencing_process_entity_id, 'processes', access_token + ) - sequencing_file_df.loc[ - sequencing_file_df[ - 'sequence_file.file_core.file_name'] == sequencing_file.file_name, - sequencing_file_entity_id_column_name - ] = sequencing_file_entity_id + sequencing_file_df.loc[ + sequencing_file_df[ + 'sequence_file.file_core.file_name'] == sequencing_file.file_name, + sequencing_file_entity_id_column_name + ] = sequencing_file_entity_id def multi_type_submission(self, cell_lines, cell_lines_df, differentiated_cell_lines_df, library_preparations_df, sequencing_file_df, submission_envelope_id, - access_token): + access_token, + 
action): """ Handles the submission of multiple types of biomaterials (cell lines, differentiated cell lines, library preparations) @@ -363,23 +396,24 @@ def multi_type_submission(self, cell_lines, cell_lines_df, cell_line_entity_id = self.handle_cell_line(cell_line, cell_lines_df, submission_envelope_id, - access_token) + access_token, + action) for differentiated_cell_line in cell_line.differentiated_cell_lines: differentiated_entity_id = self.handle_differentiated_cell_line(cell_line, cell_line_entity_id, differentiated_cell_line, differentiated_cell_lines_df, - library_preparations_df, - sequencing_file_df, submission_envelope_id, - access_token) + access_token, + action) for library_preparation in differentiated_cell_line.library_preparations: library_preparation_entity_id = self.handle_library_preparation(differentiated_entity_id, library_preparation, library_preparations_df, submission_envelope_id, - access_token) + access_token, + action) for sequencing_file in library_preparation.sequencing_files: self.handle_sequencing_file(library_preparation, @@ -387,7 +421,8 @@ def multi_type_submission(self, cell_lines, cell_lines_df, sequencing_file, sequencing_file_df, submission_envelope_id, - access_token) + access_token, + action) message = 'SUCCESS' except Exception as e: @@ -417,7 +452,7 @@ def typed_submission(self, type, file, access_token): Returns: tuple: A tuple containing a boolean indicating success and the ID of the created entity. 
""" - if type in ['study', 'dataset', 'biomaterial', 'process']: + if type in ['study', 'dataset', 'biomaterial', 'process', 'file']: if file is not None: data = self.transform(file) else: @@ -492,6 +527,34 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ return entity_id + def patchEntity(self, input_entity_type, id, data, access_token): + if input_entity_type == 'study': + halEntity = 'studies' + elif input_entity_type == 'dataset': + halEntity = 'datasets' + elif input_entity_type == 'biomaterial': + halEntity = 'biomaterials' + elif input_entity_type == 'process': + halEntity = 'processes' + elif input_entity_type == 'file': + halEntity = 'files' + + entity_patch_url = self.base_url + '/' + halEntity + '/' + id + + return self.patch_to_provider_api(entity_patch_url, data, access_token) + + def patch_to_provider_api(self, entity_patch_url, data, access_token): + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.patch(entity_patch_url, headers=headers, json=data) + + if response.status_code // 100 == 2: + return True + return False + def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submission_envelope_id, access_token): """ @@ -621,9 +684,11 @@ def delete_submission(self, submission_envelope_id, access_token, force_delete=F } response = requests.delete(url, headers=headers, params=params) - response_data = response.json() - return response_data + # Check if the status code indicates success (2xx) + if response.status_code // 100 == 2: + return True + return False def post_to_provider_api_and_get_entity_id(self, url, data, access_token): """ diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 12cf890..68594e0 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -69,37 +69,36 @@ def run(self): list_of_files_in_upload_area = 
(list_instance. list_bucket_contents_and_return(self.dataset, '')) - print("Files in the upload area:") - - for file_key in list_of_files_in_upload_area: - print(file_key) - if self.file: # Initialize SpreadsheetParser with the provided file path parser = SpreadsheetSubmitter(self.file) # Parse different sections of the spreadsheet using defined column mappings - cell_lines, cell_lines_df = parser.get_cell_lines('Cell line') + cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action) differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line') + 'Differentiated cell line', self.action) parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) - library_preparations, library_preparations_df = parser.get_library_preparations('Library preparation') + library_preparations, library_preparations_df = (parser + .get_library_preparations('Library preparation', + self.action)) parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) - sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file') + sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file', self.action) validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) - # Print each CellLine object in CellLineMaster - submission_envelope_response = submission_instance.create_new_submission_envelope( - self.submission_envelope_create_url, - access_token=self.access_token) - self_url = submission_envelope_response['_links']['self']['href'] - submission_envelope_id = get_id_from_url(self_url) + if self.action != 'modify' and self.action != 'MODIFY': + submission_envelope_response = submission_instance.create_new_submission_envelope( + self.submission_envelope_create_url, + 
access_token=self.access_token) + self_url = submission_envelope_response['_links']['self']['href'] + submission_envelope_id = get_id_from_url(self_url) - print("Submission envelope for this submission is: " + submission_envelope_id) + print("Submission envelope for this submission is: " + submission_envelope_id) + else: + submission_envelope_id = None # Perform the submission and get the updated dataframes try: @@ -112,25 +111,30 @@ def run(self): library_preparations_df, sequencing_files_df, submission_envelope_id, - self.access_token + self.access_token, + self.action ) # Save the updated dataframes to a single Excel file with multiple sheets if message == 'SUCCESS': output_file = "updated_cell_lines.xlsx" with pd.ExcelWriter(output_file, engine='openpyxl') as writer: - updated_cell_lines_df.to_excel(writer, sheet_name='Cell line', index=False) - updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='Differentiated cell line', + updated_cell_lines_df.to_excel(writer, + sheet_name='Cell line', index=False) + updated_differentiated_cell_lines_df.to_excel(writer, + sheet_name='Differentiated cell line', index=False) - updated_library_preparations_df.to_excel(writer, sheet_name='Library preparation', index=False) - updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence file', index=False) + updated_library_preparations_df.to_excel(writer, + sheet_name='Library preparation', index=False) + updated_sequencing_files_df.to_excel(writer, + sheet_name='Sequence file', index=False) return True, message else: - print("Submission has failed") - # submission_instance.delete_submission(submission_envelope_id, self.access_token, True) + print("Submission has failed, rolling back") + submission_instance.delete_submission(submission_envelope_id, self.access_token, True) return False, "Submission has failed, rolled back" except Exception as e: - print("Submission has failed") - # submission_instance.delete_submission(submission_envelope_id, self.access_token, 
True) + print("Submission has failed, rolling back") + submission_instance.delete_submission(submission_envelope_id, self.access_token, True) return False, f"An error occurred: {str(e)}" diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py index 7c360e5..0f74705 100644 --- a/ait/commons/util/util/spreadsheet_util.py +++ b/ait/commons/util/util/spreadsheet_util.py @@ -20,7 +20,8 @@ def __init__(self, missing_type, entity_type, missing_id): class CellLine: - def __init__(self, biomaterial_id, description, derived_accession, clone_id, protocol_id, zygosity, cell_type): + def __init__(self, biomaterial_id, description, derived_accession, + clone_id, protocol_id, zygosity, cell_type, id): self.biomaterial_id = biomaterial_id self.description = description self.derived_accession = derived_accession @@ -29,6 +30,7 @@ def __init__(self, biomaterial_id, description, derived_accession, clone_id, pro self.zygosity = zygosity self.cell_type = cell_type self.differentiated_cell_lines = [] + self.id = id def add_differentiated_cell_line(self, differentiated_cell_line): self.differentiated_cell_lines.append(differentiated_cell_line) @@ -52,7 +54,7 @@ def to_dict(self): class DifferentiatedCellLine: def __init__(self, biomaterial_id, description, input_biomaterial_id, protocol_id, timepoint_value, timepoint_unit, - terminally_differentiated, model_system): + terminally_differentiated, model_system, id): self.biomaterial_id = biomaterial_id self.description = description self.input_biomaterial_id = input_biomaterial_id @@ -62,6 +64,7 @@ def __init__(self, biomaterial_id, description, input_biomaterial_id, protocol_i self.terminally_differentiated = terminally_differentiated self.model_system = model_system self.library_preparations = [] + self.id = id def add_library_preparation(self, library_preparation): self.library_preparations.append(library_preparation) @@ -88,7 +91,7 @@ class LibraryPreparation: def __init__(self, biomaterial_id, 
protocol_id, dissociation_protocol_id, differentiated_biomaterial_id, average_fragment_size, input_amount_value, input_amount_unit, final_yield_value, final_yield_unit, concentration_value, concentration_unit, - pcr_cycles, pcr_cycles_for_sample_index): + pcr_cycles, pcr_cycles_for_sample_index, id): self.biomaterial_id = biomaterial_id self.protocol_id = protocol_id self.dissociation_protocol_id = dissociation_protocol_id @@ -103,6 +106,7 @@ def __init__(self, biomaterial_id, protocol_id, dissociation_protocol_id, differ self.pcr_cycles = pcr_cycles self.pcr_cycles_for_sample_index = pcr_cycles_for_sample_index self.sequencing_files = [] + self.id = id def add_sequencing_file(self, sequencing_file): self.sequencing_files.append(sequencing_file) @@ -142,13 +146,14 @@ class EntityType: class SequencingFile: - def __init__(self, file_name, library_preparation_id, sequencing_protocol_id, read_index, run_id): + def __init__(self, file_name, library_preparation_id, sequencing_protocol_id, read_index, run_id, id): self.file_name = file_name self.library_preparation_id = library_preparation_id self.sequencing_protocol_id = sequencing_protocol_id self.read_index = read_index self.run_id = run_id self.entity_type = EntityType.FILE + self.id = id self.content = { "file_name": self.file_name, "library_preparation_id": self.library_preparation_id, @@ -248,7 +253,7 @@ def list_sheets(self): xls = pd.ExcelFile(self.file_path, engine='openpyxl') return xls.sheet_names - def parse_cell_lines(self, sheet_name): + def parse_cell_lines(self, sheet_name, action): """ Parses data related to cell lines from a specified sheet in the Excel file. @@ -266,7 +271,12 @@ def parse_cell_lines(self, sheet_name): - list of CellLine objects parsed from the specified sheet. - pd.DataFrame with the parsed data. 
""" - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) + if action.upper() == 'MODIFY': + skip_rows = 0 + else: + skip_rows = 3 + + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() # Remove unnamed columns (columns without headers) @@ -315,13 +325,14 @@ def parse_cell_lines(self, sheet_name): clone_id=row.get('cell_line.clone_id'), protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), zygosity=row.get('cell_line.zygosity'), - cell_type=cell_type + cell_type=cell_type, + id=row.get('Identifier'), ) ) return cell_lines, df_filtered - def parse_differentiated_cell_lines(self, sheet_name): + def parse_differentiated_cell_lines(self, sheet_name, action): """ Parses data related to differentiated cell lines from a specified sheet in the Excel file. @@ -337,7 +348,12 @@ def parse_differentiated_cell_lines(self, sheet_name): list A list of DifferentiatedCellLine objects parsed from the specified sheet. 
""" - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) + if action.upper() == 'MODIFY': + skip_rows = 0 + else: + skip_rows = 3 + + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() # Remove unnamed columns (columns without headers) @@ -371,14 +387,15 @@ def parse_differentiated_cell_lines(self, sheet_name): timepoint_value=row.get('differentiated_cell_line.timepoint_value'), timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), - model_system=row.get('differentiated_cell_line.model_organ.text') + model_system=row.get('differentiated_cell_line.model_organ.text'), + id=row.get('Id') ) for _, row in df_filtered.iterrows() ] return differentiated_cell_lines, df_filtered - def parse_library_preparations(self, sheet_name): + def parse_library_preparations(self, sheet_name, action): """ Parses data related to library preparations from a specified sheet in the Excel file. @@ -394,7 +411,12 @@ def parse_library_preparations(self, sheet_name): list A list of LibraryPreparation objects parsed from the specified sheet. 
""" - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) + if action.upper() == 'MODIFY': + skip_rows = 0 + else: + skip_rows = 3 + + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() # Remove unnamed columns (columns without headers) @@ -436,14 +458,15 @@ def parse_library_preparations(self, sheet_name): concentration_value=row.get('library_preparation.concentration_value'), concentration_unit=row.get('library_preparation.concentration_unit'), pcr_cycles=row.get('library_preparation.pcr_cycles'), - pcr_cycles_for_sample_index=row.get('library_preparation.pcr_cycles_for_sample_index') + pcr_cycles_for_sample_index=row.get('library_preparation.pcr_cycles_for_sample_index'), + id=row.get('Id') ) for _, row in df_filtered.iterrows() ] return library_preparations, df_filtered - def parse_sequencing_files(self, sheet_name): + def parse_sequencing_files(self, sheet_name, action): """ Parses data related to sequencing files from a specified sheet in the Excel file. @@ -459,7 +482,12 @@ def parse_sequencing_files(self, sheet_name): list A list of SequencingFile objects parsed from the specified sheet. 
""" - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=3) + if action.upper() == 'MODIFY': + skip_rows = 0 + else: + skip_rows = 3 + + df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() # Remove unnamed columns (columns without headers) @@ -491,14 +519,15 @@ def parse_sequencing_files(self, sheet_name): library_preparation_id=row.get('library_preparation.biomaterial_core.biomaterial_id'), sequencing_protocol_id=row.get('sequencing_protocol.protocol_core.protocol_id'), read_index=row.get('sequence_file.read_index'), - run_id=row.get('sequence_file.run_id') + run_id=row.get('sequence_file.run_id'), + id=row.get('Id') ) for _, row in df_filtered.iterrows() ] return sequencing_files, df_filtered - def get_cell_lines(self, sheet_name): + def get_cell_lines(self, sheet_name, action): """ Retrieves parsed cell lines data from a specified sheet in the Excel file. @@ -514,10 +543,10 @@ def get_cell_lines(self, sheet_name): list A list of CellLine objects parsed from the specified sheet. """ - cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name) + cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, action) return cell_lines, cell_lines_df - def get_differentiated_cell_lines(self, sheet_name): + def get_differentiated_cell_lines(self, sheet_name, action): """ Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. @@ -533,7 +562,9 @@ def get_differentiated_cell_lines(self, sheet_name): list A list of DifferentiatedCellLine objects parsed from the specified sheet. """ - differentiated_cell_lines, differentiated_cell_lines_df = self.parse_differentiated_cell_lines(sheet_name) + differentiated_cell_lines, differentiated_cell_lines_df = (self. 
+ parse_differentiated_cell_lines + (sheet_name, action)) return differentiated_cell_lines, differentiated_cell_lines_df def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiated_cell_lines): @@ -635,7 +666,7 @@ def merge_library_preparation_sequencing_file(self, library_preparations, sequen if sequencing_file.library_preparation_id == library_preparation.biomaterial_id: library_preparation.add_sequencing_file(sequencing_file) - def get_library_preparations(self, sheet_name): + def get_library_preparations(self, sheet_name, action): """ Retrieves parsed library preparations data from a specified sheet in the Excel file. @@ -651,10 +682,10 @@ def get_library_preparations(self, sheet_name): list A list of LibraryPreparation objects parsed from the specified sheet. """ - library_preparations, df_filtered = self.parse_library_preparations(sheet_name) + library_preparations, df_filtered = self.parse_library_preparations(sheet_name, action) return library_preparations, df_filtered - def get_sequencing_files(self, sheet_name): + def get_sequencing_files(self, sheet_name, action): """ Retrieves parsed sequencing files data from a specified sheet in the Excel file. @@ -670,5 +701,5 @@ def get_sequencing_files(self, sheet_name): list A list of SequencingFile objects parsed from the specified sheet. 
""" - sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name) + sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action) return sequencing_files, df_filtered From 812757f2c36f4597ef027a2c804f507a87e437aa Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 27 Jun 2024 19:36:30 +0100 Subject: [PATCH 19/55] delete action --- ait/commons/util/command/submit.py | 157 ++++++++++++++------- ait/commons/util/command/submit_file.py | 24 +++- ait/commons/util/util/provider_api_util.py | 49 +++++++ 3 files changed, 170 insertions(+), 60 deletions(-) create mode 100644 ait/commons/util/util/provider_api_util.py diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 6ab7946..60480cd 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -7,6 +7,7 @@ import numpy as np from urllib.parse import urlparse from ait.commons.util.user_profile import get_profile +from ait.commons.util.util.provider_api_util import APIProvider def get_id_from_url(url): @@ -76,6 +77,7 @@ def __init__(self, args): self.access_token = get_profile('morphic-util').access_token self.type = getattr(self.args, 'type', None) self.file = getattr(self.args, 'file', None) + self.provider_api = APIProvider(self.base_url) def run(self): """ @@ -86,7 +88,7 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, access_token, action): + def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dataset_id, access_token, action): """ Submits a cell line as a biomaterial entity to a specified submission envelope. 
@@ -103,7 +105,7 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, acc success = self.patchEntity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) if success: - print("Updated cell line: " + cell_line.id + " / " + cell_line.biomaterial_id) + print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") else: cell_line_entity_id_column_name = "Identifier" @@ -119,6 +121,10 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, acc access_token ) + print(f"Linking Cell Line Biomaterial: {cell_line.biomaterial_id} to dataset {dataset_id}") + + self.link_to_dataset('biomaterial', dataset_id, cell_line_entity_id, access_token) + cell_lines_df.loc[ cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == cell_line.biomaterial_id, @@ -127,8 +133,9 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, acc return cell_line_entity_id - def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differentiated_cell_line, - differentiated_cell_lines_df, submission_envelope_id, access_token + def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_cell_line, + differentiated_cell_lines_df, submission_envelope_id, dataset_id, + access_token , action): """ Handles a single differentiated cell line associated with a given cell line. @@ -144,7 +151,8 @@ def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differ - access_token: Access token for authentication and authorization. 
""" if action == 'modify' or action == 'MODIFY': - success = self.patchEntity('biomaterial', differentiated_cell_line.id, differentiated_cell_line.to_dict(), + success = self.patchEntity('biomaterial', differentiated_cell_line.id, + differentiated_cell_line.to_dict(), access_token) if success: @@ -153,13 +161,13 @@ def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differ else: print("Cell line has differentiated cell lines, creating differentiation process to link them") - differentiation_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', get_process_content('differentiation'), - submission_envelope_id, access_token - ) + differentiation_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('differentiation'), + submission_envelope_id) differentiated_biomaterial_to_entity_id_map = {} - differentiated_cell_line_entity_id_column_name = "Id" + differentiated_cell_line_entity_id_column_name = "Identifier" if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan @@ -187,6 +195,13 @@ def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differ access_token ) + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to dataset: {dataset_id}") + + self.link_to_dataset('biomaterial', dataset_id, + differentiated_entity_id, access_token) + print( f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " f"input to process : {differentiation_process_entity_id}") @@ -218,7 +233,8 @@ def handle_differentiated_cell_line(self, cell_line, cell_line_entity_id, differ return differentiated_entity_id def handle_library_preparation(self, differentiated_entity_id, library_preparation, - library_preparations_df, submission_envelope_id, access_token, action): + library_preparations_df, submission_envelope_id, + dataset_id, 
access_token, action): """ Handles a single library preparation associated with a given differentiated cell line. @@ -262,14 +278,21 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati access_token ) + print( + f"Linking Library Preparation Biomaterial: {differentiated_entity_id} " + f"to dataset: {dataset_id}") + + self.link_to_dataset('biomaterial', dataset_id, + differentiated_entity_id, access_token) + print( f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " f"as input to library preparation process") - library_preparation_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', get_process_content('library_preparation'), - submission_envelope_id, access_token - ) + library_preparation_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('library_preparation'), + submission_envelope_id) self.perform_hal_linkage( f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", @@ -294,8 +317,8 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati return library_preparation_entity_id - def handle_sequencing_file(self, library_preparation, library_preparation_entity_id, sequencing_file, - sequencing_file_df, submission_envelope_id, access_token, action): + def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, + sequencing_file_df, submission_envelope_id, dataset_id, access_token, action): """ Handles a single sequencing file associated with a given library preparation. 
@@ -318,11 +341,10 @@ def handle_sequencing_file(self, library_preparation, library_preparation_entity else: print("Creating sequencing process to link the sequencing file") - sequencing_process_entity_id = self.use_existing_envelope_and_submit_entity( - 'process', get_process_content('sequencing'), - submission_envelope_id, - access_token - ) + sequencing_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('sequencing'), + submission_envelope_id) sequencing_file_entity_id_column_name = "Id" @@ -342,6 +364,13 @@ def handle_sequencing_file(self, library_preparation, library_preparation_entity print(f"Created Sequencing file: {sequencing_file_entity_id}") + print( + f"Linking sequencing file: {sequencing_file_entity_id} " + f"to dataset: {dataset_id}") + + self.link_to_dataset('file', dataset_id, + sequencing_file_entity_id, access_token) + print( f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " f"as input to process: {sequencing_process_entity_id}") @@ -366,11 +395,27 @@ def handle_sequencing_file(self, library_preparation, library_preparation_entity sequencing_file_entity_id_column_name ] = sequencing_file_entity_id + def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): + process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', + process_data, + submission_envelope_id, + access_token + ) + + print( + f"Linking process: {process_entity_id} " + f"to dataset: {dataset_id}") + self.link_to_dataset('process', dataset_id, process_entity_id, access_token) + + return process_entity_id + def multi_type_submission(self, cell_lines, cell_lines_df, differentiated_cell_lines_df, library_preparations_df, sequencing_file_df, submission_envelope_id, + dataset_id, access_token, action): """ @@ -396,14 +441,16 @@ def multi_type_submission(self, cell_lines, cell_lines_df, cell_line_entity_id = self.handle_cell_line(cell_line, cell_lines_df, 
submission_envelope_id, + dataset_id, access_token, action) for differentiated_cell_line in cell_line.differentiated_cell_lines: - differentiated_entity_id = self.handle_differentiated_cell_line(cell_line, cell_line_entity_id, + differentiated_entity_id = self.handle_differentiated_cell_line(cell_line_entity_id, differentiated_cell_line, differentiated_cell_lines_df, submission_envelope_id, + dataset_id, access_token, action) @@ -412,15 +459,16 @@ def multi_type_submission(self, cell_lines, cell_lines_df, library_preparation, library_preparations_df, submission_envelope_id, + dataset_id, access_token, action) for sequencing_file in library_preparation.sequencing_files: - self.handle_sequencing_file(library_preparation, - library_preparation_entity_id, + self.handle_sequencing_file(library_preparation_entity_id, sequencing_file, sequencing_file_df, submission_envelope_id, + dataset_id, access_token, action) @@ -543,6 +591,19 @@ def patchEntity(self, input_entity_type, id, data, access_token): return self.patch_to_provider_api(entity_patch_url, data, access_token) + def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token): + if input_entity_type == 'biomaterial': + halEntity = 'biomaterials' + elif input_entity_type == 'process': + halEntity = 'processes' + elif input_entity_type == 'file': + halEntity = 'files' + + # put_url = self.base_url + '/' + 'datasets' + '/' + dataset_id + '/' + halEntity + '/' + entity_id + + # TODO: log here + # return self.provider_api.put_to_provider_api(put_url, access_token) + def patch_to_provider_api(self, entity_patch_url, data, access_token): headers = { 'Content-Type': 'application/json', @@ -603,8 +664,9 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): """ print("Linking dataset " + dataset_id + " to study " + study_id) - self.put_to_provider_api(f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", - access_token) + self.provider_api.put_to_provider_api( + 
f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", + access_token) print("Dataset linked successfully to study: " + study_id) @@ -619,9 +681,10 @@ def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): """ print("Linking biomaterial " + biomaterial_id + " to dataset " + dataset_id) - self.put_to_provider_api(f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}", access_token) + self.provider_api.put_to_provider_api(f"{self.base_url}/datasets/" + f"{dataset_id}/biomaterials/{biomaterial_id}", access_token) - print("Biosmaterial linked successfully to dataset: " + dataset_id) + print("Biomaterial linked successfully to dataset: " + dataset_id) def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): """ @@ -787,27 +850,6 @@ def transform(self, file): data = json.load(file) return data - def put_to_provider_api(self, url, access_token): - """ - Sends a PUT request to the specified URL. - - Parameters: - url (str): The URL to send the request to. - access_token (str): Access token for authorization. - - Returns: - dict: The response data. 
- """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {access_token}' - } - - response = requests.put(url, headers=headers) - response_data = response.json() - - return response_data - def create_child_biomaterial(self, cell_line_entity_id, body, access_token): url = self.base_url + '/' + 'biomaterials' + '/' + cell_line_entity_id + '/' + 'childBiomaterials' @@ -819,8 +861,17 @@ def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, acces global url if type == 'biomaterial': - url = self.submission_envelope_base_url + '/' + submission_envelope_id + '/' + 'biomaterials' + '/' + entity_id + url = (self.submission_envelope_base_url + '/' + submission_envelope_id + + '/' + 'biomaterials' + '/' + entity_id) elif type == 'file': - url = self.submission_envelope_base_url + '/' + submission_envelope_id + '/' + 'files' + '/' + entity_id + url = (self.submission_envelope_base_url + '/' + submission_envelope_id + + '/' + 'files' + '/' + entity_id) + + # self.put_to_provider_api(url, access_token) + self.provider_api.put_to_provider_api(url, access_token) + + def delete_dataset(self, dataset, access_token): + # delete_to_provider_api(self.base_url + '/' + dataset, access_token) - self.put_to_provider_api(url, access_token) + self.provider_api.delete_to_provider_api(self.base_url + '/' + 'datasets' + '/' + dataset, + access_token, True) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 68594e0..07cac12 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -6,6 +6,7 @@ from ait.commons.util.command.list import CmdList from ait.commons.util.command.submit import CmdSubmit, get_id_from_url from ait.commons.util.user_profile import get_profile +from ait.commons.util.util.provider_api_util import APIProvider from ait.commons.util.util.spreadsheet_util import SpreadsheetSubmitter @@ -40,11 +41,7 @@ def __init__(self, args): 
self.access_token = get_profile('morphic-util').access_token self.user_profile = get_profile('morphic-util') self.aws = Aws(self.user_profile) - - if hasattr(self.args, 'file') and self.args.file is not None: - self.file = self.args.file - else: - raise Exception("File is mandatory") + self.provider_api = APIProvider(self.base_url) if hasattr(self.args, 'action') and self.args.action is not None: self.action = self.args.action @@ -59,11 +56,23 @@ def __init__(self, args): "dataset using the same option and link your dataset to your study" "before proceeding with this submission.") + if hasattr(self.args, 'file') and self.args.file is not None: + self.file = self.args.file + else: + if self.action != 'DELETE': + raise Exception("File is mandatory") + else: + print("Deleting dataset " + self.dataset) + def run(self): """ Execute the command file submission process. """ submission_instance = CmdSubmit(self) + + if self.action == 'delete' or self.action == 'DELETE': + submission_instance.delete_dataset(self.dataset, self.access_token) + list_instance = CmdList(self.aws, self.args) list_of_files_in_upload_area = (list_instance. 
@@ -85,11 +94,11 @@ def run(self): library_preparations) sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file', self.action) - validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) + # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) - if self.action != 'modify' and self.action != 'MODIFY': + if self.action == 'add' or self.action == 'ADD': submission_envelope_response = submission_instance.create_new_submission_envelope( self.submission_envelope_create_url, access_token=self.access_token) @@ -111,6 +120,7 @@ def run(self): library_preparations_df, sequencing_files_df, submission_envelope_id, + self.dataset, self.access_token, self.action ) diff --git a/ait/commons/util/util/provider_api_util.py b/ait/commons/util/util/provider_api_util.py new file mode 100644 index 0000000..7304708 --- /dev/null +++ b/ait/commons/util/util/provider_api_util.py @@ -0,0 +1,49 @@ +import requests + + +class APIProvider: + def __init__(self, base_url): + self.base_url = base_url + + def send_request(self, method, url, access_token, params=None, data=None, data_type_in_hal_link=None): + """ + Sends an HTTP request to the specified URL with the given method. + + Parameters: + method (str): The HTTP method (GET, POST, PUT, DELETE). + url (str): The URL to send the request to. + access_token (str): Access token for authorization. + params (dict, optional): The URL parameters to be sent with the request. + data (dict, optional): The data to be sent in the request body. + data_type_in_hal_link (str, optional): The data type in the HAL link for extracting URL in POST response. + + Returns: + dict or str: The response data for PUT/DELETE or the URL for POST requests. 
+ """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.request(method, url, headers=headers, params=params, json=data) + + # Check for successful response + if not response.ok: + response.raise_for_status() + + response_data = response.json() + + if method == 'POST' and data_type_in_hal_link: + return response_data['_links'][data_type_in_hal_link]['href'] + return response_data + + def put_to_provider_api(self, url, access_token): + return self.send_request('PUT', url, access_token) + + # TODO: have a generic delete and also a delete with params + def delete_to_provider_api(self, url, access_token, delete_linked_entities=False): + params = {'deleteLinkedEntities': str(delete_linked_entities).lower()} + return self.send_request('DELETE', url, access_token, params=params) + + def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): + return self.send_request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link) From 5de275fc2cc10e55cd730b065fd1ca2706d06114 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 2 Jul 2024 14:27:53 +0100 Subject: [PATCH 20/55] code improvements, TODOs --- ait/commons/util/command/submit.py | 129 ++++++------ ait/commons/util/command/submit_file.py | 64 +++++- ait/commons/util/util/provider_api_util.py | 2 +- ait/commons/util/util/spreadsheet_util.py | 219 ++++++++++++++------- 4 files changed, 261 insertions(+), 153 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 60480cd..26e87a1 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -135,8 +135,7 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dat def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_cell_line, differentiated_cell_lines_df, submission_envelope_id, dataset_id, - access_token - , action): + 
access_token, action): """ Handles a single differentiated cell line associated with a given cell line. @@ -150,14 +149,14 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. """ - if action == 'modify' or action == 'MODIFY': + if action.lower() == 'modify': success = self.patchEntity('biomaterial', differentiated_cell_line.id, differentiated_cell_line.to_dict(), access_token) if success: - print("Updated differentiated cell line: " + differentiated_cell_line.id + " / " + - differentiated_cell_line.biomaterial_id) + print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") else: print("Cell line has differentiated cell lines, creating differentiation process to link them") @@ -172,9 +171,8 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan - print( - f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " - f"as a child of Cell line: {cell_line_entity_id}") + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " + f"as a child of Cell line: {cell_line_entity_id}") differentiated_entity_id = self.create_child_biomaterial( cell_line_entity_id, @@ -184,9 +182,8 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") - print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"to envelope: {submission_envelope_id}") + print(f"Linking Differentiated Cell Line Biomaterial: 
{differentiated_entity_id} " + f"to envelope: {submission_envelope_id}") self.link_entity_to_envelope( 'biomaterial', @@ -195,25 +192,22 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce access_token ) - print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"to dataset: {dataset_id}") + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to dataset: {dataset_id}") self.link_to_dataset('biomaterial', dataset_id, differentiated_entity_id, access_token) - print( - f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " - f"input to process : {differentiation_process_entity_id}") + print(f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " + f"input to process : {differentiation_process_entity_id}") self.perform_hal_linkage( f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", differentiation_process_entity_id, 'processes', access_token ) - print( - f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") + print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") self.perform_hal_linkage( f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", @@ -224,8 +218,7 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce differentiated_cell_line.biomaterial_id] = differentiated_entity_id differentiated_cell_lines_df.loc[ - differentiated_cell_lines_df[ - 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == + differentiated_cell_lines_df['differentiated_cell_line.biomaterial_core.biomaterial_id'] == differentiated_cell_line.biomaterial_id, differentiated_cell_line_entity_id_column_name ] = differentiated_entity_id @@ -247,14 +240,14 @@ def handle_library_preparation(self, differentiated_entity_id, 
library_preparati - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. """ - if action == 'modify' or action == 'MODIFY': + if action.lower() == 'modify': success = self.patchEntity('biomaterial', library_preparation.id, library_preparation.to_dict(), access_token) if success: - print("Updated library preparation biomaterial: " + library_preparation.id + " / " + - library_preparation.biomaterial_id) + print(f"Updated library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") else: print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " f"{differentiated_entity_id}") @@ -267,9 +260,8 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") - print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to envelope: {submission_envelope_id}") + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"to envelope: {submission_envelope_id}") self.link_entity_to_envelope( 'biomaterial', @@ -278,30 +270,29 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati access_token ) - print( - f"Linking Library Preparation Biomaterial: {differentiated_entity_id} " - f"to dataset: {dataset_id}") + print(f"Linking Library Preparation Biomaterial: {differentiated_entity_id} " + f"to dataset: {dataset_id}") self.link_to_dataset('biomaterial', dataset_id, differentiated_entity_id, access_token) - print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"as input to library preparation process") + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"as input to library preparation process") library_preparation_process_entity_id = 
self.create_process(access_token, dataset_id, get_process_content('library_preparation'), submission_envelope_id) + library_preparation_entity_id_column_name = "Identifier" + self.perform_hal_linkage( f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", library_preparation_process_entity_id, 'processes', access_token ) - print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"as derived by library preparation process") + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"as derived by library preparation process") self.perform_hal_linkage( f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", @@ -309,10 +300,9 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati ) library_preparations_df.loc[ - library_preparations_df[ - 'library_preparation.biomaterial_core.biomaterial_id'] == + library_preparations_df['library_preparation.biomaterial_core.biomaterial_id'] == library_preparation.biomaterial_id, - 'Id' + library_preparation_entity_id_column_name, ] = library_preparation_entity_id return library_preparation_entity_id @@ -330,14 +320,13 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, - submission_envelope_id: ID of the submission envelope where entities will be linked. - access_token: Access token for authentication and authorization. 
""" - if action == 'modify' or action == 'MODIFY': + if action.lower() == 'modify': success = self.patchEntity('file', sequencing_file.id, sequencing_file.to_dict(), access_token) if success: - print("Updated sequencing file: " + sequencing_file.id + " / " + - sequencing_file.file_name) + print(f"Updated sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") else: print("Creating sequencing process to link the sequencing file") @@ -346,14 +335,13 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, get_process_content('sequencing'), submission_envelope_id) - sequencing_file_entity_id_column_name = "Id" + sequencing_file_entity_id_column_name = "Identifier" if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan - print( - f"Creating Sequencing file: {sequencing_file.file_name} " - f"as a result of sequencing the Library preparation biomaterial: {library_preparation_entity_id}") + print(f"Creating Sequencing file: {sequencing_file.file_name} " + f"as a result of sequencing the Library preparation biomaterial: {library_preparation_entity_id}") sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( 'file', @@ -364,25 +352,21 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, print(f"Created Sequencing file: {sequencing_file_entity_id}") - print( - f"Linking sequencing file: {sequencing_file_entity_id} " - f"to dataset: {dataset_id}") + print(f"Linking sequencing file: {sequencing_file_entity_id} to dataset: {dataset_id}") self.link_to_dataset('file', dataset_id, sequencing_file_entity_id, access_token) - print( - f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " - f"as input to process: {sequencing_process_entity_id}") + print(f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " + f"as input to process: {sequencing_process_entity_id}") 
self.perform_hal_linkage( f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", sequencing_process_entity_id, 'processes', access_token ) - print( - f"Linking Sequencing file: {sequencing_file_entity_id} " - f"as derived by process: {sequencing_process_entity_id}") + print(f"Linking Sequencing file: {sequencing_file_entity_id} " + f"as derived by process: {sequencing_process_entity_id}") self.perform_hal_linkage( f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", @@ -390,8 +374,7 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, ) sequencing_file_df.loc[ - sequencing_file_df[ - 'sequence_file.file_core.file_name'] == sequencing_file.file_name, + sequencing_file_df['sequence_file.file_core.file_name'] == sequencing_file.file_name, sequencing_file_entity_id_column_name ] = sequencing_file_entity_id @@ -571,7 +554,7 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ 'self', data, access_token) entity_id = get_id_from_url(entity_self_hal_link) - print(f"{input_entity_type.capitalize()} created successfully: " + entity_id) + print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") return entity_id @@ -599,10 +582,9 @@ def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token elif input_entity_type == 'file': halEntity = 'files' - # put_url = self.base_url + '/' + 'datasets' + '/' + dataset_id + '/' + halEntity + '/' + entity_id + put_url = self.base_url + '/' + 'datasets' + '/' + dataset_id + '/' + halEntity + '/' + entity_id - # TODO: log here - # return self.provider_api.put_to_provider_api(put_url, access_token) + return self.provider_api.put_to_provider_api(put_url, access_token) def patch_to_provider_api(self, entity_patch_url, data, access_token): headers = { @@ -649,7 +631,7 @@ def use_existing_envelope_and_submit_entity(self, input_entity_type, data, 'self', data, access_token) entity_id = 
get_id_from_url(entity_self_hal_link) - print(f"{input_entity_type.capitalize()} created successfully: " + entity_id) + print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") return entity_id @@ -662,13 +644,13 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): study_id (str): The ID of the study. access_token (str): Access token for authorization. """ - print("Linking dataset " + dataset_id + " to study " + study_id) + print(f"Linking dataset {dataset_id} to study {study_id}") self.provider_api.put_to_provider_api( f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", access_token) - print("Dataset linked successfully to study: " + study_id) + print(f"Dataset linked successfully to study: {study_id}") def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): """ @@ -679,12 +661,12 @@ def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): dataset_id (str): The ID of the dataset. access_token (str): Access token for authorization. """ - print("Linking biomaterial " + biomaterial_id + " to dataset " + dataset_id) + print(f"Linking biomaterial {biomaterial_id} to dataset {dataset_id}") - self.provider_api.put_to_provider_api(f"{self.base_url}/datasets/" - f"{dataset_id}/biomaterials/{biomaterial_id}", access_token) + self.provider_api.put_to_provider_api( + f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}", access_token) - print("Biomaterial linked successfully to dataset: " + dataset_id) + print(f"Biomaterial linked successfully to dataset: {dataset_id}") def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): """ @@ -695,10 +677,11 @@ def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): process_id (str): The ID of the process. access_token (str): Access token for authorization. 
""" - print("Linking biomaterial " + biomaterial_id + " to process " + process_id) + print(f"Linking biomaterial {biomaterial_id} to process {process_id}") - self.perform_hal_linkage(f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses", - process_id, 'processes', access_token) + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses", + process_id, 'processes', access_token) def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): """ @@ -875,3 +858,5 @@ def delete_dataset(self, dataset, access_token): self.provider_api.delete_to_provider_api(self.base_url + '/' + 'datasets' + '/' + dataset, access_token, True) + + print(f"Dataset deleted successfully {dataset}") diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 07cac12..35a1ef4 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -26,6 +26,59 @@ def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, da class CmdSubmitFile: + # Column mappings for parsing different sections of the spreadsheet + cellline_column_mapping = { + "CELL LINE ID (Required)": "cell_line.biomaterial_core.biomaterial_id", + "CELL LINE DESCRIPTION": "cell_line.biomaterial_core.biomaterial_description", + "DERIVED FROM CELL LINE NAME (Required)": "cell_line.derived_cell_line_accession", + "CLONE ID": "cell_line.clone_id", + "GENE EXPRESSION ALTERATION PROTOCOL ID": "gene_expression_alteration_protocol.protocol_core.protocol_id", + "ZYGOSITY": "cell_line.zygosity", + "CELL LINE TYPE (Required)": "cell_line.type", + "Unnamed: 7": None, + "Unnamed: 8": None + } + + differentiated_cellline_column_mapping = { + "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", + "DIFFERENTIATED CELL LINE DESCRIPTION": "differentiated_cell_line.biomaterial_core.biomaterial_description", + "INPUT CELL LINE ID (Required)": 
"cell_line.biomaterial_core.biomaterial_id", + "DIFFERENTIATION PROTOCOL ID (Required)": "differentiation_protocol.protocol_core.protocol_id", + "TIMEPOINT VALUE": "differentiated_cell_line.timepoint_value", + "TIMEPOINT UNIT": "differentiated_cell_line.timepoint_unit.text", + "TERMINALLY DIFFERENTIATED": "differentiated_cell_line.terminally_differentiated", + "FINAL LINEAGE STAGE": "differentiated_cell_line.terminally_differentiated", + "Model System": "cell_line.model_organ.text", + "MODEL SYSTEM": "cell_line.model_organ.text", + "Unnamed: 8": None + } + + library_preparation_column_mapping = { + "LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", + "LIBRARY PREPARATION PROTOCOL ID (Required)": "library_preparation_protocol.protocol_core.protocol_id", + "DISSOCIATION PROTOCOL ID (Required)": "dissociation_protocol.protocol_core.protocol_id", + "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", + "LIBRARY AVERAGE FRAGMENT SIZE": "library_preparation.average_fragment_size", + "LIBRARY INPUT AMOUNT VALUE": "library_preparation.input_amount_value", + "LIBRARY INPUT AMOUNT UNIT": "library_preparation.input_amount_unit", + "LIBRARY FINAL YIELD VALUE": "library_preparation.final_yield_value", + "LIBRARY FINAL YIELD UNIT": "library_preparation.final_yield_unit", + "LIBRARY CONCENTRATION VALUE": "library_preparation.concentration_value", + "LIBRARY CONCENTRATION UNIT": "library_preparation.concentration_unit", + "LIBRARY PCR CYCLES": "library_preparation.pcr_cycles", + "LIBRARY PCR CYCLES FOR SAMPLE INDEX": "library_preparation.pcr_cycles_for_sample_index", + "Unnamed: 14": None # Adjust index based on your actual column count + } + + sequencing_file_column_mapping = { + "FILE NAME (Required)": "sequence_file.file_core.file_name", + "INPUT LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", + "SEQUENCING PROTOCOL ID (Required)": 
"sequencing_protocol.protocol_core.protocol_id", + "READ INDEX (Required)": "sequence_file.read_index", + "RUN ID": "sequence_file.run_id", + "Unnamed: 5": None # Adjust index based on your actual column count + } + base_url = 'http://localhost:8080' submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" submission_envelope_base_url = f"{base_url}/submissionEnvelopes" @@ -83,16 +136,19 @@ def run(self): parser = SpreadsheetSubmitter(self.file) # Parse different sections of the spreadsheet using defined column mappings - cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action) + cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, + self.cellline_column_mapping) differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action) + 'Differentiated cell line', self.action, self.differentiated_cellline_column_mapping) parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) library_preparations, library_preparations_df = (parser .get_library_preparations('Library preparation', - self.action)) + self.action, + self.library_preparation_column_mapping)) parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) - sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file', self.action) + sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file', self.action, + self.sequencing_file_column_mapping) # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) diff --git a/ait/commons/util/util/provider_api_util.py b/ait/commons/util/util/provider_api_util.py index 7304708..14c47c5 100644 --- a/ait/commons/util/util/provider_api_util.py +++ b/ait/commons/util/util/provider_api_util.py @@ -43,7 +43,7 @@ def put_to_provider_api(self, url, access_token): # TODO: have a generic 
delete and also a delete with params def delete_to_provider_api(self, url, access_token, delete_linked_entities=False): params = {'deleteLinkedEntities': str(delete_linked_entities).lower()} - return self.send_request('DELETE', url, access_token, params=params) + self.send_request('DELETE', url, access_token, params=params) def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): return self.send_request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link) diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/util/spreadsheet_util.py index 0f74705..cba4e3f 100644 --- a/ait/commons/util/util/spreadsheet_util.py +++ b/ait/commons/util/util/spreadsheet_util.py @@ -253,7 +253,7 @@ def list_sheets(self): xls = pd.ExcelFile(self.file_path, engine='openpyxl') return xls.sheet_names - def parse_cell_lines(self, sheet_name, action): + def parse_cell_lines(self, sheet_name, action, column_mapping): """ Parses data related to cell lines from a specified sheet in the Excel file. 
@@ -274,13 +274,14 @@ def parse_cell_lines(self, sheet_name, action): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 3 + skip_rows = 0 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) - df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: @@ -289,6 +290,8 @@ def parse_cell_lines(self, sheet_name, action): # Filter rows where biomaterial_id is not null df = df[df['cell_line.biomaterial_core.biomaterial_id'].notna()] + df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['cell_line.biomaterial_core.biomaterial_id'] @@ -301,7 +304,6 @@ def parse_cell_lines(self, sheet_name, action): df_filtered = df[mask] # Check for mandatory fields and create CellLine objects - # TODO: for all cell_lines = [] for _, row in df_filtered.iterrows(): biomaterial_id = row['cell_line.biomaterial_core.biomaterial_id'] @@ -326,13 +328,13 @@ def parse_cell_lines(self, sheet_name, action): protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), zygosity=row.get('cell_line.zygosity'), cell_type=cell_type, - id=row.get('Identifier'), + id=row.get('Identifier') ) ) return cell_lines, df_filtered - def parse_differentiated_cell_lines(self, sheet_name, action): + def parse_differentiated_cell_lines(self, sheet_name, action, column_mapping): """ Parses data related to differentiated cell lines from a specified sheet in the Excel file. 
@@ -351,13 +353,14 @@ def parse_differentiated_cell_lines(self, sheet_name, action): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 3 + skip_rows = 0 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) - df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: @@ -366,6 +369,8 @@ def parse_differentiated_cell_lines(self, sheet_name, action): # Filter rows where biomaterial_id is not null df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] + df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] @@ -377,25 +382,39 @@ def parse_differentiated_cell_lines(self, sheet_name, action): # Apply the mask to filter out rows df_filtered = df[mask] - # Create DifferentiatedCellLine objects from filtered DataFrame rows - differentiated_cell_lines = [ - DifferentiatedCellLine( - biomaterial_id=row['differentiated_cell_line.biomaterial_core.biomaterial_id'], - description=row.get('differentiated_cell_line.biomaterial_core.biomaterial_description'), - input_biomaterial_id=row.get('cell_line.biomaterial_core.biomaterial_id'), - protocol_id=row.get('differentiation_protocol.protocol_core.protocol_id'), - timepoint_value=row.get('differentiated_cell_line.timepoint_value'), - timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), - terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), - model_system=row.get('differentiated_cell_line.model_organ.text'), - 
id=row.get('Id') + # Check for mandatory fields and create Differentiated CellLine objects + differentiated_cell_lines = [] + for _, row in df_filtered.iterrows(): + differentiated_biomaterial_id = row['differentiated_cell_line.biomaterial_core.biomaterial_id'] + biomaterial_id = row.get('cell_line.biomaterial_core.biomaterial_id') + + # Check if biomaterial_id is null + if pd.isnull(differentiated_biomaterial_id): + raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null.") + + # Check if derived_accession and cell_type are present + if pd.isnull(biomaterial_id): + raise MissingMandatoryFieldError( + "Input Cell line ID cannot be null. " + differentiated_biomaterial_id) + + # Create DifferentiatedCellLine objects from filtered DataFrame rows + differentiated_cell_lines.append( + DifferentiatedCellLine( + biomaterial_id=differentiated_biomaterial_id, + description=row.get('differentiated_cell_line.biomaterial_core.biomaterial_description'), + input_biomaterial_id=biomaterial_id, + protocol_id=row.get('differentiation_protocol.protocol_core.protocol_id'), + timepoint_value=row.get('differentiated_cell_line.timepoint_value'), + timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), + terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), + model_system=row.get('differentiated_cell_line.model_organ.text'), + id=row.get('Identifier') + ) ) - for _, row in df_filtered.iterrows() - ] return differentiated_cell_lines, df_filtered - def parse_library_preparations(self, sheet_name, action): + def parse_library_preparations(self, sheet_name, action, column_mapping): """ Parses data related to library preparations from a specified sheet in the Excel file. @@ -403,8 +422,6 @@ def parse_library_preparations(self, sheet_name, action): ----------- sheet_name : str The name of the sheet containing library preparation data. 
- column_mapping : dict - A dictionary mapping column names in the sheet to expected attribute names. Returns: -------- @@ -414,22 +431,29 @@ def parse_library_preparations(self, sheet_name, action): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 3 + skip_rows = 0 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) - df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists - if 'library_preparation.biomaterial_core.biomaterial_id' not in df.columns: - raise KeyError("The column 'library_preparation.biomaterial_core.biomaterial_id' " - "does not exist.") + required_columns = [ + 'library_preparation.biomaterial_core.biomaterial_id', + 'dissociation_protocol.protocol_core.protocol_id', + 'differentiated_cell_line.biomaterial_core.biomaterial_id', + 'library_preparation_protocol.protocol_core.protocol_id' + ] + for col in required_columns: + if col not in df.columns: + raise KeyError(f"The column '{col}' does not exist.") # Filter rows where biomaterial_id is not null df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] - # TODO: for all + df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' @@ -443,30 +467,47 @@ def parse_library_preparations(self, sheet_name, action): # Apply the mask to filter out rows df_filtered = df[mask] - # Create LibraryPreparation objects from filtered DataFrame rows - library_preparations = [ - LibraryPreparation( - biomaterial_id=row['library_preparation.biomaterial_core.biomaterial_id'], - protocol_id=row.get('library_preparation_protocol.protocol_core.protocol_id'), - 
dissociation_protocol_id=row.get('dissociation_protocol.protocol_core.protocol_id'), - differentiated_biomaterial_id=row.get('differentiated_cell_line.biomaterial_core.biomaterial_id'), - average_fragment_size=row.get('library_preparation.average_fragment_size'), - input_amount_value=row.get('library_preparation.input_amount_value'), - input_amount_unit=row.get('library_preparation.input_amount_unit'), - final_yield_value=row.get('library_preparation.final_yield_value'), - final_yield_unit=row.get('library_preparation.final_yield_unit'), - concentration_value=row.get('library_preparation.concentration_value'), - concentration_unit=row.get('library_preparation.concentration_unit'), - pcr_cycles=row.get('library_preparation.pcr_cycles'), - pcr_cycles_for_sample_index=row.get('library_preparation.pcr_cycles_for_sample_index'), - id=row.get('Id') + # Check for mandatory fields and create Library Preparation objects + library_preparations = [] + for _, row in df_filtered.iterrows(): + library_preparation_id = row['library_preparation.biomaterial_core.biomaterial_id'] + dissociation_protocol_id = row.get('dissociation_protocol.protocol_core.protocol_id') + differentiated_biomaterial_id = row.get('differentiated_cell_line.biomaterial_core.biomaterial_id') + library_preparation_protocol_id = row.get('library_preparation_protocol.protocol_core.protocol_id') + + # Check if required fields are null + if pd.isnull(library_preparation_id): + raise MissingMandatoryFieldError("Library Preparation ID cannot be null.") + if pd.isnull(dissociation_protocol_id): + raise MissingMandatoryFieldError("Dissociation Protocol ID cannot be null.") + if pd.isnull(differentiated_biomaterial_id): + raise MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null.") + if pd.isnull(library_preparation_protocol_id): + raise MissingMandatoryFieldError("Library Preparation Protocol ID cannot be null.") + + # Create LibraryPreparation objects from filtered DataFrame rows + 
library_preparations.append( + LibraryPreparation( + biomaterial_id=library_preparation_id, + protocol_id=library_preparation_protocol_id, + dissociation_protocol_id=dissociation_protocol_id, + differentiated_biomaterial_id=differentiated_biomaterial_id, + average_fragment_size=row.get('library_preparation.average_fragment_size'), + input_amount_value=row.get('library_preparation.input_amount_value'), + input_amount_unit=row.get('library_preparation.input_amount_unit'), + final_yield_value=row.get('library_preparation.final_yield_value'), + final_yield_unit=row.get('library_preparation.final_yield_unit'), + concentration_value=row.get('library_preparation.concentration_value'), + concentration_unit=row.get('library_preparation.concentration_unit'), + pcr_cycles=row.get('library_preparation.pcr_cycles'), + pcr_cycles_for_sample_index=row.get('library_preparation.pcr_cycles_for_sample_index'), + id=row.get('Identifier') + ) ) - for _, row in df_filtered.iterrows() - ] return library_preparations, df_filtered - def parse_sequencing_files(self, sheet_name, action): + def parse_sequencing_files(self, sheet_name, action, column_mapping): """ Parses data related to sequencing files from a specified sheet in the Excel file. @@ -474,8 +515,6 @@ def parse_sequencing_files(self, sheet_name, action): ----------- sheet_name : str The name of the sheet containing sequencing file data. - column_mapping : dict - A dictionary mapping column names in the sheet to expected attribute names. 
Returns: -------- @@ -485,21 +524,31 @@ def parse_sequencing_files(self, sheet_name, action): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 3 + skip_rows = 0 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() + df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) - df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists - if 'sequence_file.file_core.file_name' not in df.columns: - raise KeyError("The column 'sequence_file.file_core.file_name' does not exist.") + required_columns = [ + 'sequence_file.file_core.file_name', + 'library_preparation.biomaterial_core.biomaterial_id', + 'sequencing_protocol.protocol_core.protocol_id', + 'sequence_file.read_index' + ] + for col in required_columns: + if col not in df.columns: + raise KeyError(f"The column '{col}' does not exist.") # Filter rows where file_name is not null df = df[df['sequence_file.file_core.file_name'].notna()] + df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['sequence_file.file_core.file_name'] @@ -512,22 +561,39 @@ def parse_sequencing_files(self, sheet_name, action): # Apply the mask to filter out rows df_filtered = df[mask] - # Create SequencingFile objects from filtered DataFrame rows - sequencing_files = [ - SequencingFile( - file_name=row['sequence_file.file_core.file_name'], - library_preparation_id=row.get('library_preparation.biomaterial_core.biomaterial_id'), - sequencing_protocol_id=row.get('sequencing_protocol.protocol_core.protocol_id'), - read_index=row.get('sequence_file.read_index'), - run_id=row.get('sequence_file.run_id'), - id=row.get('Id') + # Check for mandatory fields and create Sequencing file objects + 
sequencing_files = [] + for _, row in df_filtered.iterrows(): + file_name = row['sequence_file.file_core.file_name'] + library_preparation_id = row.get('library_preparation.biomaterial_core.biomaterial_id') + sequencing_protocol_id = row.get('sequencing_protocol.protocol_core.protocol_id') + read_index = row.get('sequence_file.read_index') + + # Check if required fields are null + if pd.isnull(file_name): + raise MissingMandatoryFieldError("Sequence file name cannot be null.") + if pd.isnull(library_preparation_id): + raise MissingMandatoryFieldError("Library Preparation ID cannot be null.") + if pd.isnull(sequencing_protocol_id): + raise MissingMandatoryFieldError("Sequencing Protocol ID cannot be null.") + if pd.isnull(read_index): + raise MissingMandatoryFieldError("Read Index cannot be null.") + + # Create SequencingFile objects from filtered DataFrame rows + sequencing_files.append( + SequencingFile( + file_name=file_name, + library_preparation_id=library_preparation_id, + sequencing_protocol_id=sequencing_protocol_id, + read_index=read_index, + run_id=row.get('sequence_file.run_id'), + id=row.get('Identifier') + ) ) - for _, row in df_filtered.iterrows() - ] return sequencing_files, df_filtered - def get_cell_lines(self, sheet_name, action): + def get_cell_lines(self, sheet_name, action, column_mapping): """ Retrieves parsed cell lines data from a specified sheet in the Excel file. @@ -543,10 +609,10 @@ def get_cell_lines(self, sheet_name, action): list A list of CellLine objects parsed from the specified sheet. """ - cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, action) + cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, action, column_mapping) return cell_lines, cell_lines_df - def get_differentiated_cell_lines(self, sheet_name, action): + def get_differentiated_cell_lines(self, sheet_name, action, column_mapping): """ Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. 
@@ -564,7 +630,8 @@ def get_differentiated_cell_lines(self, sheet_name, action): """ differentiated_cell_lines, differentiated_cell_lines_df = (self. parse_differentiated_cell_lines - (sheet_name, action)) + (sheet_name, action, + column_mapping)) return differentiated_cell_lines, differentiated_cell_lines_df def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiated_cell_lines): @@ -666,7 +733,7 @@ def merge_library_preparation_sequencing_file(self, library_preparations, sequen if sequencing_file.library_preparation_id == library_preparation.biomaterial_id: library_preparation.add_sequencing_file(sequencing_file) - def get_library_preparations(self, sheet_name, action): + def get_library_preparations(self, sheet_name, action, column_mapping): """ Retrieves parsed library preparations data from a specified sheet in the Excel file. @@ -682,10 +749,10 @@ def get_library_preparations(self, sheet_name, action): list A list of LibraryPreparation objects parsed from the specified sheet. """ - library_preparations, df_filtered = self.parse_library_preparations(sheet_name, action) + library_preparations, df_filtered = self.parse_library_preparations(sheet_name, action, column_mapping) return library_preparations, df_filtered - def get_sequencing_files(self, sheet_name, action): + def get_sequencing_files(self, sheet_name, action, column_mapping): """ Retrieves parsed sequencing files data from a specified sheet in the Excel file. @@ -701,5 +768,5 @@ def get_sequencing_files(self, sheet_name, action): list A list of SequencingFile objects parsed from the specified sheet. 
""" - sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action) + sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action, column_mapping) return sequencing_files, df_filtered From ea5c4f4cc7024aab5aa64ef753cd18819ce9d7a8 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 2 Jul 2024 15:19:17 +0100 Subject: [PATCH 21/55] pre-deployment --- ait/commons/util/command/submit.py | 4 ++-- ait/commons/util/command/submit_file.py | 6 +++--- ait/commons/util/{util => }/provider_api_util.py | 0 ait/commons/util/settings/morphic_util.py | 6 +++--- ait/commons/util/{util => }/spreadsheet_util.py | 0 ait/commons/util/util/__init__.py | 0 requirements.txt | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) rename ait/commons/util/{util => }/provider_api_util.py (100%) rename ait/commons/util/{util => }/spreadsheet_util.py (100%) delete mode 100644 ait/commons/util/util/__init__.py diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 26e87a1..ae657fb 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -7,7 +7,7 @@ import numpy as np from urllib.parse import urlparse from ait.commons.util.user_profile import get_profile -from ait.commons.util.util.provider_api_util import APIProvider +from ait.commons.util.provider_api_util import APIProvider def get_id_from_url(url): @@ -62,7 +62,7 @@ class CmdSubmit: transform(file): Transforms the input file to a JSON object. put_to_provider_api(url, access_token): Sends a PUT request to the provider API. 
""" - base_url = 'http://localhost:8080' + base_url = 'https://api.ingest.dev.archive.morphic.bio' submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" submission_envelope_base_url = f"{base_url}/submissionEnvelopes" diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 35a1ef4..af35c2e 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -6,8 +6,8 @@ from ait.commons.util.command.list import CmdList from ait.commons.util.command.submit import CmdSubmit, get_id_from_url from ait.commons.util.user_profile import get_profile -from ait.commons.util.util.provider_api_util import APIProvider -from ait.commons.util.util.spreadsheet_util import SpreadsheetSubmitter +from ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter # Define a class for handling submission of a command file @@ -79,7 +79,7 @@ class CmdSubmitFile: "Unnamed: 5": None # Adjust index based on your actual column count } - base_url = 'http://localhost:8080' + base_url = 'https://api.ingest.dev.archive.morphic.bio/' submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" submission_envelope_base_url = f"{base_url}/submissionEnvelopes" diff --git a/ait/commons/util/util/provider_api_util.py b/ait/commons/util/provider_api_util.py similarity index 100% rename from ait/commons/util/util/provider_api_util.py rename to ait/commons/util/provider_api_util.py diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 07a226a..82368a8 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,9 +1,9 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.10' -DESC = 'CLI tool for uploading data to Morphic AWS S3 bucket' -AUTHOR = 'morphic-bio-dev' +VERSION = '0.0.13' +DESC = 'CLI tool for 
submitting your analysis data and metadata' +AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' # when true, displays exception details; otherwise user-friendly error message diff --git a/ait/commons/util/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py similarity index 100% rename from ait/commons/util/util/spreadsheet_util.py rename to ait/commons/util/spreadsheet_util.py diff --git a/ait/commons/util/util/__init__.py b/ait/commons/util/util/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/requirements.txt b/requirements.txt index 0128bf0..82b6911 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ botocore>=1.26.10 filetype==1.0.7 requests>=2.20.0, <3 urllib3<1.27, >=1.25.4 -tqdm~=4.64.1 -pandas~=1.1.5 +tqdm~=4.57.0 +pandas~=2.2.2 setuptools~=59.6.0 openpyxl==3.1.3 \ No newline at end of file From 190f5109bc762deb7d55b1e5ecb640aeb71cc657 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 2 Jul 2024 16:11:39 +0100 Subject: [PATCH 22/55] pre-deployment warnings fix --- ait/commons/util/command/submit.py | 22 +++++++++++++++------- ait/commons/util/settings/morphic_util.py | 2 +- ait/commons/util/spreadsheet_util.py | 8 ++++---- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index ae657fb..66bf56a 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -125,9 +125,11 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dat self.link_to_dataset('biomaterial', dataset_id, cell_line_entity_id, access_token) + cell_lines_df[cell_line_entity_id_column_name] = (cell_lines_df[cell_line_entity_id_column_name] + .astype(object)) + cell_lines_df.loc[ - cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == - cell_line.biomaterial_id, + cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == cell_line.biomaterial_id, cell_line_entity_id_column_name ] = 
cell_line_entity_id @@ -217,9 +219,12 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce differentiated_biomaterial_to_entity_id_map[ differentiated_cell_line.biomaterial_id] = differentiated_entity_id + differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = differentiated_cell_lines_df[ + differentiated_cell_line_entity_id_column_name].astype(object) + differentiated_cell_lines_df.loc[ - differentiated_cell_lines_df['differentiated_cell_line.biomaterial_core.biomaterial_id'] == - differentiated_cell_line.biomaterial_id, + differentiated_cell_lines_df[ + 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == differentiated_cell_line.biomaterial_id, differentiated_cell_line_entity_id_column_name ] = differentiated_entity_id @@ -300,9 +305,9 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati ) library_preparations_df.loc[ - library_preparations_df['library_preparation.biomaterial_core.biomaterial_id'] == - library_preparation.biomaterial_id, - library_preparation_entity_id_column_name, + library_preparations_df[ + 'library_preparation.biomaterial_core.biomaterial_id'] == library_preparation.biomaterial_id, + library_preparation_entity_id_column_name ] = library_preparation_entity_id return library_preparation_entity_id @@ -373,6 +378,9 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, sequencing_process_entity_id, 'processes', access_token ) + sequencing_file_df[sequencing_file_entity_id_column_name] = sequencing_file_df[ + sequencing_file_entity_id_column_name].astype(object) + sequencing_file_df.loc[ sequencing_file_df['sequence_file.file_core.file_name'] == sequencing_file.file_name, sequencing_file_entity_id_column_name diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 82368a8..2b9d695 100644 --- a/ait/commons/util/settings/morphic_util.py +++ 
b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.13' +VERSION = '0.0.14' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index cba4e3f..353d122 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -290,7 +290,7 @@ def parse_cell_lines(self, sheet_name, action, column_mapping): # Filter rows where biomaterial_id is not null df = df[df['cell_line.biomaterial_core.biomaterial_id'].notna()] - df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['cell_line.biomaterial_core.biomaterial_id'] @@ -369,7 +369,7 @@ def parse_differentiated_cell_lines(self, sheet_name, action, column_mapping): # Filter rows where biomaterial_id is not null df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] - df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] @@ -454,7 +454,7 @@ def parse_library_preparations(self, sheet_name, action, column_mapping): # Filter rows where biomaterial_id is not null df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] - df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) 
# Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['library_preparation.biomaterial_core.biomaterial_id'] @@ -547,7 +547,7 @@ def parse_sequencing_files(self, sheet_name, action, column_mapping): # Filter rows where file_name is not null df = df[df['sequence_file.file_core.file_name'].notna()] - df = df.applymap(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['sequence_file.file_core.file_name'] From 0bdb753bbf4a70530ad4c62dce4e8b014efb90e9 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 3 Jul 2024 11:39:52 +0100 Subject: [PATCH 23/55] pre-deployment small bug fixes --- ait/commons/util/command/submit.py | 16 +++-- ait/commons/util/command/submit_file.py | 82 ++++++++++++++----------- ait/commons/util/spreadsheet_util.py | 8 +-- 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 66bf56a..5038a1e 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -107,7 +107,7 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dat if success: print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") else: - cell_line_entity_id_column_name = "Identifier" + cell_line_entity_id_column_name = "Id" if cell_line_entity_id_column_name not in cell_lines_df.columns: cell_lines_df[cell_line_entity_id_column_name] = np.nan @@ -168,7 +168,7 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce submission_envelope_id) differentiated_biomaterial_to_entity_id_map = {} - differentiated_cell_line_entity_id_column_name = "Identifier" + differentiated_cell_line_entity_id_column_name = "Id" if differentiated_cell_line_entity_id_column_name not 
in differentiated_cell_lines_df.columns: differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan @@ -289,7 +289,7 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati get_process_content('library_preparation'), submission_envelope_id) - library_preparation_entity_id_column_name = "Identifier" + library_preparation_entity_id_column_name = "Id" self.perform_hal_linkage( f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", @@ -340,7 +340,7 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, get_process_content('sequencing'), submission_envelope_id) - sequencing_file_entity_id_column_name = "Identifier" + sequencing_file_entity_id_column_name = "Id" if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan @@ -784,9 +784,13 @@ def create_new_submission_envelope(self, url, access_token): } response = requests.post(url, headers=headers, json={}) - response_data = response.json() + status_code = response.status_code - return response_data + if status_code == 200 or status_code == 201: + response_data = response.json() + return response_data, response.status_code + else: + return None, status_code def perform_hal_linkage(self, url, input_id, link_to, access_token): """ diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index af35c2e..915ca4d 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -83,39 +83,43 @@ class CmdSubmitFile: submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" submission_envelope_base_url = f"{base_url}/submissionEnvelopes" - def __init__(self, args): - """ - Initialize CmdSubmitFile instance. + class CmdSubmitFile: + def __init__(self, args): + """ + Initialize CmdSubmitFile instance. 
+ + Args: + args: Command-line arguments passed to the script. + """ + self.args = args + self.access_token = get_profile('morphic-util').access_token + self.user_profile = get_profile('morphic-util') + self.aws = Aws(self.user_profile) + self.provider_api = APIProvider(self.base_url) + + if hasattr(self.args, 'action') and self.args.action is not None: + self.action = self.args.action + else: + print("Submission action (ADD, MODIFY or DELETE) is mandatory") + return - Args: - args: Command-line arguments passed to the script. - """ - self.args = args - self.access_token = get_profile('morphic-util').access_token - self.user_profile = get_profile('morphic-util') - self.aws = Aws(self.user_profile) - self.provider_api = APIProvider(self.base_url) - - if hasattr(self.args, 'action') and self.args.action is not None: - self.action = self.args.action - else: - raise Exception("Submission action (ADD, MODIFY or DELETE) is mandatory") - - if hasattr(self.args, 'dataset') and self.args.dataset is not None: - self.dataset = self.args.dataset - else: - raise Exception("Dataset is mandatory to be registered before submitting dataset metadata, " - "We request you to submit your study using the submit option, register your" - "dataset using the same option and link your dataset to your study" - "before proceeding with this submission.") - - if hasattr(self.args, 'file') and self.args.file is not None: - self.file = self.args.file - else: - if self.action != 'DELETE': - raise Exception("File is mandatory") + if hasattr(self.args, 'dataset') and self.args.dataset is not None: + self.dataset = self.args.dataset else: - print("Deleting dataset " + self.dataset) + print("Dataset is mandatory to be registered before submitting dataset metadata, " + "We request you to submit your study using the submit option, register your" + "dataset using the same option and link your dataset to your study" + "before proceeding with this submission.") + return + + if hasattr(self.args, 'file') and 
self.args.file is not None: + self.file = self.args.file + else: + if self.action != 'DELETE': + print("File is mandatory") + return + else: + print("Deleting dataset " + self.dataset) def run(self): """ @@ -153,15 +157,23 @@ def run(self): # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) + submission_envelope_id = None if self.action == 'add' or self.action == 'ADD': - submission_envelope_response = submission_instance.create_new_submission_envelope( + submission_envelope_response, status_code = submission_instance.create_new_submission_envelope( self.submission_envelope_create_url, access_token=self.access_token) - self_url = submission_envelope_response['_links']['self']['href'] - submission_envelope_id = get_id_from_url(self_url) + if status_code == 200 or status_code == 201: + self_url = submission_envelope_response['_links']['self']['href'] + submission_envelope_id = get_id_from_url(self_url) - print("Submission envelope for this submission is: " + submission_envelope_id) + print("Submission envelope for this submission is: " + submission_envelope_id) + else: + if status_code == 401: + message = "Unauthorized, refresh your tokens using the config option" + return False, message + else: + return False, "Encountered failure with " + status_code else: submission_envelope_id = None diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 353d122..40c651d 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -328,7 +328,7 @@ def parse_cell_lines(self, sheet_name, action, column_mapping): protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), zygosity=row.get('cell_line.zygosity'), cell_type=cell_type, - id=row.get('Identifier') + id=row.get('Id') ) ) @@ -408,7 +408,7 @@ def parse_differentiated_cell_lines(self, sheet_name, 
action, column_mapping): timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), model_system=row.get('differentiated_cell_line.model_organ.text'), - id=row.get('Identifier') + id=row.get('Id') ) ) @@ -501,7 +501,7 @@ def parse_library_preparations(self, sheet_name, action, column_mapping): concentration_unit=row.get('library_preparation.concentration_unit'), pcr_cycles=row.get('library_preparation.pcr_cycles'), pcr_cycles_for_sample_index=row.get('library_preparation.pcr_cycles_for_sample_index'), - id=row.get('Identifier') + id=row.get('Id') ) ) @@ -587,7 +587,7 @@ def parse_sequencing_files(self, sheet_name, action, column_mapping): sequencing_protocol_id=sequencing_protocol_id, read_index=read_index, run_id=row.get('sequence_file.run_id'), - id=row.get('Identifier') + id=row.get('Id') ) ) From a3cb8285679bbf60e5fe69c8106cf4472d6f839e Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 4 Jul 2024 11:40:24 +0100 Subject: [PATCH 24/55] pre-deployment small bug fixes --- README.md | 38 ++++-- ait/commons/util/command/submit.py | 44 ++++++- ait/commons/util/command/submit_file.py | 146 +++++++++++------------- ait/commons/util/provider_api_util.py | 60 ++++++++-- setup.py | 32 ++++-- 5 files changed, 216 insertions(+), 104 deletions(-) diff --git a/README.md b/README.md index a846744..6a0f6b6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # morphic-util -CLI tool for uploading data to the Morphic AWS S3 buckets. +CLI tool for submitting analysis data and metadata # Users @@ -9,8 +9,8 @@ CLI tool for uploading data to the Morphic AWS S3 buckets. Users need to have 1. Basic command-line knowledge -2. Python3.x installed on their machine -3. AWS Cognito username and password +2. Python 3.10 installed on their machine +3. 
AWS Cognito username or email and password ## Install @@ -35,8 +35,10 @@ optional arguments: --version, -v show program's version number and exit command: - {config,create,select,list,upload,download,delete} + {config,submit,submit-file,create,select,list,upload,download,delete} config configure AWS credentials + submit submit your study, dataset or biomaterials metadata (incomplete, as not all metadata types are supported yet; expected to be completed in August 2024) + submit-file submit your metadata file containing your cell lines, differentiated cell lines, library preparations and sequencing files create create an upload area (authorised users only) select select or show the active upload area list list contents of the area @@ -79,18 +81,40 @@ positional arguments: password AWS Cognito password ``` -The tool uses the profile name _hca-util_ in local AWS config files. +The tool uses the profile name _morphic-util_ in local AWS config files. + +## `submit` command +Submit your study and dataset metadata and create your AWS upload area for uploading data files + +```shell script +positional arguments: +$ morphic-util submit --type --file + + --type type of metadata being submitted (e.g.
study or dataset) + --file path to the file containing the metadata +``` + +## `submit-file` command +Submit your metadata file containing your cell lines, differentiated cell lines, library preparations and sequencing files + +```shell script +positional arguments: +$ morphic-util submit-file --file --action --dataset + + --file path to the file containing the metadata + --action ADD, MODIFY or DELETE based on the type of submission + --dataset the identifier for the analysis +``` ## `create` command Create an upload area/ project folder **(authorised users only)** ```shell script -$ morphic-util create NAME DPC [-p {u,ud,ux,udx}] +$ morphic-util create NAME [-p {u,ud,ux,udx}] positional arguments: NAME name for the new area/ project folder - DPC center name of the submitter optional arguments: -p {u,ud,ux,udx} allowed actions (permissions) on new area. u for diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 5038a1e..50c0d5e 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -867,8 +867,46 @@ def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, acces def delete_dataset(self, dataset, access_token): # delete_to_provider_api(self.base_url + '/' + dataset, access_token) + """ + self.provider_api.delete_to_provider_api_including_linked_entities( + self.base_url + '/' + 'datasets' + '/' + dataset, + access_token, True) + """ + + # Fetch the dataset from the provider API + fetched_dataset = self.provider_api.get_to_provider_api(self.base_url + '/' + 'datasets' + '/' + dataset, + access_token) + print(f"Dataset fetched successfully {dataset}") + print(f"Initiating delete of {dataset}") + + # Extract lists of biomaterials, processes, and data files from the dataset + biomaterials = fetched_dataset.get('biomaterials', []) + processes = fetched_dataset.get('processes', []) + data_files = fetched_dataset.get('dataFiles', []) + + # Print a message indicating deletion of biomaterials + print("Deleting
Biomaterials:") + + # Iterate over biomaterials and delete each one + for biomaterial in biomaterials: + print(f"Deleting {biomaterial}") + self.provider_api.delete_to_provider_api(self.base_url + '/' + 'biomaterials' + '/' + biomaterial, + access_token) + + # Print a message indicating deletion of processes + print("\nDeleting Processes:") + # Iterate over processes and delete each one + for process in processes: + print(f"Deleting {process}") + self.provider_api.delete_to_provider_api(self.base_url + '/' + 'processes' + '/' + process, access_token) + + # Print a message indicating deletion of data files + print("\nDeleting Data Files:") + # Iterate over data files and delete each one + for data_file in data_files: + print(f"Deleting {data_file}") + self.provider_api.delete_to_provider_api(self.base_url + '/' + 'files' + '/' + data_file, access_token) - self.provider_api.delete_to_provider_api(self.base_url + '/' + 'datasets' + '/' + dataset, - access_token, True) + print(f"\nDeleting the dataset: {dataset}") - print(f"Dataset deleted successfully {dataset}") + self.provider_api.delete_to_provider_api(self.base_url + '/' + 'datasets' + '/' + dataset, access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 915ca4d..371f649 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -1,7 +1,5 @@ # Import necessary modules/classes from ait.commons.util package - import pandas as pd - from ait.commons.util.aws_client import Aws from ait.commons.util.command.list import CmdList from ait.commons.util.command.submit import CmdSubmit, get_id_from_url @@ -21,8 +19,10 @@ def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, da break # Exit the inner loop if a match is found if not match_found: - raise Exception(f"No matching file found for sequencing file: {sequencing_file.file_name} in the " - f"upload area for the dataset: {dataset}") + raise 
Exception( + f"No matching file found for sequencing file: {sequencing_file.file_name} " + f"in the upload area for the dataset: {dataset}" + ) class CmdSubmitFile: @@ -79,47 +79,48 @@ class CmdSubmitFile: "Unnamed: 5": None # Adjust index based on your actual column count } - base_url = 'https://api.ingest.dev.archive.morphic.bio/' + base_url = 'https://api.ingest.dev.archive.morphic.bio' submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" submission_envelope_base_url = f"{base_url}/submissionEnvelopes" - class CmdSubmitFile: - def __init__(self, args): - """ - Initialize CmdSubmitFile instance. - - Args: - args: Command-line arguments passed to the script. - """ - self.args = args - self.access_token = get_profile('morphic-util').access_token - self.user_profile = get_profile('morphic-util') - self.aws = Aws(self.user_profile) - self.provider_api = APIProvider(self.base_url) - - if hasattr(self.args, 'action') and self.args.action is not None: - self.action = self.args.action - else: - print("Submission action (ADD, MODIFY or DELETE) is mandatory") - return + def __init__(self, args): + """ + Initialize CmdSubmitFile instance. - if hasattr(self.args, 'dataset') and self.args.dataset is not None: - self.dataset = self.args.dataset - else: - print("Dataset is mandatory to be registered before submitting dataset metadata, " - "We request you to submit your study using the submit option, register your" - "dataset using the same option and link your dataset to your study" - "before proceeding with this submission.") + Args: + args: Command-line arguments passed to the script. 
+ """ + self.args = args + self.access_token = get_profile('morphic-util').access_token + self.user_profile = get_profile('morphic-util') + self.aws = Aws(self.user_profile) + self.provider_api = APIProvider(self.base_url) + + if hasattr(self.args, 'action') and self.args.action is not None: + self.action = self.args.action + else: + print("Submission action (ADD, MODIFY or DELETE) is mandatory") + return + + if hasattr(self.args, 'dataset') and self.args.dataset is not None: + self.dataset = self.args.dataset + else: + print( + "Dataset is mandatory to be registered before submitting dataset metadata, " + "We request you to submit your study using the submit option, register your " + "dataset using the same option and link your dataset to your study " + "before proceeding with this submission." + ) + return + + if hasattr(self.args, 'file') and self.args.file is not None: + self.file = self.args.file + else: + if self.action != 'DELETE': + print("File is mandatory") return - - if hasattr(self.args, 'file') and self.args.file is not None: - self.file = self.args.file else: - if self.action != 'DELETE': - print("File is mandatory") - return - else: - print("Deleting dataset " + self.dataset) + print(f"Deleting dataset {self.dataset}") def run(self): """ @@ -128,31 +129,31 @@ def run(self): submission_instance = CmdSubmit(self) if self.action == 'delete' or self.action == 'DELETE': + self.file = None submission_instance.delete_dataset(self.dataset, self.access_token) + return True, None list_instance = CmdList(self.aws, self.args) - - list_of_files_in_upload_area = (list_instance. 
- list_bucket_contents_and_return(self.dataset, '')) + list_of_files_in_upload_area = list_instance.list_bucket_contents_and_return(self.dataset, '') if self.file: # Initialize SpreadsheetParser with the provided file path parser = SpreadsheetSubmitter(self.file) # Parse different sections of the spreadsheet using defined column mappings - cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, - self.cellline_column_mapping) + cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, self.cellline_column_mapping) differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action, self.differentiated_cellline_column_mapping) + 'Differentiated cell line', self.action, self.differentiated_cellline_column_mapping + ) parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) - library_preparations, library_preparations_df = (parser - .get_library_preparations('Library preparation', - self.action, - self.library_preparation_column_mapping)) + library_preparations, library_preparations_df = parser.get_library_preparations( + 'Library preparation', self.action, self.library_preparation_column_mapping + ) parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) - sequencing_files, sequencing_files_df = parser.get_sequencing_files('Sequence file', self.action, - self.sequencing_file_column_mapping) + sequencing_files, sequencing_files_df = parser.get_sequencing_files( + 'Sequence file', self.action, self.sequencing_file_column_mapping + ) # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) @@ -161,51 +162,42 @@ def run(self): if self.action == 'add' or self.action == 'ADD': submission_envelope_response, status_code = submission_instance.create_new_submission_envelope( - self.submission_envelope_create_url, - access_token=self.access_token) + 
self.submission_envelope_create_url, access_token=self.access_token + ) if status_code == 200 or status_code == 201: self_url = submission_envelope_response['_links']['self']['href'] submission_envelope_id = get_id_from_url(self_url) - - print("Submission envelope for this submission is: " + submission_envelope_id) + print(f"Submission envelope for this submission is: {submission_envelope_id}") else: if status_code == 401: message = "Unauthorized, refresh your tokens using the config option" return False, message else: - return False, "Encountered failure with " + status_code + return False, f"Encountered failure with {status_code}" else: submission_envelope_id = None # Perform the submission and get the updated dataframes try: - (updated_cell_lines_df, updated_differentiated_cell_lines_df, - updated_library_preparations_df, - updated_sequencing_files_df, message) = submission_instance.multi_type_submission( - cell_lines, - cell_lines_df, - differentiated_cell_lines_df, - library_preparations_df, - sequencing_files_df, - submission_envelope_id, - self.dataset, - self.access_token, - self.action + ( + updated_cell_lines_df, updated_differentiated_cell_lines_df, + updated_library_preparations_df, updated_sequencing_files_df, + message + ) = submission_instance.multi_type_submission( + cell_lines, cell_lines_df, differentiated_cell_lines_df, + library_preparations_df, sequencing_files_df, submission_envelope_id, + self.dataset, self.access_token, self.action ) # Save the updated dataframes to a single Excel file with multiple sheets if message == 'SUCCESS': - output_file = "updated_cell_lines.xlsx" + output_file = "submission-result.xlsx" with pd.ExcelWriter(output_file, engine='openpyxl') as writer: - updated_cell_lines_df.to_excel(writer, - sheet_name='Cell line', index=False) - updated_differentiated_cell_lines_df.to_excel(writer, - sheet_name='Differentiated cell line', + updated_cell_lines_df.to_excel(writer, sheet_name='Cell line', index=False) + 
updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='Differentiated cell line', index=False) - updated_library_preparations_df.to_excel(writer, - sheet_name='Library preparation', index=False) - updated_sequencing_files_df.to_excel(writer, - sheet_name='Sequence file', index=False) + updated_library_preparations_df.to_excel(writer, sheet_name='Library preparation', index=False) + updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence file', index=False) return True, message else: diff --git a/ait/commons/util/provider_api_util.py b/ait/commons/util/provider_api_util.py index 14c47c5..774dad6 100644 --- a/ait/commons/util/provider_api_util.py +++ b/ait/commons/util/provider_api_util.py @@ -19,31 +19,73 @@ def send_request(self, method, url, access_token, params=None, data=None, data_t Returns: dict or str: The response data for PUT/DELETE or the URL for POST requests. + + Behavior + + Headers Setup: Constructs the request headers, setting Content-Type to application/json and adding the Authorization header with the provided access_token. + + Sending Request: Sends the HTTP request using the requests.request method with the specified method, URL, headers, params, and data. + + Response Handling: + Checks the status code of the response. + + If the status code is not one of 200, 201, 202, or 204, prints an error message and: + For DELETE requests, returns None. + For other methods, raises an exception using response.raise_for_status(). + + For POST requests with a data_type_in_hal_link provided, + returns the URL from the _links section of the response. + + For DELETE requests, returns the status code. + + For other successful requests, returns the JSON-parsed response data. 
""" + # Construct the request headers headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {access_token}' } + # Send the HTTP request response = requests.request(method, url, headers=headers, params=params, json=data) + status_code = response.status_code - # Check for successful response - if not response.ok: - response.raise_for_status() - - response_data = response.json() + # Check for unsuccessful status codes + if status_code not in (200, 201, 202, 204): + print(f"Received {status_code} while executing {method} on {url}") + if method == 'DELETE': + # Return None for unsuccessful DELETE requests + return None + else: + # Raise an exception for other unsuccessful requests + raise response.raise_for_status() + else: + print(f"Received {status_code} while executing {method} on {url}") + # Handle POST requests with data_type_in_hal_link if method == 'POST' and data_type_in_hal_link: + response_data = response.json() + # Return the URL from the HAL link in the response return response_data['_links'][data_type_in_hal_link]['href'] - return response_data + elif method == 'DELETE': + # Return the status code for DELETE requests + return status_code + else: + # Return the JSON-parsed response data for other successful requests + return response.json() def put_to_provider_api(self, url, access_token): return self.send_request('PUT', url, access_token) - # TODO: have a generic delete and also a delete with params - def delete_to_provider_api(self, url, access_token, delete_linked_entities=False): + def get_to_provider_api(self, url, access_token): + return self.send_request('GET', url, access_token) + + def delete_to_provider_api_including_linked_entities(self, url, access_token, delete_linked_entities=False): params = {'deleteLinkedEntities': str(delete_linked_entities).lower()} - self.send_request('DELETE', url, access_token, params=params) + return self.send_request('DELETE', url, access_token, params=params) + + def delete_to_provider_api(self, 
url, access_token): + return self.send_request('DELETE', url, access_token) def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): return self.send_request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link) diff --git a/setup.py b/setup.py index 49ee058..2b9c530 100644 --- a/setup.py +++ b/setup.py @@ -3,15 +3,31 @@ from setuptools import setup from ait.commons.util.settings import NAME, VERSION, DESC, AUTHOR, AUTHOR_EMAIL -# directory containing this file +# Directory containing this file HERE = pathlib.Path(__file__).parent -# text of the README file +# Text of the README file README = (HERE / 'README.md').read_text() -# install requirements +# Install requirements from requirements.txt INSTALL_REQS = [line.rstrip() for line in open(os.path.join(os.path.dirname(__file__), 'requirements.txt'))] +# Additional install requirements +ADDITIONAL_REQS = [ + 'boto3>=1.23.10', + 'botocore>=1.26.10', + 'filetype==1.0.7', + 'requests>=2.20.0, <3', + 'urllib3<1.27, >=1.25.4', + 'tqdm~=4.57.0', + 'pandas~=2.2.2', + 'setuptools~=59.6.0', + 'openpyxl==3.1.3' +] + +# Combine the install requirements +ALL_REQS = INSTALL_REQS + ADDITIONAL_REQS + # This call to setup() does all the work setup( # dashes are ok in repo and PyPI dist names but not in package (i.e. 
directory) and @@ -25,18 +41,18 @@ author=AUTHOR, author_email=AUTHOR_EMAIL, license='Apache License', - python_requires='>=3.6', + python_requires='>=3.10', classifiers=[ 'License :: OSI Approved :: Apache Software License', 'Operating System :: MacOS :: MacOS X', 'Operating System :: POSIX', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], platforms=['MacOS X', 'Posix'], - packages=['ait.commons.util','ait.commons.util.settings', 'ait.commons.util.command'], + packages=['ait.commons.util', 'ait.commons.util.settings', 'ait.commons.util.command'], include_package_data=True, - install_requires=INSTALL_REQS, + install_requires=ALL_REQS, entry_points={ 'console_scripts': [ f'{NAME}=ait.commons.util.__main__:main', From 0a9b070216a0167bc586abd9e938b36defff3b6a Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 5 Jul 2024 09:41:18 +0100 Subject: [PATCH 25/55] pre-deployment small bug fixes --- ait/commons/util/command/submit_file.py | 2 +- ait/commons/util/settings/morphic_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 371f649..6cda10a 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -155,7 +155,7 @@ def run(self): 'Sequence file', self.action, self.sequencing_file_column_mapping ) - # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) + validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) submission_envelope_id = None diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 2b9d695..ea92fac 100644 --- a/ait/commons/util/settings/morphic_util.py +++ 
b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.14' +VERSION = '0.0.16' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From ac27181ab3f0e29c0fb4a3a312124aa20e19d323 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 5 Jul 2024 10:20:27 +0100 Subject: [PATCH 26/55] use new cognito --- ait/commons/util/settings/morphic_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index ea92fac..15778ed 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -33,9 +33,9 @@ # Cognito and IAM COGNITO_MORPHIC_UTIL_ADMIN = 'morphic-dev-admin' -COGNITO_CLIENT_ID = '178j951qnfuheicm2m5rqqvg6q' +COGNITO_CLIENT_ID = '1rfis94rvnden5elmocospd256' COGNITO_IDENTITY_POOL_ID = 'eu-west-2:d6531e9c-020d-4ee8-bf3b-255393c500e9' -COGNITO_USER_POOL_ID = 'eu-west-2_b4EyaLNCM' +COGNITO_USER_POOL_ID = 'eu-west-2_Aqtqtg7u7' IAM_USER = 'morphic-dev-admin' AWS_SECRET_NAME_AK_BUCKET = 'AK-bucket' From 950d61a876e95f96983e205ca5ce9a639c8686b6 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 5 Jul 2024 10:21:37 +0100 Subject: [PATCH 27/55] version increment --- ait/commons/util/settings/morphic_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 15778ed..42d7519 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.16' +VERSION = '0.0.17' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From 4a87c8c9935ecb6306d852f45334c42deb5a2011 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 8 Aug 
2024 15:29:15 +0100 Subject: [PATCH 28/55] defect fixes --- ait/commons/util/command/submit.py | 274 ++++++++++-------------- ait/commons/util/command/submit_file.py | 71 +----- ait/commons/util/spreadsheet_util.py | 139 +++++++++--- 3 files changed, 230 insertions(+), 254 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 50c0d5e..92eb26d 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -492,39 +492,21 @@ def typed_submission(self, type, file, access_token): tuple: A tuple containing a boolean indicating success and the ID of the created entity. """ if type in ['study', 'dataset', 'biomaterial', 'process', 'file']: - if file is not None: - data = self.transform(file) - else: - data = {} + data = self.transform(file) if file is not None else {} entity_id = self.create_new_envelope_and_submit_entity(type, data, access_token) - if entity_id is not None: + if entity_id: if type == 'dataset': - if self.args.study is not None: - study_id = self.args.study - self.link_dataset_to_study(entity_id, study_id, access_token) - else: - link_to_study = input("Do you want to link this dataset to a study? " - "(yes/no): ").lower() - if link_to_study == 'yes': - study_id = input("Input study id: ").lower() - self.link_dataset_to_study(entity_id, study_id, access_token) + study_id = self.args.study or input("Input study id to link this dataset: ").lower() + self.link_dataset_to_study(entity_id, study_id, access_token) elif type == 'biomaterial': - if self.args.dataset is not None: - dataset_id = self.args.dataset - self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) - else: - link_to_dataset = input("Do you want to link this biomaterial to a " - "dataset? 
(yes/no): ").lower() - if link_to_dataset == 'yes': - dataset_id = input("Input dataset id: ").lower() - self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) - - # Linking biomaterial to process - if self.args.process is not None: - process_id = self.args.process + dataset_id = self.args.dataset or input("Input dataset id to link this biomaterial: ").lower() + self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) + + process_id = self.args.process + if process_id: self.link_biomaterial_to_process(entity_id, process_id, access_token) return True, entity_id @@ -537,29 +519,27 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ Creates and submits a new entity (study, dataset, biomaterial, or process) and returns its ID. Parameters: - input_entity_type (str): The type of entity to create ('study', 'dataset', - 'biomaterial', 'process'). + input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). data (dict): The data to be submitted. access_token (str): Access token for authorization. Returns: str: The ID of the created entity. """ - if input_entity_type == 'study': - halEntity = 'studies' - elif input_entity_type == 'dataset': - halEntity = 'datasets' - elif input_entity_type == 'biomaterial': - halEntity = 'biomaterials' - elif input_entity_type == 'process': - halEntity = 'processes' - - entity_create_url_from_sub_env_hal_links = (self. 
- post_to_provider_api(self.submission_envelope_create_url, - halEntity, None, - access_token)) - entity_self_hal_link = self.post_to_provider_api(entity_create_url_from_sub_env_hal_links, - 'self', data, access_token) + entity_map = { + 'study': 'studies', + 'dataset': 'datasets', + 'biomaterial': 'biomaterials', + 'process': 'processes' + } + hal_entity = entity_map.get(input_entity_type) + + if not hal_entity: + return None + + entity_create_url = self.post_to_provider_api(self.submission_envelope_create_url, hal_entity, None, + access_token) + entity_self_hal_link = self.post_to_provider_api(entity_create_url, 'self', data, access_token) entity_id = get_id_from_url(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -567,31 +547,33 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ return entity_id def patchEntity(self, input_entity_type, id, data, access_token): - if input_entity_type == 'study': - halEntity = 'studies' - elif input_entity_type == 'dataset': - halEntity = 'datasets' - elif input_entity_type == 'biomaterial': - halEntity = 'biomaterials' - elif input_entity_type == 'process': - halEntity = 'processes' - elif input_entity_type == 'file': - halEntity = 'files' - - entity_patch_url = self.base_url + '/' + halEntity + '/' + id + entity_map = { + 'study': 'studies', + 'dataset': 'datasets', + 'biomaterial': 'biomaterials', + 'process': 'processes', + 'file': 'files' + } + hal_entity = entity_map.get(input_entity_type) + + if not hal_entity: + return False + entity_patch_url = f"{self.base_url}/{hal_entity}/{id}" return self.patch_to_provider_api(entity_patch_url, data, access_token) def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token): - if input_entity_type == 'biomaterial': - halEntity = 'biomaterials' - elif input_entity_type == 'process': - halEntity = 'processes' - elif input_entity_type == 'file': - halEntity = 'files' + entity_map = 
{ + 'biomaterial': 'biomaterials', + 'process': 'processes', + 'file': 'files' + } + hal_entity = entity_map.get(input_entity_type) - put_url = self.base_url + '/' + 'datasets' + '/' + dataset_id + '/' + halEntity + '/' + entity_id + if not hal_entity: + return False + put_url = f"{self.base_url}/datasets/{dataset_id}/{hal_entity}/{entity_id}" return self.provider_api.put_to_provider_api(put_url, access_token) def patch_to_provider_api(self, entity_patch_url, data, access_token): @@ -601,19 +583,14 @@ def patch_to_provider_api(self, entity_patch_url, data, access_token): } response = requests.patch(entity_patch_url, headers=headers, json=data) + return response.status_code // 100 == 2 - if response.status_code // 100 == 2: - return True - return False - - def use_existing_envelope_and_submit_entity(self, input_entity_type, data, - submission_envelope_id, access_token): + def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submission_envelope_id, access_token): """ Submits an entity using an existing submission envelope and returns its ID. Parameters: - input_entity_type (str): The type of entity to create ('study', - 'dataset', 'biomaterial', 'process'). + input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). data (dict): The data to be submitted. submission_envelope_id (str): ID of the submission envelope. access_token (str): Access token for authorization. @@ -621,22 +598,20 @@ def use_existing_envelope_and_submit_entity(self, input_entity_type, data, Returns: str: The ID of the created entity. 
""" - if input_entity_type == 'study': - halEntity = 'studies' - elif input_entity_type == 'dataset': - halEntity = 'datasets' - elif input_entity_type == 'biomaterial': - halEntity = 'biomaterials' - elif input_entity_type == 'process': - halEntity = 'processes' - elif input_entity_type == 'file': - halEntity = 'files' - - entity_create_url_from_sub_env_hal_links = (self.submission_envelope_base_url - + "/" + submission_envelope_id - + "/" + halEntity) - entity_self_hal_link = self.post_to_provider_api(entity_create_url_from_sub_env_hal_links, - 'self', data, access_token) + entity_map = { + 'study': 'studies', + 'dataset': 'datasets', + 'biomaterial': 'biomaterials', + 'process': 'processes', + 'file': 'files' + } + hal_entity = entity_map.get(input_entity_type) + + if not hal_entity: + return None + + entity_create_url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/{hal_entity}" + entity_self_hal_link = self.post_to_provider_api(entity_create_url, 'self', data, access_token) entity_id = get_id_from_url(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -654,9 +629,8 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): """ print(f"Linking dataset {dataset_id} to study {study_id}") - self.provider_api.put_to_provider_api( - f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}", - access_token) + url = f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}" + self.provider_api.put_to_provider_api(url, access_token) print(f"Dataset linked successfully to study: {study_id}") @@ -671,8 +645,8 @@ def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): """ print(f"Linking biomaterial {biomaterial_id} to dataset {dataset_id}") - self.provider_api.put_to_provider_api( - f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}", access_token) + url = f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" + 
self.provider_api.put_to_provider_api(url, access_token) print(f"Biomaterial linked successfully to dataset: {dataset_id}") @@ -687,9 +661,8 @@ def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): """ print(f"Linking biomaterial {biomaterial_id} to process {process_id}") - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses", - process_id, 'processes', access_token) + url = f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses" + self.perform_hal_linkage(url, process_id, 'processes', access_token) def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): """ @@ -720,12 +693,12 @@ def delete_submission(self, submission_envelope_id, access_token, force_delete=F Sends a DELETE request to delete a submission envelope. Parameters: - - submission_envelope_id (str): ID of the submission envelope to delete. - - access_token (str): Access token for authorization. - - force_delete (bool): Whether to force delete the submission envelope (default: False). + submission_envelope_id (str): ID of the submission envelope to delete. + access_token (str): Access token for authorization. + force_delete (bool): Whether to force delete the submission envelope (default: False). Returns: - - str: The URL from the response. + bool: True if the deletion was successful, False otherwise. 
""" url = f"{self.submission_envelope_base_url}/{submission_envelope_id}" headers = { @@ -733,20 +706,15 @@ def delete_submission(self, submission_envelope_id, access_token, force_delete=F 'Authorization': f'Bearer {access_token}' } - params = { - 'force': 'true' if force_delete else 'false' - } + params = {'force': str(force_delete).lower()} response = requests.delete(url, headers=headers, params=params) - # Check if the status code indicates success (2xx) - if response.status_code // 100 == 2: - return True - return False + return response.status_code // 100 == 2 def post_to_provider_api_and_get_entity_id(self, url, data, access_token): """ - Sends a POST request to the specified URL. + Sends a POST request to the specified URL and returns the entity ID from the response. Parameters: url (str): The URL to send the request to. @@ -754,7 +722,7 @@ def post_to_provider_api_and_get_entity_id(self, url, data, access_token): access_token (str): Access token for authorization. Returns: - str: The URL from the response. + str: The entity ID extracted from the response URL. """ headers = { 'Content-Type': 'application/json', @@ -763,9 +731,9 @@ def post_to_provider_api_and_get_entity_id(self, url, data, access_token): response = requests.post(url, headers=headers, json=data) response_data = response.json() - url = response_data['_links']['self']['href'] + entity_url = response_data['_links']['self']['href'] - return get_id_from_url(url) + return get_id_from_url(entity_url) def create_new_submission_envelope(self, url, access_token): """ @@ -776,7 +744,7 @@ def create_new_submission_envelope(self, url, access_token): access_token (str): Access token for authorization. Returns: - dict: The response data. + tuple: A tuple containing the response data and the status code. 
""" headers = { 'Content-Type': 'application/json', @@ -786,15 +754,15 @@ def create_new_submission_envelope(self, url, access_token): response = requests.post(url, headers=headers, json={}) status_code = response.status_code - if status_code == 200 or status_code == 201: + if status_code in {200, 201}: response_data = response.json() - return response_data, response.status_code - else: - return None, status_code + return response_data, status_code + + return None, status_code def perform_hal_linkage(self, url, input_id, link_to, access_token): """ - Performs HAL linkage. + Performs HAL linkage by sending a POST request. Parameters: url (str): The URL to send the request to. @@ -802,22 +770,21 @@ def perform_hal_linkage(self, url, input_id, link_to, access_token): link_to (str): The entity to link to. access_token (str): Access token for authorization. - Returns: - dict: The response data. + Raises: + Exception: If the linkage fails. """ headers = { 'Content-Type': 'text/uri-list', 'Authorization': f'Bearer {access_token}' } - response = requests.post(url, headers=headers, - data=f"{self.base_url}/{link_to}/{input_id}") + response = requests.post(url, headers=headers, data=f"{self.base_url}/{link_to}/{input_id}") if response.status_code != 200: raise Exception(f"Failed to link biomaterial to process {input_id}. " f"Status code: {response.status_code}, Response: {response.text}") else: - print("linkage successful") + print("Linkage successful") def transform(self, file): """ @@ -829,84 +796,75 @@ def transform(self, file): Returns: dict: The JSON object. 
""" - if self.args.file.endswith('.tsv'): + if file.endswith('.tsv'): json_data = [] with open(file, 'r', newline='') as file: reader = csv.DictReader(file, delimiter='\t') for row in reader: json_data.append(row) - json_data_formatted = {'content': json_data} - data = json_data_formatted + return {'content': json_data} + elif file.endswith('.csv'): df = pd.read_csv(file) - data = {'content': df.to_dict(orient='records')} + return {'content': df.to_dict(orient='records')} + else: with open(file, 'r') as file: - data = json.load(file) - return data + return json.load(file) def create_child_biomaterial(self, cell_line_entity_id, body, access_token): - url = self.base_url + '/' + 'biomaterials' + '/' + cell_line_entity_id + '/' + 'childBiomaterials' + url = f"{self.base_url}/biomaterials/{cell_line_entity_id}/childBiomaterials" entity_id = self.post_to_provider_api_and_get_entity_id(url, body, access_token) return entity_id def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, access_token): - # TODO: handle other types - global url + """ + Links an entity to a submission envelope. + Parameters: + type (str): The type of the entity (e.g., 'biomaterial', 'file'). + entity_id (str): The ID of the entity to link. + submission_envelope_id (str): The ID of the submission envelope. + access_token (str): Access token for authorization. 
+ """ if type == 'biomaterial': - url = (self.submission_envelope_base_url + '/' + submission_envelope_id + - '/' + 'biomaterials' + '/' + entity_id) + url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/biomaterials/{entity_id}" elif type == 'file': - url = (self.submission_envelope_base_url + '/' + submission_envelope_id + - '/' + 'files' + '/' + entity_id) + url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/files/{entity_id}" - # self.put_to_provider_api(url, access_token) self.provider_api.put_to_provider_api(url, access_token) def delete_dataset(self, dataset, access_token): - # delete_to_provider_api(self.base_url + '/' + dataset, access_token) - """ - self.provider_api.delete_to_provider_api_including_linked_entities( - self.base_url + '/' + 'datasets' + '/' + dataset, - access_token, True) """ + Deletes a dataset along with its associated biomaterials, processes, and data files. - # Fetch the dataset from the provider API - fetched_dataset = self.provider_api.get_to_provider_api(self.base_url + '/' + 'datasets' + '/' + dataset, - access_token) - print(f"Dataset fetched successfully {dataset}") + Parameters: + dataset (str): The ID of the dataset to delete. + access_token (str): Access token for authorization. 
+ """ + fetched_dataset = self.provider_api.get_to_provider_api(f"{self.base_url}/datasets/{dataset}", access_token) + print(f"Dataset fetched successfully: {dataset}") print(f"Initiating delete of {dataset}") - # Extract lists of biomaterials, processes, and data files from the dataset biomaterials = fetched_dataset.get('biomaterials', []) processes = fetched_dataset.get('processes', []) data_files = fetched_dataset.get('dataFiles', []) - # Print a message indicating deletion of biomaterials print("Deleting Biomaterials:") - - # Iterate over biomaterials and delete each one for biomaterial in biomaterials: print(f"Deleting {biomaterial}") - self.provider_api.delete_to_provider_api(self.base_url + '/' + 'biomaterials' + '/' + biomaterial, - access_token) + self.provider_api.delete_to_provider_api(f"{self.base_url}/biomaterials/{biomaterial}", access_token) - # Print a message indicating deletion of processes print("\nDeleting Processes:") - # Iterate over processes and delete each one for process in processes: print(f"Deleting {process}") - self.provider_api.delete_to_provider_api(self.base_url + '/' + 'processes' + '/' + process, access_token) + self.provider_api.delete_to_provider_api(f"{self.base_url}/processes/{process}", access_token) - # Print a message indicating deletion of data files print("\nDeleting Data Files:") - # Iterate over data files and delete each one for data_file in data_files: print(f"Deleting {data_file}") - self.provider_api.delete_to_provider_api(self.base_url + '/' + 'files' + '/' + data_file, access_token) + self.provider_api.delete_to_provider_api(f"{self.base_url}/files/{data_file}", access_token) print(f"\nDeleting the dataset: {dataset}") - - self.provider_api.delete_to_provider_api(self.base_url + '/' + 'datasets' + '/' + dataset, access_token) + self.provider_api.delete_to_provider_api(f"{self.base_url}/datasets/{dataset}", access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py 
index 6cda10a..b7c8452 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -26,59 +26,6 @@ def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, da class CmdSubmitFile: - # Column mappings for parsing different sections of the spreadsheet - cellline_column_mapping = { - "CELL LINE ID (Required)": "cell_line.biomaterial_core.biomaterial_id", - "CELL LINE DESCRIPTION": "cell_line.biomaterial_core.biomaterial_description", - "DERIVED FROM CELL LINE NAME (Required)": "cell_line.derived_cell_line_accession", - "CLONE ID": "cell_line.clone_id", - "GENE EXPRESSION ALTERATION PROTOCOL ID": "gene_expression_alteration_protocol.protocol_core.protocol_id", - "ZYGOSITY": "cell_line.zygosity", - "CELL LINE TYPE (Required)": "cell_line.type", - "Unnamed: 7": None, - "Unnamed: 8": None - } - - differentiated_cellline_column_mapping = { - "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", - "DIFFERENTIATED CELL LINE DESCRIPTION": "differentiated_cell_line.biomaterial_core.biomaterial_description", - "INPUT CELL LINE ID (Required)": "cell_line.biomaterial_core.biomaterial_id", - "DIFFERENTIATION PROTOCOL ID (Required)": "differentiation_protocol.protocol_core.protocol_id", - "TIMEPOINT VALUE": "differentiated_cell_line.timepoint_value", - "TIMEPOINT UNIT": "differentiated_cell_line.timepoint_unit.text", - "TERMINALLY DIFFERENTIATED": "differentiated_cell_line.terminally_differentiated", - "FINAL LINEAGE STAGE": "differentiated_cell_line.terminally_differentiated", - "Model System": "cell_line.model_organ.text", - "MODEL SYSTEM": "cell_line.model_organ.text", - "Unnamed: 8": None - } - - library_preparation_column_mapping = { - "LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", - "LIBRARY PREPARATION PROTOCOL ID (Required)": "library_preparation_protocol.protocol_core.protocol_id", - "DISSOCIATION PROTOCOL ID 
(Required)": "dissociation_protocol.protocol_core.protocol_id", - "DIFFERENTIATED CELL LINE ID (Required)": "differentiated_cell_line.biomaterial_core.biomaterial_id", - "LIBRARY AVERAGE FRAGMENT SIZE": "library_preparation.average_fragment_size", - "LIBRARY INPUT AMOUNT VALUE": "library_preparation.input_amount_value", - "LIBRARY INPUT AMOUNT UNIT": "library_preparation.input_amount_unit", - "LIBRARY FINAL YIELD VALUE": "library_preparation.final_yield_value", - "LIBRARY FINAL YIELD UNIT": "library_preparation.final_yield_unit", - "LIBRARY CONCENTRATION VALUE": "library_preparation.concentration_value", - "LIBRARY CONCENTRATION UNIT": "library_preparation.concentration_unit", - "LIBRARY PCR CYCLES": "library_preparation.pcr_cycles", - "LIBRARY PCR CYCLES FOR SAMPLE INDEX": "library_preparation.pcr_cycles_for_sample_index", - "Unnamed: 14": None # Adjust index based on your actual column count - } - - sequencing_file_column_mapping = { - "FILE NAME (Required)": "sequence_file.file_core.file_name", - "INPUT LIBRARY PREPARATION ID (Required)": "library_preparation.biomaterial_core.biomaterial_id", - "SEQUENCING PROTOCOL ID (Required)": "sequencing_protocol.protocol_core.protocol_id", - "READ INDEX (Required)": "sequence_file.read_index", - "RUN ID": "sequence_file.run_id", - "Unnamed: 5": None # Adjust index based on your actual column count - } - base_url = 'https://api.ingest.dev.archive.morphic.bio' submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" submission_envelope_base_url = f"{base_url}/submissionEnvelopes" @@ -95,6 +42,7 @@ def __init__(self, args): self.user_profile = get_profile('morphic-util') self.aws = Aws(self.user_profile) self.provider_api = APIProvider(self.base_url) + self.errors = [] if hasattr(self.args, 'action') and self.args.action is not None: self.action = self.args.action @@ -141,21 +89,18 @@ def run(self): parser = SpreadsheetSubmitter(self.file) # Parse different sections of the spreadsheet using defined 
column mappings - cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, self.cellline_column_mapping) + cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, self.errors) differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action, self.differentiated_cellline_column_mapping - ) - parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) + 'Differentiated cell line', self.action) + parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.errors) library_preparations, library_preparations_df = parser.get_library_preparations( - 'Library preparation', self.action, self.library_preparation_column_mapping - ) + 'Library preparation', self.action) parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) sequencing_files, sequencing_files_df = parser.get_sequencing_files( - 'Sequence file', self.action, self.sequencing_file_column_mapping - ) + 'Sequence file', self.action) - validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) + # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) submission_envelope_id = None @@ -170,7 +115,7 @@ def run(self): print(f"Submission envelope for this submission is: {submission_envelope_id}") else: if status_code == 401: - message = "Unauthorized, refresh your tokens using the config option" + message = "Unauthorized, refresh your access token using the config option" return False, message else: return False, f"Encountered failure with {status_code}" diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 40c651d..5dd4519 100644 --- a/ait/commons/util/spreadsheet_util.py +++ 
b/ait/commons/util/spreadsheet_util.py @@ -9,7 +9,7 @@ def __init__(self, message): super().__init__(self.message) -class MissingEntityError(Exception): +class MissingParentEntityError(Exception): """Custom exception raised when an expected entity is missing.""" def __init__(self, missing_type, entity_type, missing_id): @@ -19,6 +19,13 @@ def __init__(self, missing_type, entity_type, missing_id): self.missing_id = missing_id +class OrphanedEntityError(Exception): + def __init__(self, type, id): + super().__init__(f"Orphaned entity {type} and ID is {id}") + self.type = type + self.id = id + + class CellLine: def __init__(self, biomaterial_id, description, derived_accession, clone_id, protocol_id, zygosity, cell_type, id): @@ -253,7 +260,7 @@ def list_sheets(self): xls = pd.ExcelFile(self.file_path, engine='openpyxl') return xls.sheet_names - def parse_cell_lines(self, sheet_name, action, column_mapping): + def parse_cell_lines(self, sheet_name, action, errors): """ Parses data related to cell lines from a specified sheet in the Excel file. 
@@ -274,11 +281,11 @@ def parse_cell_lines(self, sheet_name, action, column_mapping): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 0 + skip_rows = 3 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + # df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] @@ -312,12 +319,15 @@ def parse_cell_lines(self, sheet_name, action, column_mapping): # Check if biomaterial_id is null if pd.isnull(biomaterial_id): + errors.append("Biomaterial ID cannot be null.") raise MissingMandatoryFieldError("Biomaterial ID cannot be null.") # Check if derived_accession and cell_type are present if pd.isnull(derived_accession) or pd.isnull(cell_type): + errors.append(f"Mandatory fields (derived_accession, cell_type) are required. {biomaterial_id}") + raise MissingMandatoryFieldError( - "Mandatory fields (derived_accession, cell_type) are required. " + biomaterial_id) + f"Mandatory fields (derived_accession, cell_type) are required. {biomaterial_id}") cell_lines.append( CellLine( @@ -334,7 +344,7 @@ def parse_cell_lines(self, sheet_name, action, column_mapping): return cell_lines, df_filtered - def parse_differentiated_cell_lines(self, sheet_name, action, column_mapping): + def parse_differentiated_cell_lines(self, sheet_name, action): """ Parses data related to differentiated cell lines from a specified sheet in the Excel file. 
@@ -353,11 +363,11 @@ def parse_differentiated_cell_lines(self, sheet_name, action, column_mapping): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 0 + skip_rows = 3 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + # df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] @@ -414,7 +424,7 @@ def parse_differentiated_cell_lines(self, sheet_name, action, column_mapping): return differentiated_cell_lines, df_filtered - def parse_library_preparations(self, sheet_name, action, column_mapping): + def parse_library_preparations(self, sheet_name, action): """ Parses data related to library preparations from a specified sheet in the Excel file. @@ -431,11 +441,11 @@ def parse_library_preparations(self, sheet_name, action, column_mapping): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 0 + skip_rows = 3 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + # df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] @@ -507,7 +517,7 @@ def parse_library_preparations(self, sheet_name, action, column_mapping): return library_preparations, df_filtered - def parse_sequencing_files(self, sheet_name, action, column_mapping): + def parse_sequencing_files(self, sheet_name, action): """ Parses data related to sequencing files from a specified sheet in the Excel file. 
@@ -524,11 +534,11 @@ def parse_sequencing_files(self, sheet_name, action, column_mapping): if action.upper() == 'MODIFY': skip_rows = 0 else: - skip_rows = 0 + skip_rows = 3 df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) df.columns = df.columns.str.strip() - df = df.rename(columns=column_mapping) + # df = df.rename(columns=column_mapping) # Remove unnamed columns (columns without headers) # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] @@ -593,7 +603,7 @@ def parse_sequencing_files(self, sheet_name, action, column_mapping): return sequencing_files, df_filtered - def get_cell_lines(self, sheet_name, action, column_mapping): + def get_cell_lines(self, sheet_name, action, errors): """ Retrieves parsed cell lines data from a specified sheet in the Excel file. @@ -609,10 +619,10 @@ def get_cell_lines(self, sheet_name, action, column_mapping): list A list of CellLine objects parsed from the specified sheet. """ - cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, action, column_mapping) + cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, action, errors) return cell_lines, cell_lines_df - def get_differentiated_cell_lines(self, sheet_name, action, column_mapping): + def get_differentiated_cell_lines(self, sheet_name, action): """ Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. @@ -630,11 +640,11 @@ def get_differentiated_cell_lines(self, sheet_name, action, column_mapping): """ differentiated_cell_lines, differentiated_cell_lines_df = (self. 
parse_differentiated_cell_lines - (sheet_name, action, - column_mapping)) + (sheet_name, action)) return differentiated_cell_lines, differentiated_cell_lines_df - def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiated_cell_lines): + def merge_cell_line_and_differentiated_cell_line(self, cell_lines, + differentiated_cell_lines, errors): """ Merges cell lines and differentiated cell lines based on their biomaterial IDs. @@ -654,13 +664,24 @@ def merge_cell_line_and_differentiated_cell_line(self, cell_lines, differentiate MissingEntityError: If a differentiated cell line does not have a corresponding cell line. """ + + self.find_orphans( + source_entities=cell_lines, + target_entities=differentiated_cell_lines, + source_attr="biomaterial_id", + target_attr="input_biomaterial_id", + source_type="Cell line", + target_type="Differentiated Cell line", + errors=errors + ) + cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} for differentiated_cell_line in differentiated_cell_lines: if differentiated_cell_line.input_biomaterial_id not in cell_line_ids: - raise MissingEntityError("Cell Line", - "Differentiated cell line", - differentiated_cell_line.biomaterial_id) + raise MissingParentEntityError("Cell Line", + "Differentiated cell line", + differentiated_cell_line.biomaterial_id) for cell_line in cell_lines: for differentiated_cell_line in differentiated_cell_lines: @@ -687,13 +708,24 @@ def merge_differentiated_cell_line_and_library_preparation(self, differentiated_ MissingEntityError: If a library preparation does not have a corresponding differentiated cell line. 
""" + + self.find_orphans( + source_entities=differentiated_cell_lines, + target_entities=library_preparations, + source_attr="biomaterial_id", + target_attr="differentiated_biomaterial_id", + source_type="Differentiated Cell line", + target_type="Library Preparation", + errors=[] + ) + differentiated_ids = {diff_cell.biomaterial_id for diff_cell in differentiated_cell_lines} for library_preparation in library_preparations: if library_preparation.differentiated_biomaterial_id not in differentiated_ids: - raise MissingEntityError("Differentiated Cell Line", - "Library preparation", - library_preparation.biomaterial_id) + raise MissingParentEntityError("Differentiated Cell Line", + "Library preparation", + library_preparation.biomaterial_id) for differentiated_cell_line in differentiated_cell_lines: for library_preparation in library_preparations: @@ -720,20 +752,60 @@ def merge_library_preparation_sequencing_file(self, library_preparations, sequen MissingEntityError: If a sequencing file does not have a corresponding library preparation. 
""" + self.find_orphans( + source_entities=library_preparations, + target_entities=sequencing_files, + source_attr="biomaterial_id", # Assuming this is the correct attribute + target_attr="library_preparation_id", + source_type="Library Preparation", + target_type="Sequencing File", + errors=[] + ) + library_ids = {lib_prep.biomaterial_id for lib_prep in library_preparations} for sequencing_file in sequencing_files: if sequencing_file.library_preparation_id not in library_ids: - raise MissingEntityError("Library preparation", - "Sequencing file", - sequencing_file.file_name) + raise MissingParentEntityError("Library preparation", + "Sequencing file", + sequencing_file.file_name) for library_preparation in library_preparations: for sequencing_file in sequencing_files: if sequencing_file.library_preparation_id == library_preparation.biomaterial_id: library_preparation.add_sequencing_file(sequencing_file) - def get_library_preparations(self, sheet_name, action, column_mapping): + def find_orphans(self, source_entities, target_entities, + source_attr, target_attr, source_type, target_type, errors): + """ + Validates that each source entity has a corresponding target entity. + + Parameters: + source_entities (list): The list of source entities. + target_entities (list): The list of target entities. + source_attr (str): The attribute name in the source entity to compare. + target_attr (str): The attribute name in the target entity to compare. + source_type (str): The type name of the source entity (for error messages). + target_type (str): The type name of the target entity (for error messages). + + Raises: + OrphanedEntityError: If a source entity doesn't have a corresponding target entity. 
+ """ + for source_entity in source_entities: + match_found = False + + for target_entity in target_entities: + if getattr(target_entity, target_attr) == getattr(source_entity, source_attr): + match_found = True + break + + if not match_found: + errors.append(source_type + getattr(source_entity, source_attr)) + raise OrphanedEntityError(source_type, getattr(source_entity, source_attr)) + + print(f"VALIDATED: All {source_type.lower()}s have corresponding {target_type.lower()}s.") + + def get_library_preparations(self, sheet_name, action): """ Retrieves parsed library preparations data from a specified sheet in the Excel file. @@ -749,10 +821,11 @@ def get_library_preparations(self, sheet_name, action, column_mapping): list A list of LibraryPreparation objects parsed from the specified sheet. """ - library_preparations, df_filtered = self.parse_library_preparations(sheet_name, action, column_mapping) + library_preparations, df_filtered = self.parse_library_preparations(sheet_name, + action) return library_preparations, df_filtered - def get_sequencing_files(self, sheet_name, action, column_mapping): + def get_sequencing_files(self, sheet_name, action): """ Retrieves parsed sequencing files data from a specified sheet in the Excel file. @@ -768,5 +841,5 @@ def get_sequencing_files(self, sheet_name, action, column_mapping): list A list of SequencingFile objects parsed from the specified sheet. 
""" - sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action, column_mapping) + sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action) return sequencing_files, df_filtered From 96efda1f1db68965e2df9f03eb16d217f7106f12 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 9 Aug 2024 16:08:53 +0100 Subject: [PATCH 29/55] defect fixes --- ait/commons/util/__main__.py | 3 + ait/commons/util/cmd.py | 5 + ait/commons/util/command/submit.py | 40 ++- ait/commons/util/command/submit_file.py | 97 ++++-- ait/commons/util/command/upload.py | 3 +- ait/commons/util/command/view.py | 54 +++ ait/commons/util/spreadsheet_util.py | 436 +++++++++++++----------- 7 files changed, 405 insertions(+), 233 deletions(-) create mode 100644 ait/commons/util/command/view.py diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index a065dec..987197b 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -91,6 +91,9 @@ def parse_args(args): parser_config.add_argument('--action', help='action you want to perform (ADD/MODIFY/DELETE') parser_config.add_argument('--dataset', help='your dataset reference') + parser_config = cmd_parser.add_parser('view', help='view your dataset') + parser_config.add_argument('--dataset', help='your dataset reference') + parser_create = cmd_parser.add_parser('create', help='create an upload area (authorised users only)') parser_create.add_argument('NAME', help='name for the new area', type=valid_project_name) parser_create.add_argument('DPC', help='center name of the submitter', type=valid_project_name) diff --git a/ait/commons/util/cmd.py b/ait/commons/util/cmd.py index f96e603..31f0167 100644 --- a/ait/commons/util/cmd.py +++ b/ait/commons/util/cmd.py @@ -14,6 +14,7 @@ from ait.commons.util.command.submit_file import CmdSubmitFile from ait.commons.util.command.sync import CmdSync from ait.commons.util.command.upload import CmdUpload +from ait.commons.util.command.view import 
CmdView from ait.commons.util.local_state import get_bucket, set_attr, get_attr from ait.commons.util.settings import NAME, VERSION from ait.commons.util.user_profile import profile_exists, get_profile @@ -47,6 +48,10 @@ def __init__(self, args): success, msg = CmdSubmitFile(args).run() print(msg) + elif args.command == 'view': + success, msg = CmdView(args).run() + print(msg) + else: if profile_exists(args.profile): self.user_profile = get_profile(args.profile) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 92eb26d..acf81bf 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -88,7 +88,8 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dataset_id, access_token, action): + def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dataset_id, + access_token, action, errors): """ Submits a cell line as a biomaterial entity to a specified submission envelope. @@ -106,6 +107,8 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dat if success: print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") + else: + errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") else: cell_line_entity_id_column_name = "Id" @@ -137,7 +140,7 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dat def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_cell_line, differentiated_cell_lines_df, submission_envelope_id, dataset_id, - access_token, action): + access_token, action, errors): """ Handles a single differentiated cell line associated with a given cell line. 
@@ -159,6 +162,9 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce if success: print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") + else: + errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") else: print("Cell line has differentiated cell lines, creating differentiation process to link them") @@ -232,7 +238,7 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce def handle_library_preparation(self, differentiated_entity_id, library_preparation, library_preparations_df, submission_envelope_id, - dataset_id, access_token, action): + dataset_id, access_token, action, errors): """ Handles a single library preparation associated with a given differentiated cell line. @@ -253,6 +259,9 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati if success: print(f"Updated library preparation biomaterial: {library_preparation.id} / " f"{library_preparation.biomaterial_id}") + else: + errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") else: print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " f"{differentiated_entity_id}") @@ -275,11 +284,11 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati access_token ) - print(f"Linking Library Preparation Biomaterial: {differentiated_entity_id} " + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " f"to dataset: {dataset_id}") self.link_to_dataset('biomaterial', dataset_id, - differentiated_entity_id, access_token) + library_preparation_entity_id, access_token) print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " f"as input to library preparation process") @@ -313,7 +322,8 @@ def 
handle_library_preparation(self, differentiated_entity_id, library_preparati return library_preparation_entity_id def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, - sequencing_file_df, submission_envelope_id, dataset_id, access_token, action): + sequencing_file_df, submission_envelope_id, dataset_id, + access_token, action, errors): """ Handles a single sequencing file associated with a given library preparation. @@ -332,6 +342,8 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, if success: print(f"Updated sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + else: + errors.append(f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") else: print("Creating sequencing process to link the sequencing file") @@ -408,7 +420,8 @@ def multi_type_submission(self, cell_lines, cell_lines_df, submission_envelope_id, dataset_id, access_token, - action): + action, + errors): """ Handles the submission of multiple types of biomaterials (cell lines, differentiated cell lines, library preparations) @@ -434,7 +447,8 @@ def multi_type_submission(self, cell_lines, cell_lines_df, submission_envelope_id, dataset_id, access_token, - action) + action, + errors) for differentiated_cell_line in cell_line.differentiated_cell_lines: differentiated_entity_id = self.handle_differentiated_cell_line(cell_line_entity_id, @@ -443,7 +457,8 @@ def multi_type_submission(self, cell_lines, cell_lines_df, submission_envelope_id, dataset_id, access_token, - action) + action, + errors) for library_preparation in differentiated_cell_line.library_preparations: library_preparation_entity_id = self.handle_library_preparation(differentiated_entity_id, @@ -452,7 +467,8 @@ def multi_type_submission(self, cell_lines, cell_lines_df, submission_envelope_id, dataset_id, access_token, - action) + action, + errors) for sequencing_file in library_preparation.sequencing_files: 
self.handle_sequencing_file(library_preparation_entity_id, @@ -461,11 +477,13 @@ def multi_type_submission(self, cell_lines, cell_lines_df, submission_envelope_id, dataset_id, access_token, - action) + action, + errors) message = 'SUCCESS' except Exception as e: message = f"An error occurred: {str(e)}" + errors.append(message) traceback.print_exc() # Set DataFrames to None in case of an error cell_lines_df = None diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index b7c8452..77223e1 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -1,11 +1,17 @@ # Import necessary modules/classes from ait.commons.util package +import os +import sys + import pandas as pd from ait.commons.util.aws_client import Aws from ait.commons.util.command.list import CmdList from ait.commons.util.command.submit import CmdSubmit, get_id_from_url +from ait.commons.util.command.upload import CmdUpload from ait.commons.util.user_profile import get_profile from ait.commons.util.provider_api_util import APIProvider -from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter +from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter, ValidationError, \ + merge_library_preparation_sequencing_file, merge_cell_line_and_differentiated_cell_line, \ + merge_differentiated_cell_line_and_library_preparation # Define a class for handling submission of a command file @@ -42,7 +48,8 @@ def __init__(self, args): self.user_profile = get_profile('morphic-util') self.aws = Aws(self.user_profile) self.provider_api = APIProvider(self.base_url) - self.errors = [] + self.validation_errors = [] + self.submission_errors = [] if hasattr(self.args, 'action') and self.args.action is not None: self.action = self.args.action @@ -82,6 +89,7 @@ def run(self): return True, None list_instance = CmdList(self.aws, self.args) + upload_instance = CmdUpload(self.aws, self.args) list_of_files_in_upload_area = 
list_instance.list_bucket_contents_and_return(self.dataset, '') if self.file: @@ -89,21 +97,33 @@ def run(self): parser = SpreadsheetSubmitter(self.file) # Parse different sections of the spreadsheet using defined column mappings - cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, self.errors) + cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, self.validation_errors) differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action) - parser.merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.errors) + 'Differentiated cell line', self.action, self.validation_errors) + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, + self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( - 'Library preparation', self.action) - parser.merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, - library_preparations) + 'Library preparation', self.action, self.validation_errors) + merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, + library_preparations, self.validation_errors) sequencing_files, sequencing_files_df = parser.get_sequencing_files( - 'Sequence file', self.action) + 'Sequence file', self.action, self.validation_errors) # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) - parser.merge_library_preparation_sequencing_file(library_preparations, sequencing_files) - submission_envelope_id = None + merge_library_preparation_sequencing_file(library_preparations, sequencing_files, + self.validation_errors) + + try: + if len(self.validation_errors) > 0: + raise ValidationError(self.validation_errors) + else: + print(f"File {self.file} is validated successfully. 
Initiating submission") + print(f"File {self.file} being uploaded to storage") + upload_instance.upload_file(self.dataset, self.file, os.path.basename(self.file)) + except ValidationError as e: + print(e) + sys.exit(1) if self.action == 'add' or self.action == 'ADD': submission_envelope_response, status_code = submission_instance.create_new_submission_envelope( @@ -131,25 +151,52 @@ def run(self): ) = submission_instance.multi_type_submission( cell_lines, cell_lines_df, differentiated_cell_lines_df, library_preparations_df, sequencing_files_df, submission_envelope_id, - self.dataset, self.access_token, self.action + self.dataset, self.access_token, self.action, self.submission_errors ) # Save the updated dataframes to a single Excel file with multiple sheets if message == 'SUCCESS': output_file = "submission-result.xlsx" - with pd.ExcelWriter(output_file, engine='openpyxl') as writer: - updated_cell_lines_df.to_excel(writer, sheet_name='Cell line', index=False) - updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='Differentiated cell line', - index=False) - updated_library_preparations_df.to_excel(writer, sheet_name='Library preparation', index=False) - updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence file', index=False) - - return True, message + try: + # Write to Excel file + with pd.ExcelWriter(output_file, engine='openpyxl') as writer: + updated_cell_lines_df.to_excel(writer, sheet_name='Cell line', index=False) + updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='Differentiated cell line', + index=False) + updated_library_preparations_df.to_excel(writer, sheet_name='Library preparation', + index=False) + updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence file', index=False) + + # Confirm file was written and path exists + if os.path.exists(output_file): + # Attempt file upload + upload_instance.upload_file(self.dataset, output_file, os.path.basename(output_file)) + print(f"File {output_file} uploaded 
successfully.") + else: + raise FileNotFoundError( + f"The output file {output_file} was not created or cannot be found.") + + except Exception as e: + print(f"Failed to upload file {output_file}. Error: {e}") + return True, "SUBMISSION IS SUCCESSFUL." else: - print("Submission has failed, rolling back") - submission_instance.delete_submission(submission_envelope_id, self.access_token, True) - return False, "Submission has failed, rolled back" + return self.delete_actions(submission_envelope_id, + submission_instance, + None) except Exception as e: - print("Submission has failed, rolling back") - submission_instance.delete_submission(submission_envelope_id, self.access_token, True) + return self.delete_actions(submission_envelope_id, submission_instance, e) + + def delete_actions(self, submission_envelope_id, submission_instance, e): + try: + print("SUBMISSION failed, rolling back") + print("SUBMISSION ERRORS are:") + print("\n".join(self.submission_errors)) + submission_instance.delete_submission(submission_envelope_id, self.access_token, True) + submission_instance.delete_dataset(self.dataset, self.access_token) + + if e is None: + return False, "Submission has failed, rolled back" + else: return False, f"An error occurred: {str(e)}" + except Exception as e: + print(f"Failed to rollback submission {submission_envelope_id}") diff --git a/ait/commons/util/command/upload.py b/ait/commons/util/command/upload.py index 6ec848c..50eb4ea 100755 --- a/ait/commons/util/command/upload.py +++ b/ait/commons/util/command/upload.py @@ -22,9 +22,10 @@ def __init__(self, aws, args): def upload_file(self, selected_area, data_file, destination_file): + overwrite = getattr(self.args, 'o', False) file_size = os.path.getsize(data_file) - if not self.args.o and self.aws.data_file_exists(selected_area, destination_file): + if not overwrite and self.aws.data_file_exists(selected_area, destination_file): print(f"{destination_file} already exists. 
Use -o to overwrite.") elif file_size == 0: diff --git a/ait/commons/util/command/view.py b/ait/commons/util/command/view.py new file mode 100644 index 0000000..adf1a89 --- /dev/null +++ b/ait/commons/util/command/view.py @@ -0,0 +1,54 @@ +from ait.commons.util.aws_client import Aws +from ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.user_profile import get_profile + + +class CmdView: + base_url = 'https://api.ingest.dev.archive.morphic.bio' + + def __init__(self, args): + self.args = args + self.access_token = get_profile('morphic-util').access_token + self.user_profile = get_profile('morphic-util') + self.provider_api = APIProvider(self.base_url) + + if hasattr(self.args, 'dataset') and self.args.dataset is not None: + self.dataset = self.args.dataset + else: + print("Dataset is mandatory for view") + + def run(self): + fetched_dataset = self.provider_api.get_to_provider_api(f"{self.base_url}/datasets/{self.dataset}", + self.access_token) + print(f"Dataset fetched successfully: {self.dataset}") + print("Getting Biomaterials") + biomaterials = fetched_dataset.get('biomaterials', []) + + for biomaterial in biomaterials: + print(biomaterial) + + fetched_biomaterial = self.provider_api.get_to_provider_api(f"{self.base_url}/biomaterials/{biomaterial}", + self.access_token) + print(fetched_biomaterial) + + print("Getting Processes") + processes = fetched_dataset.get('processes', []) + + for process in processes: + print(process) + + fetched_process = self.provider_api.get_to_provider_api(f"{self.base_url}/processes/{process}", + self.access_token) + print(fetched_process) + + print("Getting Data Files") + files = fetched_dataset.get('files', []) + + for file in files: + print(files) + + fetched_file = self.provider_api.get_to_provider_api(f"{self.base_url}/files/{file}", + self.access_token) + print(fetched_file) + + return True, "FETCHED SUCCESSFULLY" diff --git a/ait/commons/util/spreadsheet_util.py 
b/ait/commons/util/spreadsheet_util.py index 5dd4519..ce48b05 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -2,29 +2,39 @@ import json import numpy as np - +""" class MissingMandatoryFieldError(Exception): def __init__(self, message): self.message = message super().__init__(self.message) +""" + + +class MissingParentEntityError: -class MissingParentEntityError(Exception): - """Custom exception raised when an expected entity is missing.""" + def add_error(self, missing_type, entity_type, missing_id, errors): + errors.append(f"Missing {missing_type} for {entity_type} and ID is {missing_id}") - def __init__(self, missing_type, entity_type, missing_id): - super().__init__(f"Missing {missing_type} for {entity_type} and ID is {missing_id}") - self.entity_type = entity_type - self.missing_type = missing_type - self.missing_id = missing_id +class ValidationError(Exception): + def __init__(self, errors): + self.errors = errors + super().__init__("Validation errors occurred") + def __str__(self): + return "\n".join(self.errors) + + +""" class OrphanedEntityError(Exception): def __init__(self, type, id): super().__init__(f"Orphaned entity {type} and ID is {id}") self.type = type self.id = id +""" + class CellLine: def __init__(self, biomaterial_id, description, derived_accession, @@ -188,6 +198,180 @@ def init_file(self): pass +def find_orphans(source_entities, target_entities, + source_attr, target_attr, source_type, target_type, errors): + """ + Validates that each source entity has a corresponding target entity. + + Parameters: + source_entities (list): The list of source entities. + target_entities (list): The list of target entities. + source_attr (str): The attribute name in the source entity to compare. + target_attr (str): The attribute name in the target entity to compare. + source_type (str): The type name of the source entity (for error messages). 
+ target_type (str): The type name of the target entity (for error messages). + + Raises: + OrphanedEntityError: If a source entity doesn't have a corresponding target entity. + """ + for source_entity in source_entities: + match_found = False + + for target_entity in target_entities: + if getattr(target_entity, target_attr) == getattr(source_entity, source_attr): + match_found = True + break + + if not match_found: + errors.append(f"Orphaned entity {source_type} and ID is {getattr(source_entity, source_attr)}") + # raise OrphanedEntityError(source_type, getattr(source_entity, source_attr)) + + # print(f"VALIDATED: All {source_type.lower()}s have corresponding {target_type.lower()}s.") + + +def merge_library_preparation_sequencing_file(library_preparations, sequencing_files, errors): + """ + Merges library preparations and sequencing files based on their IDs. + + Parameters: + ----------- + library_preparations : list + A list of LibraryPreparation objects to be merged. + sequencing_files : list + A list of SequencingFile objects to be merged. + + Returns: + -------- + None + + Raises: + ------ + MissingEntityError: + If a sequencing file does not have a corresponding library preparation. 
+ """ + find_orphans( + source_entities=library_preparations, + target_entities=sequencing_files, + source_attr="biomaterial_id", # Assuming this is the correct attribute + target_attr="library_preparation_id", + source_type="Library Preparation", + target_type="Sequencing File", + errors=errors + ) + + missing_parent_entity_error = MissingParentEntityError() + library_ids = {lib_prep.biomaterial_id for lib_prep in library_preparations} + + for sequencing_file in sequencing_files: + if sequencing_file.library_preparation_id not in library_ids: + missing_parent_entity_error.add_error("Library Preparation", + "Sequencing File", + sequencing_file.file_name, + errors) + + for library_preparation in library_preparations: + for sequencing_file in sequencing_files: + if sequencing_file.library_preparation_id == library_preparation.biomaterial_id: + library_preparation.add_sequencing_file(sequencing_file) + + +def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, + errors): + """ + Merges differentiated cell lines and library preparations based on their biomaterial IDs. + + Parameters: + ----------- + differentiated_cell_lines : list + A list of DifferentiatedCellLine objects to be merged. + library_preparations : list + A list of LibraryPreparation objects to be merged. + + Returns: + -------- + None + + Raises: + ------ + MissingEntityError: + If a library preparation does not have a corresponding differentiated cell line. 
+ """ + + find_orphans( + source_entities=differentiated_cell_lines, + target_entities=library_preparations, + source_attr="biomaterial_id", + target_attr="differentiated_biomaterial_id", + source_type="Differentiated Cell line", + target_type="Library Preparation", + errors=errors + ) + + missing_parent_entity_error = MissingParentEntityError() + + differentiated_ids = {diff_cell.biomaterial_id for diff_cell in differentiated_cell_lines} + + for library_preparation in library_preparations: + if library_preparation.differentiated_biomaterial_id not in differentiated_ids: + missing_parent_entity_error.add_error("Differentiated Cell Line", + "Library Preparation", + library_preparation.biomaterial_id, + errors) + + for differentiated_cell_line in differentiated_cell_lines: + for library_preparation in library_preparations: + if library_preparation.differentiated_biomaterial_id == differentiated_cell_line.biomaterial_id: + differentiated_cell_line.add_library_preparation(library_preparation) + + +def merge_cell_line_and_differentiated_cell_line(cell_lines, + differentiated_cell_lines, errors): + """ + Merges cell lines and differentiated cell lines based on their biomaterial IDs. + + Parameters: + ----------- + cell_lines : list + A list of CellLine objects to be merged. + differentiated_cell_lines : list + A list of DifferentiatedCellLine objects to be merged. + + Returns: + -------- + None + + Raises: + ------ + MissingEntityError: + If a differentiated cell line does not have a corresponding cell line. 
+ """ + + find_orphans( + source_entities=cell_lines, + target_entities=differentiated_cell_lines, + source_attr="biomaterial_id", + target_attr="input_biomaterial_id", + source_type="Cell line", + target_type="Differentiated Cell line", + errors=errors + ) + + missing_parent_entity_error = MissingParentEntityError() + cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} + + for differentiated_cell_line in differentiated_cell_lines: + if differentiated_cell_line.input_biomaterial_id not in cell_line_ids: + missing_parent_entity_error.add_error("Cell Line", + "Differentiated Cell line", + differentiated_cell_line.biomaterial_id, + errors) + + for cell_line in cell_lines: + for differentiated_cell_line in differentiated_cell_lines: + if differentiated_cell_line.input_biomaterial_id == cell_line.biomaterial_id: + cell_line.add_differentiated_cell_line(differentiated_cell_line) + + class SpreadsheetSubmitter: """ A class for parsing and processing data from an Excel spreadsheet containing information about @@ -292,7 +476,9 @@ def parse_cell_lines(self, sheet_name, action, errors): # Check if the required column exists if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: - raise KeyError("The column 'cell_line.biomaterial_core.biomaterial_id' does not exist.") + # raise KeyError("The column 'cell_line.biomaterial_core.biomaterial_id' does not exist.") + errors.append("The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in Cell line sheet. 
" + "The rest of the file will not be processed") # Filter rows where biomaterial_id is not null df = df[df['cell_line.biomaterial_core.biomaterial_id'].notna()] @@ -319,15 +505,19 @@ def parse_cell_lines(self, sheet_name, action, errors): # Check if biomaterial_id is null if pd.isnull(biomaterial_id): - errors.append("Biomaterial ID cannot be null.") - raise MissingMandatoryFieldError("Biomaterial ID cannot be null.") + errors.append("Biomaterial ID cannot be null in any row of the Cell line sheet.") + # raise MissingMandatoryFieldError("Biomaterial ID cannot be null.") # Check if derived_accession and cell_type are present if pd.isnull(derived_accession) or pd.isnull(cell_type): - errors.append(f"Mandatory fields (derived_accession, cell_type) are required. {biomaterial_id}") + errors.append(f"Mandatory fields (derived_accession, cell_type) are required for Cell line entity: " + f"{biomaterial_id}") + """ raise MissingMandatoryFieldError( - f"Mandatory fields (derived_accession, cell_type) are required. {biomaterial_id}") + f"Mandatory fields (derived_accession, cell_type) are required. " + f"{biomaterial_id}") + """ cell_lines.append( CellLine( @@ -344,7 +534,7 @@ def parse_cell_lines(self, sheet_name, action, errors): return cell_lines, df_filtered - def parse_differentiated_cell_lines(self, sheet_name, action): + def parse_differentiated_cell_lines(self, sheet_name, action, errors): """ Parses data related to differentiated cell lines from a specified sheet in the Excel file. 
@@ -374,7 +564,8 @@ def parse_differentiated_cell_lines(self, sheet_name, action): # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: - raise KeyError("The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not exist.") + errors.append("The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " + "exist.") # Filter rows where biomaterial_id is not null df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] @@ -400,12 +591,18 @@ def parse_differentiated_cell_lines(self, sheet_name, action): # Check if biomaterial_id is null if pd.isnull(differentiated_biomaterial_id): - raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null.") + errors.append("Differentiated Cell line ID cannot be null in any row of the Differentiated Cell line " + "sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") # Check if derived_accession and cell_type are present if pd.isnull(biomaterial_id): + errors.append(f"Input Cell line ID cannot be null for Differentiated Cell line: " + f"{differentiated_biomaterial_id}") + """ raise MissingMandatoryFieldError( "Input Cell line ID cannot be null. " + differentiated_biomaterial_id) + """ # Create DifferentiatedCellLine objects from filtered DataFrame rows differentiated_cell_lines.append( @@ -424,7 +621,7 @@ def parse_differentiated_cell_lines(self, sheet_name, action): return differentiated_cell_lines, df_filtered - def parse_library_preparations(self, sheet_name, action): + def parse_library_preparations(self, sheet_name, action, errors): """ Parses data related to library preparations from a specified sheet in the Excel file. 
@@ -459,7 +656,7 @@ def parse_library_preparations(self, sheet_name, action): ] for col in required_columns: if col not in df.columns: - raise KeyError(f"The column '{col}' does not exist.") + errors.append(f"The column '{col}' does not exist in the Library Preparation sheet.") # Filter rows where biomaterial_id is not null df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] @@ -487,13 +684,18 @@ def parse_library_preparations(self, sheet_name, action): # Check if required fields are null if pd.isnull(library_preparation_id): - raise MissingMandatoryFieldError("Library Preparation ID cannot be null.") + errors.append("Library Preparation ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Library Preparation ID cannot be null in any row.") if pd.isnull(dissociation_protocol_id): - raise MissingMandatoryFieldError("Dissociation Protocol ID cannot be null.") + errors.append("Dissociation Protocol ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Dissociation Protocol ID cannot be null in any row.") if pd.isnull(differentiated_biomaterial_id): - raise MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null.") + errors.append("Differentiated Cell Line ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null in any row.") if pd.isnull(library_preparation_protocol_id): - raise MissingMandatoryFieldError("Library Preparation Protocol ID cannot be null.") + errors.append( + "Library Preparation Protocol ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Library Preparation Protocol ID cannot be null in any row.") # Create LibraryPreparation objects from filtered DataFrame rows library_preparations.append( @@ -517,7 +719,7 @@ def parse_library_preparations(self, sheet_name, action): return 
library_preparations, df_filtered - def parse_sequencing_files(self, sheet_name, action): + def parse_sequencing_files(self, sheet_name, action, errors): """ Parses data related to sequencing files from a specified sheet in the Excel file. @@ -552,7 +754,7 @@ def parse_sequencing_files(self, sheet_name, action): ] for col in required_columns: if col not in df.columns: - raise KeyError(f"The column '{col}' does not exist.") + errors.append(f"The column '{col}' does not exist in the Sequencing File sheet.") # Filter rows where file_name is not null df = df[df['sequence_file.file_core.file_name'].notna()] @@ -581,13 +783,17 @@ def parse_sequencing_files(self, sheet_name, action): # Check if required fields are null if pd.isnull(file_name): - raise MissingMandatoryFieldError("Sequence file name cannot be null.") + errors.append("Sequence file name cannot be null in any row of the Sequencing File sheet.") + # raise MissingMandatoryFieldError("Sequence file name cannot be null in any row.") if pd.isnull(library_preparation_id): - raise MissingMandatoryFieldError("Library Preparation ID cannot be null.") + errors.append("Library Preparation ID cannot be null in any row of the Sequencing File sheet..") + # raise MissingMandatoryFieldError("Library Preparation ID cannot be null in any row.") if pd.isnull(sequencing_protocol_id): - raise MissingMandatoryFieldError("Sequencing Protocol ID cannot be null.") + errors.append("Sequencing Protocol ID cannot be null in any row of the Sequencing File sheet..") + # raise MissingMandatoryFieldError("Sequencing Protocol ID cannot be null in any row.") if pd.isnull(read_index): - raise MissingMandatoryFieldError("Read Index cannot be null.") + errors.append("Read Index cannot be null in any row of the Sequencing File sheet..") + # raise MissingMandatoryFieldError("Read Index cannot be null in any row.") # Create SequencingFile objects from filtered DataFrame rows sequencing_files.append( @@ -622,7 +828,7 @@ def get_cell_lines(self, 
sheet_name, action, errors): cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, action, errors) return cell_lines, cell_lines_df - def get_differentiated_cell_lines(self, sheet_name, action): + def get_differentiated_cell_lines(self, sheet_name, action, errors): """ Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. @@ -640,172 +846,10 @@ def get_differentiated_cell_lines(self, sheet_name, action): """ differentiated_cell_lines, differentiated_cell_lines_df = (self. parse_differentiated_cell_lines - (sheet_name, action)) + (sheet_name, action, errors)) return differentiated_cell_lines, differentiated_cell_lines_df - def merge_cell_line_and_differentiated_cell_line(self, cell_lines, - differentiated_cell_lines, errors): - """ - Merges cell lines and differentiated cell lines based on their biomaterial IDs. - - Parameters: - ----------- - cell_lines : list - A list of CellLine objects to be merged. - differentiated_cell_lines : list - A list of DifferentiatedCellLine objects to be merged. - - Returns: - -------- - None - - Raises: - ------ - MissingEntityError: - If a differentiated cell line does not have a corresponding cell line. 
- """ - - self.find_orphans( - source_entities=cell_lines, - target_entities=differentiated_cell_lines, - source_attr="biomaterial_id", - target_attr="input_biomaterial_id", - source_type="Cell line", - target_type="Differentiated Cell line", - errors=errors - ) - - cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} - - for differentiated_cell_line in differentiated_cell_lines: - if differentiated_cell_line.input_biomaterial_id not in cell_line_ids: - raise MissingParentEntityError("Cell Line", - "Differentiated cell line", - differentiated_cell_line.biomaterial_id) - - for cell_line in cell_lines: - for differentiated_cell_line in differentiated_cell_lines: - if differentiated_cell_line.input_biomaterial_id == cell_line.biomaterial_id: - cell_line.add_differentiated_cell_line(differentiated_cell_line) - - def merge_differentiated_cell_line_and_library_preparation(self, differentiated_cell_lines, library_preparations): - """ - Merges differentiated cell lines and library preparations based on their biomaterial IDs. - - Parameters: - ----------- - differentiated_cell_lines : list - A list of DifferentiatedCellLine objects to be merged. - library_preparations : list - A list of LibraryPreparation objects to be merged. - - Returns: - -------- - None - - Raises: - ------ - MissingEntityError: - If a library preparation does not have a corresponding differentiated cell line. 
- """ - - self.find_orphans( - source_entities=differentiated_cell_lines, - target_entities=library_preparations, - source_attr="biomaterial_id", - target_attr="differentiated_biomaterial_id", - source_type="Differentiated Cell line", - target_type="Library Preparation", - errors=[] - ) - - differentiated_ids = {diff_cell.biomaterial_id for diff_cell in differentiated_cell_lines} - - for library_preparation in library_preparations: - if library_preparation.differentiated_biomaterial_id not in differentiated_ids: - raise MissingParentEntityError("Differentiated Cell Line", - "Library preparation", - library_preparation.biomaterial_id) - - for differentiated_cell_line in differentiated_cell_lines: - for library_preparation in library_preparations: - if library_preparation.differentiated_biomaterial_id == differentiated_cell_line.biomaterial_id: - differentiated_cell_line.add_library_preparation(library_preparation) - - def merge_library_preparation_sequencing_file(self, library_preparations, sequencing_files): - """ - Merges library preparations and sequencing files based on their IDs. - - Parameters: - ----------- - library_preparations : list - A list of LibraryPreparation objects to be merged. - sequencing_files : list - A list of SequencingFile objects to be merged. - - Returns: - -------- - None - - Raises: - ------ - MissingEntityError: - If a sequencing file does not have a corresponding library preparation. 
- """ - self.find_orphans( - source_entities=library_preparations, - target_entities=sequencing_files, - source_attr="biomaterial_id", # Assuming this is the correct attribute - target_attr="library_preparation_id", - source_type="Library Preparation", - target_type="Sequencing File", - errors=[] - ) - - library_ids = {lib_prep.biomaterial_id for lib_prep in library_preparations} - - for sequencing_file in sequencing_files: - if sequencing_file.library_preparation_id not in library_ids: - raise MissingParentEntityError("Library preparation", - "Sequencing file", - sequencing_file.file_name) - - for library_preparation in library_preparations: - for sequencing_file in sequencing_files: - if sequencing_file.library_preparation_id == library_preparation.biomaterial_id: - library_preparation.add_sequencing_file(sequencing_file) - - def find_orphans(self, source_entities, target_entities, - source_attr, target_attr, source_type, target_type, errors): - """ - Validates that each source entity has a corresponding target entity. - - Parameters: - source_entities (list): The list of source entities. - target_entities (list): The list of target entities. - source_attr (str): The attribute name in the source entity to compare. - target_attr (str): The attribute name in the target entity to compare. - source_type (str): The type name of the source entity (for error messages). - target_type (str): The type name of the target entity (for error messages). - - Raises: - OrphanedEntityError: If a source entity doesn't have a corresponding target entity. 
- """ - for source_entity in source_entities: - match_found = False - - for target_entity in target_entities: - if getattr(target_entity, target_attr) == getattr(source_entity, source_attr): - match_found = True - break - - if not match_found: - errors.append(source_type + getattr(source_entity, source_attr)) - raise OrphanedEntityError(source_type, getattr(source_entity, source_attr)) - - print(f"VALIDATED: All {source_type.lower()}s have corresponding {target_type.lower()}s.") - - def get_library_preparations(self, sheet_name, action): + def get_library_preparations(self, sheet_name, action, errors): """ Retrieves parsed library preparations data from a specified sheet in the Excel file. @@ -822,10 +866,10 @@ def get_library_preparations(self, sheet_name, action): A list of LibraryPreparation objects parsed from the specified sheet. """ library_preparations, df_filtered = self.parse_library_preparations(sheet_name, - action) + action, errors) return library_preparations, df_filtered - def get_sequencing_files(self, sheet_name, action): + def get_sequencing_files(self, sheet_name, action, errors): """ Retrieves parsed sequencing files data from a specified sheet in the Excel file. @@ -841,5 +885,5 @@ def get_sequencing_files(self, sheet_name, action): list A list of SequencingFile objects parsed from the specified sheet. 
""" - sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action) + sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action, errors) return sequencing_files, df_filtered From 82149dd1b675b89b43a03a0ff291d674308a7247 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 13 Aug 2024 10:27:18 +0100 Subject: [PATCH 30/55] adding expression alteration support --- ait/commons/util/command/submit_file.py | 84 ++++++-- ait/commons/util/spreadsheet_util.py | 267 ++++++++++++++++-------- 2 files changed, 254 insertions(+), 97 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 77223e1..dfe2d3c 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -31,10 +31,38 @@ def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, da ) +def get_content(unique_value): + return {"content": unique_value} + + +def create_expression_alterations(submission_instance, submission_envelope_id, access_token, expression_alterations): + for expression_alteration in expression_alterations: + # Submit the expression alteration and retrieve the ID + expression_alteration_id = submission_instance.use_existing_envelope_and_submit_entity( + 'process', + expression_alteration.to_dict(), # Convert the object to a dictionary for submission + submission_envelope_id, + access_token + ) + # Set the retrieved ID in the ExpressionAlterationStrategy object + expression_alteration.id = expression_alteration_id + + return expression_alterations + + +def link_cell_line_parent_cell_line_expression_alretation(submission_instance, + submission_envelope_id, + access_token, + parent_cell_line_id, + created_expression_alterations, + cell_lines): + pass + + class CmdSubmitFile: - base_url = 'https://api.ingest.dev.archive.morphic.bio' - submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" - submission_envelope_base_url = 
f"{base_url}/submissionEnvelopes" + BASE_URL = 'https://api.ingest.dev.archive.morphic.bio' + SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" + SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" def __init__(self, args): """ @@ -47,7 +75,7 @@ def __init__(self, args): self.access_token = get_profile('morphic-util').access_token self.user_profile = get_profile('morphic-util') self.aws = Aws(self.user_profile) - self.provider_api = APIProvider(self.base_url) + self.provider_api = APIProvider(self.BASE_URL) self.validation_errors = [] self.submission_errors = [] @@ -83,13 +111,12 @@ def run(self): """ submission_instance = CmdSubmit(self) - if self.action == 'delete' or self.action == 'DELETE': + if self.action.lower() == 'delete': self.file = None submission_instance.delete_dataset(self.dataset, self.access_token) return True, None list_instance = CmdList(self.aws, self.args) - upload_instance = CmdUpload(self.aws, self.args) list_of_files_in_upload_area = list_instance.list_bucket_contents_and_return(self.dataset, '') if self.file: @@ -97,7 +124,11 @@ def run(self): parser = SpreadsheetSubmitter(self.file) # Parse different sections of the spreadsheet using defined column mappings - cell_lines, cell_lines_df = parser.get_cell_lines('Cell line', self.action, self.validation_errors) + expression_alterations, expression_alterations_df = (parser.get_expression_alterations + ('Expression alteration strategy', + self.action, self.validation_errors)) + cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines('Cell line', self.action, + self.validation_errors) differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( 'Differentiated cell line', self.action, self.validation_errors) merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, @@ -113,6 +144,7 @@ def run(self): merge_library_preparation_sequencing_file(library_preparations, 
sequencing_files, self.validation_errors) + upload_instance = CmdUpload(self.aws, self.args) try: if len(self.validation_errors) > 0: @@ -125,11 +157,11 @@ def run(self): print(e) sys.exit(1) - if self.action == 'add' or self.action == 'ADD': + if self.action.lower() == 'add': submission_envelope_response, status_code = submission_instance.create_new_submission_envelope( - self.submission_envelope_create_url, access_token=self.access_token + self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token ) - if status_code == 200 or status_code == 201: + if status_code in (200, 201): self_url = submission_envelope_response['_links']['self']['href'] submission_envelope_id = get_id_from_url(self_url) print(f"Submission envelope for this submission is: {submission_envelope_id}") @@ -138,12 +170,31 @@ def run(self): message = "Unauthorized, refresh your access token using the config option" return False, message else: - return False, f"Encountered failure with {status_code}" + return False, f"Encountered failure with status code {status_code}" else: submission_envelope_id = None # Perform the submission and get the updated dataframes try: + parent_cell_line_id = (submission_instance. 
+ use_existing_envelope_and_submit_entity('biomaterial', + get_content( + parent_cell_line_name), + submission_envelope_id, + self.access_token)) + + created_expression_alterations = create_expression_alterations(submission_instance, + submission_envelope_id, + self.access_token, + expression_alterations) + + link_cell_line_parent_cell_line_expression_alretation(submission_instance, + submission_envelope_id, + self.access_token, + parent_cell_line_id, + created_expression_alterations, + cell_lines) + # submission now ( updated_cell_lines_df, updated_differentiated_cell_lines_df, updated_library_preparations_df, updated_sequencing_files_df, @@ -177,7 +228,9 @@ def run(self): f"The output file {output_file} was not created or cannot be found.") except Exception as e: - print(f"Failed to upload file {output_file}. Error: {e}") + print( + f"Failed to upload file {output_file}. Error: {e}, Refer dataset {self.dataset} for " + f"tracing metadata") return True, "SUBMISSION IS SUCCESSFUL." else: return self.delete_actions(submission_envelope_id, @@ -188,8 +241,9 @@ def run(self): def delete_actions(self, submission_envelope_id, submission_instance, e): try: - print("SUBMISSION failed, rolling back") - print("SUBMISSION ERRORS are:") + print("SUBMISSION has failed, rolling back") + print("SUBMISSION ERRORS are listed below. 
Any metadata created will be deleted now, please wait until " + "the clean-up finishes") print("\n".join(self.submission_errors)) submission_instance.delete_submission(submission_envelope_id, self.access_token, True) submission_instance.delete_dataset(self.dataset, self.access_token) @@ -199,4 +253,4 @@ def delete_actions(self, submission_envelope_id, submission_instance, e): else: return False, f"An error occurred: {str(e)}" except Exception as e: - print(f"Failed to rollback submission {submission_envelope_id}") + print(f"Failed to rollback submission {submission_envelope_id}") diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index ce48b05..e6392fe 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -37,16 +37,17 @@ def __init__(self, type, id): class CellLine: - def __init__(self, biomaterial_id, description, derived_accession, - clone_id, protocol_id, zygosity, cell_type, id): + def __init__(self, biomaterial_id, description, derived_from_accession, + clone_id, protocol_id, zygosity, cell_type, expression_alteration_id, id): self.biomaterial_id = biomaterial_id self.description = description - self.derived_accession = derived_accession + self.derived_from_accession = derived_from_accession self.clone_id = clone_id self.protocol_id = protocol_id self.zygosity = zygosity self.cell_type = cell_type self.differentiated_cell_lines = [] + self.expression_alteration_id = expression_alteration_id self.id = id def add_differentiated_cell_line(self, differentiated_cell_line): @@ -60,11 +61,51 @@ def to_dict(self): "content": { "biomaterial_id": self.biomaterial_id, "description": self.description, - "derived_accession": self.derived_accession, + "derived_from_accession": self.derived_from_accession, "clone_id": self.clone_id, "protocol_id": self.protocol_id, "zygosity": self.zygosity, - "cell_type": self.cell_type + "cell_type": self.cell_type, + "expression_alteration_id": 
self.expression_alteration_id + } + } + + +class ExpressionAlterationStrategy: + def __init__(self, expression_alteration_id, protocol_id, allele_specific, altered_gene_symbols, altered_gene_ids, + targeted_genomic_region, expected_alteration_type, sgrna_target, + protocol_method_text, altered_locus, guide_sequence, id): + self.expression_alteration_id = expression_alteration_id + self.protocol_id = protocol_id + self.allele_specific = allele_specific + self.altered_gene_symbols = altered_gene_symbols + self.altered_gene_ids = altered_gene_ids + self.targeted_genomic_region = targeted_genomic_region + self.expected_alteration_type = expected_alteration_type + self.sgrna_target = sgrna_target + self.protocol_method_text = protocol_method_text + self.altered_locus = altered_locus + self.guide_sequence = guide_sequence + self.id = id + + def __repr__(self): + return json.dumps(self.to_dict(), indent=2) + + def to_dict(self): + return { + "content": { + "expression_alteration_id": self.expression_alteration_id, + "protocol_id": self.protocol_id, + "allele_specific": self.allele_specific, + "altered_gene_symbols": self.altered_gene_symbols, + "altered_gene_ids": self.altered_gene_ids, + "targeted_genomic_region": self.targeted_genomic_region, + "expected_alteration_type": self.expected_alteration_type, + "sgrna_target": self.sgrna_target, + "protocol_method_text": self.protocol_method_text, + "altered_locus": self.altered_locus, + "guide_sequence": self.guide_sequence, + "id": self.id } } @@ -444,6 +485,29 @@ def list_sheets(self): xls = pd.ExcelFile(self.file_path, engine='openpyxl') return xls.sheet_names + def input_file_to_data_frames(self, sheet_name, action): + if action.upper() == 'MODIFY': + skip_rows = 0 + else: + skip_rows = 3 + + # Load the Excel file to retrieve all sheet names + with pd.ExcelFile(self.file_path, engine='openpyxl') as xls: + # Trim spaces from sheet names + sheet_names = {sheet.strip(): sheet for sheet in xls.sheet_names} + + # Attempt to 
find the trimmed sheet name in the list + trimmed_sheet_name = sheet_name.strip() + + if trimmed_sheet_name in sheet_names: + # Read the sheet using the original sheet name (with spaces if they existed) + df = pd.read_excel(self.file_path, sheet_name=sheet_names[trimmed_sheet_name], engine='openpyxl', + skiprows=skip_rows) + else: + raise ValueError(f"Sheet '{sheet_name}' not found in the spreadsheet.") + + return df + def parse_cell_lines(self, sheet_name, action, errors): """ Parses data related to cell lines from a specified sheet in the Excel file. @@ -452,8 +516,6 @@ def parse_cell_lines(self, sheet_name, action, errors): ----------- sheet_name : str The name of the sheet containing cell line data. - column_mapping : dict - A dictionary mapping column names in the sheet to expected attribute names. Returns: -------- @@ -462,77 +524,74 @@ def parse_cell_lines(self, sheet_name, action, errors): - list of CellLine objects parsed from the specified sheet. - pd.DataFrame with the parsed data. """ - if action.upper() == 'MODIFY': - skip_rows = 0 - else: - skip_rows = 3 - - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) df.columns = df.columns.str.strip() - # df = df.rename(columns=column_mapping) - - # Remove unnamed columns (columns without headers) - # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: - # raise KeyError("The column 'cell_line.biomaterial_core.biomaterial_id' does not exist.") - errors.append("The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in Cell line sheet. " - "The rest of the file will not be processed") + errors.append( + "The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in the Cell line sheet. 
" + "The rest of the file will not be processed") + return [], df # Filter rows where biomaterial_id is not null df = df[df['cell_line.biomaterial_core.biomaterial_id'].notna()] - + # Replace invalid float values with None df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) - - # Define columns to check for values starting with 'ABC' or 'XYZ' + # Define columns to check for invalid starting values cols_to_check = ['cell_line.biomaterial_core.biomaterial_id'] + invalid_start_values = ( + 'FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'cell_line.biomaterial_core.biomaterial_id' + ) + # Filter out rows with invalid starting values + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith(invalid_start_values)).all(axis=1) + df_filtered = df[mask] + # Check for a unique value in 'cell_line.derived_cell_line_accession' + derived_col = 'cell_line.derived_cell_line_accession' - # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' - mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( - ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', - 'cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) + if derived_col in df_filtered.columns: + parent_cell_line_names = df_filtered[derived_col].dropna().unique() - # Apply the mask to filter out rows - df_filtered = df[mask] + if len(parent_cell_line_names) != 1: + errors.append( + f"The column '{derived_col}' must have the same value across all rows. 
Found values: {parent_cell_line_names}") - # Check for mandatory fields and create CellLine objects + return [], df + + # Process rows to create CellLine objects cell_lines = [] + for _, row in df_filtered.iterrows(): biomaterial_id = row['cell_line.biomaterial_core.biomaterial_id'] - derived_accession = row.get('cell_line.derived_cell_line_accession') + derived_from_accession = row.get('cell_line.derived_cell_line_accession') cell_type = row.get('cell_line.type') + # expression_alteration_id = row.get('expression_alteration_id') - # Check if biomaterial_id is null + # Error handling for missing mandatory fields if pd.isnull(biomaterial_id): errors.append("Biomaterial ID cannot be null in any row of the Cell line sheet.") - # raise MissingMandatoryFieldError("Biomaterial ID cannot be null.") - # Check if derived_accession and cell_type are present - if pd.isnull(derived_accession) or pd.isnull(cell_type): - errors.append(f"Mandatory fields (derived_accession, cell_type) are required for Cell line entity: " - f"{biomaterial_id}") - - """ - raise MissingMandatoryFieldError( - f"Mandatory fields (derived_accession, cell_type) are required. 
" - f"{biomaterial_id}") - """ + if any(pd.isnull(field) for field in [derived_from_accession, cell_type]): + errors.append( + f"Mandatory fields (derived_accession, cell_type, expression_alteration_id) are required for Cell " + f"line entity: {biomaterial_id}") cell_lines.append( CellLine( biomaterial_id=biomaterial_id, description=row.get('cell_line.biomaterial_core.biomaterial_description'), - derived_accession=derived_accession, + derived_from_accession=derived_from_accession, clone_id=row.get('cell_line.clone_id'), protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), zygosity=row.get('cell_line.zygosity'), cell_type=cell_type, + expression_alteration_id=None, id=row.get('Id') ) ) - return cell_lines, df_filtered + return cell_lines, df_filtered, parent_cell_line_names[0] def parse_differentiated_cell_lines(self, sheet_name, action, errors): """ @@ -550,15 +609,9 @@ def parse_differentiated_cell_lines(self, sheet_name, action, errors): list A list of DifferentiatedCellLine objects parsed from the specified sheet. 
""" - if action.upper() == 'MODIFY': - skip_rows = 0 - else: - skip_rows = 3 - - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) df.columns = df.columns.str.strip() # df = df.rename(columns=column_mapping) - # Remove unnamed columns (columns without headers) # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] @@ -566,25 +619,22 @@ def parse_differentiated_cell_lines(self, sheet_name, action, errors): if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: errors.append("The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " "exist.") + return [], df # Filter rows where biomaterial_id is not null df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] - df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) - # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] - # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', 'differentiated_cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) - # Apply the mask to filter out rows df_filtered = df[mask] - # Check for mandatory fields and create Differentiated CellLine objects differentiated_cell_lines = [] + for _, row in df_filtered.iterrows(): differentiated_biomaterial_id = row['differentiated_cell_line.biomaterial_core.biomaterial_id'] biomaterial_id = row.get('cell_line.biomaterial_core.biomaterial_id') @@ -635,18 +685,11 @@ def parse_library_preparations(self, sheet_name, action, errors): list A list of LibraryPreparation objects parsed from the specified sheet. 
""" - if action.upper() == 'MODIFY': - skip_rows = 0 - else: - skip_rows = 3 - - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) df.columns = df.columns.str.strip() # df = df.rename(columns=column_mapping) - # Remove unnamed columns (columns without headers) # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] - # Check if the required column exists required_columns = [ 'library_preparation.biomaterial_core.biomaterial_id', @@ -654,28 +697,27 @@ def parse_library_preparations(self, sheet_name, action, errors): 'differentiated_cell_line.biomaterial_core.biomaterial_id', 'library_preparation_protocol.protocol_core.protocol_id' ] + for col in required_columns: if col not in df.columns: errors.append(f"The column '{col}' does not exist in the Library Preparation sheet.") + return [], df + # Filter rows where biomaterial_id is not null df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] - df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) - # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['library_preparation.biomaterial_core.biomaterial_id'] - # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', 'library_preparation.biomaterial_core.biomaterial_id'))).all(axis=1) - # Apply the mask to filter out rows df_filtered = df[mask] - # Check for mandatory fields and create Library Preparation objects library_preparations = [] + for _, row in df_filtered.iterrows(): library_preparation_id = row['library_preparation.biomaterial_core.biomaterial_id'] dissociation_protocol_id = row.get('dissociation_protocol.protocol_core.protocol_id') @@ -733,12 +775,7 @@ def 
parse_sequencing_files(self, sheet_name, action, errors): list A list of SequencingFile objects parsed from the specified sheet. """ - if action.upper() == 'MODIFY': - skip_rows = 0 - else: - skip_rows = 3 - - df = pd.read_excel(self.file_path, sheet_name=sheet_name, engine='openpyxl', skiprows=skip_rows) + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) df.columns = df.columns.str.strip() # df = df.rename(columns=column_mapping) @@ -752,29 +789,29 @@ def parse_sequencing_files(self, sheet_name, action, errors): 'sequencing_protocol.protocol_core.protocol_id', 'sequence_file.read_index' ] + for col in required_columns: if col not in df.columns: errors.append(f"The column '{col}' does not exist in the Sequencing File sheet.") + return [], df + # Filter rows where file_name is not null df = df[df['sequence_file.file_core.file_name'].notna()] - df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) - # Define columns to check for values starting with 'ABC' or 'XYZ' cols_to_check = ['sequence_file.file_core.file_name'] - # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'The name of the file.', 'Include the file extension in the file name. 
For example: R1.fastq.gz; codebook.json', 'sequence_file.file_core.file_name'))).all(axis=1) - # Apply the mask to filter out rows df_filtered = df[mask] # Check for mandatory fields and create Sequencing file objects sequencing_files = [] + for _, row in df_filtered.iterrows(): file_name = row['sequence_file.file_core.file_name'] library_preparation_id = row.get('library_preparation.biomaterial_core.biomaterial_id') @@ -809,6 +846,68 @@ def parse_sequencing_files(self, sheet_name, action, errors): return sequencing_files, df_filtered + def parse_expression_alteration(self, sheet_name, action, errors): + """ + Parses data related to expression alterations from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing expression alterations data. + + Returns: + -------- + list + A list of ExpressionAlterationStrategy objects parsed from the specified sheet. + """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + # Check if the required column exists + required_columns = ['expression_alteration_id'] + missing_columns = [col for col in required_columns if col not in df.columns] + + if missing_columns: + errors.append( + f"The following required columns are missing in the Expression Alteration Strategy sheet: {', '.join(missing_columns)}") + return [], df # Return early if required columns are missing + + # Filter rows where 'expression_alteration_id' is not null + df = df[df['expression_alteration_id'].notna()] + # Replace invalid float values (e.g., NaN, infinite) with None + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define unwanted patterns + unwanted_patterns = ( + 'FILL OUT INFORMATION BELOW THIS ROW', + 'A unique ID for the gene expression alteration instance..', + 'ID should have no spaces. 
For example: JAXPE0001_MEIS1, MSKKI119_MEF2C, NWU_AID' + ) + # Create a mask to filter out rows with unwanted starting values + mask = df['expression_alteration_id'].astype(str).str.startswith(unwanted_patterns) + df_filtered = df[~mask] + + # Create ExpressionAlterationStrategy objects + expression_alterations = [] + + for _, row in df_filtered.iterrows(): + expression_alterations.append( + ExpressionAlterationStrategy( + expression_alteration_id=row.get('expression_alteration_id'), + protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), + allele_specific=row.get('gene_expression_alteration_protocol.allele_specific'), + altered_gene_symbols=row.get('gene_expression_alteration_protocol.altered_gene_symbols'), + altered_gene_ids=row.get('gene_expression_alteration_protocol.altered_gene_ids'), + targeted_genomic_region=row.get('gene_expression_alteration_protocol.targeted_genomic_region'), + expected_alteration_type=row.get('gene_expression_alteration_protocol.expected_alteration_type'), + sgrna_target=row.get('gene_expression_alteration_protocol.crispr.sgrna_target'), + protocol_method_text=row.get('gene_expression_alteration_protocol.method.text'), + altered_locus=None, # Placeholder if required + guide_sequence=None, # Placeholder if required + id=row.get('Id') + ) + ) + + return expression_alterations, df_filtered + def get_cell_lines(self, sheet_name, action, errors): """ Retrieves parsed cell lines data from a specified sheet in the Excel file. @@ -825,8 +924,8 @@ def get_cell_lines(self, sheet_name, action, errors): list A list of CellLine objects parsed from the specified sheet. 
""" - cell_lines, cell_lines_df = self.parse_cell_lines(sheet_name, action, errors) - return cell_lines, cell_lines_df + cell_lines, cell_lines_df, parent_cell_line_name = self.parse_cell_lines(sheet_name, action, errors) + return cell_lines, cell_lines_df, parent_cell_line_name def get_differentiated_cell_lines(self, sheet_name, action, errors): """ @@ -887,3 +986,7 @@ def get_sequencing_files(self, sheet_name, action, errors): """ sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action, errors) return sequencing_files, df_filtered + + def get_expression_alterations(self, sheet_name, action, errors): + expression_alterations, df_filtered = self.parse_expression_alteration(sheet_name, action, errors) + return expression_alterations, df_filtered From 07209394b5c0ffb3fd282ce33c7f83b80ccc9cc4 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 13 Aug 2024 16:17:53 +0100 Subject: [PATCH 31/55] adding expression alteration support and refactoring --- ait/commons/util/command/submit.py | 71 ++++- ait/commons/util/command/submit_file.py | 397 ++++++++++++++---------- ait/commons/util/spreadsheet_util.py | 4 +- 3 files changed, 287 insertions(+), 185 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index acf81bf..c5898be 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -6,10 +6,17 @@ import pandas as pd import numpy as np from urllib.parse import urlparse + from ait.commons.util.user_profile import get_profile from ait.commons.util.provider_api_util import APIProvider +def equality(cell_line, expression_alteration): + return expression_alteration.expression_alteration_id.replace(" ", + "").strip() == cell_line.expression_alteration_id.replace( + " ", "").strip() + + def get_id_from_url(url): """ Extracts and returns the ID from a given URL. 
@@ -88,7 +95,7 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dataset_id, + def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, submission_envelope_id, dataset_id, access_token, action, errors): """ Submits a cell line as a biomaterial entity to a specified submission envelope. @@ -124,6 +131,9 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dat access_token ) + self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, + expression_alterations) + print(f"Linking Cell Line Biomaterial: {cell_line.biomaterial_id} to dataset {dataset_id}") self.link_to_dataset('biomaterial', dataset_id, cell_line_entity_id, access_token) @@ -138,6 +148,19 @@ def handle_cell_line(self, cell_line, cell_lines_df, submission_envelope_id, dat return cell_line_entity_id + def link_cell_line_with_expression_alterations(self, access_token, cell_line, cell_line_entity_id, + expression_alterations): + for expression_alteration in expression_alterations: + if cell_line.expression_alteration_id is not None: + if equality(cell_line, expression_alteration): + print(f"Linking cell line {cell_line_entity_id} " + f"as derived by process of {expression_alteration.expression_alteration_id}") + + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{cell_line_entity_id}/derivedByProcesses", + expression_alteration.id, 'processes', access_token + ) + def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_cell_line, differentiated_cell_lines_df, submission_envelope_id, dataset_id, access_token, action, errors): @@ -413,7 +436,10 @@ def create_process(self, access_token, dataset_id, process_data, submission_enve return process_entity_id - def multi_type_submission(self, cell_lines, cell_lines_df, + def multi_type_submission(self, + cell_lines, + 
expression_alterations, + cell_lines_df, differentiated_cell_lines_df, library_preparations_df, sequencing_file_df, @@ -443,6 +469,7 @@ def multi_type_submission(self, cell_lines, cell_lines_df, for cell_line in cell_lines: cell_line_entity_id = self.handle_cell_line(cell_line, + expression_alterations, cell_lines_df, submission_envelope_id, dataset_id, @@ -491,11 +518,10 @@ def multi_type_submission(self, cell_lines, cell_lines_df, library_preparations_df = None sequencing_file_df = None - return (cell_lines_df, - differentiated_cell_lines_df, - library_preparations_df, - sequencing_file_df, - message) + return ([cell_lines_df, + differentiated_cell_lines_df, + library_preparations_df, + sequencing_file_df], message) def typed_submission(self, type, file, access_token): """ @@ -516,17 +542,32 @@ def typed_submission(self, type, file, access_token): if entity_id: if type == 'dataset': - study_id = self.args.study or input("Input study id to link this dataset: ").lower() - self.link_dataset_to_study(entity_id, study_id, access_token) - + if self.args.study is not None: + study_id = self.args.study + self.link_dataset_to_study(entity_id, study_id, access_token) + else: + link_to_study = input("Do you want to link this dataset to a study? " + "(yes/no): ").lower() + if link_to_study == 'yes': + study_id = input("Input study id: ").lower() + self.link_dataset_to_study(entity_id, study_id, access_token) elif type == 'biomaterial': - dataset_id = self.args.dataset or input("Input dataset id to link this biomaterial: ").lower() - self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) + if self.args.dataset is not None: + dataset_id = self.args.dataset + self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) + else: + link_to_dataset = input("Do you want to link this biomaterial to a " + "dataset? 
(yes/no): ").lower() + if link_to_dataset == 'yes': + dataset_id = input("Input dataset id: ").lower() - process_id = self.args.process - if process_id: - self.link_biomaterial_to_process(entity_id, process_id, access_token) + self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) + # Linking biomaterial to process + if self.args.process is not None: + process_id = self.args.process + + self.link_biomaterial_to_process(entity_id, process_id, access_token) return True, entity_id else: print("Unsupported type") diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index dfe2d3c..dc52e2e 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -1,6 +1,5 @@ # Import necessary modules/classes from ait.commons.util package import os -import sys import pandas as pd from ait.commons.util.aws_client import Aws @@ -50,15 +49,6 @@ def create_expression_alterations(submission_instance, submission_envelope_id, a return expression_alterations -def link_cell_line_parent_cell_line_expression_alretation(submission_instance, - submission_envelope_id, - access_token, - parent_cell_line_id, - created_expression_alterations, - cell_lines): - pass - - class CmdSubmitFile: BASE_URL = 'https://api.ingest.dev.archive.morphic.bio' SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" @@ -72,38 +62,43 @@ def __init__(self, args): args: Command-line arguments passed to the script. 
""" self.args = args - self.access_token = get_profile('morphic-util').access_token self.user_profile = get_profile('morphic-util') + self.access_token = self.user_profile.access_token self.aws = Aws(self.user_profile) self.provider_api = APIProvider(self.BASE_URL) self.validation_errors = [] self.submission_errors = [] - - if hasattr(self.args, 'action') and self.args.action is not None: - self.action = self.args.action + self.submission_envelope_id = None + + # Assign and validate required arguments + self.action = self._get_required_arg('action', "Submission action (ADD, MODIFY or DELETE) is mandatory") + self.dataset = self._get_required_arg('dataset', ( + "Dataset is mandatory to be registered before submitting dataset metadata. " + "Please submit your study using the submit option, register your dataset using " + "the same option, and link your dataset to your study before proceeding with this submission." + )) + + # Validate file argument only if action is not DELETE + if self.action != 'DELETE': + self.file = self._get_required_arg('file', "File is mandatory") else: - print("Submission action (ADD, MODIFY or DELETE) is mandatory") - return + print(f"Deleting dataset {self.dataset}") - if hasattr(self.args, 'dataset') and self.args.dataset is not None: - self.dataset = self.args.dataset - else: - print( - "Dataset is mandatory to be registered before submitting dataset metadata, " - "We request you to submit your study using the submit option, register your " - "dataset using the same option and link your dataset to your study " - "before proceeding with this submission." - ) - return + def _get_required_arg(self, attr_name, error_message): + """ + Helper function to get a required argument and print an error message if it's missing. 
- if hasattr(self.args, 'file') and self.args.file is not None: - self.file = self.args.file - else: - if self.action != 'DELETE': - print("File is mandatory") - return - else: - print(f"Deleting dataset {self.dataset}") + Args: + attr_name (str): The name of the attribute to check in self.args. + error_message (str): The error message to print if the attribute is missing. + + Returns: + The value of the attribute if it exists, otherwise None. + """ + value = getattr(self.args, attr_name, None) + if value is None: + print(error_message) + return value def run(self): """ @@ -111,146 +106,212 @@ def run(self): """ submission_instance = CmdSubmit(self) - if self.action.lower() == 'delete': - self.file = None - submission_instance.delete_dataset(self.dataset, self.access_token) - return True, None + if self._is_delete_action(): + return self._handle_delete(submission_instance) - list_instance = CmdList(self.aws, self.args) - list_of_files_in_upload_area = list_instance.list_bucket_contents_and_return(self.dataset, '') + list_of_files_in_upload_area = self._list_files_in_upload_area() if self.file: - # Initialize SpreadsheetParser with the provided file path - parser = SpreadsheetSubmitter(self.file) - - # Parse different sections of the spreadsheet using defined column mappings - expression_alterations, expression_alterations_df = (parser.get_expression_alterations - ('Expression alteration strategy', - self.action, self.validation_errors)) - cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines('Cell line', self.action, - self.validation_errors) - differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action, self.validation_errors) - merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, - self.validation_errors) - library_preparations, library_preparations_df = parser.get_library_preparations( - 'Library preparation', self.action, 
self.validation_errors) - merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, - library_preparations, self.validation_errors) - sequencing_files, sequencing_files_df = parser.get_sequencing_files( - 'Sequence file', self.action, self.validation_errors) - - # validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, self.dataset) - - merge_library_preparation_sequencing_file(library_preparations, sequencing_files, - self.validation_errors) - upload_instance = CmdUpload(self.aws, self.args) - try: - if len(self.validation_errors) > 0: - raise ValidationError(self.validation_errors) - else: - print(f"File {self.file} is validated successfully. Initiating submission") - print(f"File {self.file} being uploaded to storage") - upload_instance.upload_file(self.dataset, self.file, os.path.basename(self.file)) - except ValidationError as e: - print(e) - sys.exit(1) - - if self.action.lower() == 'add': - submission_envelope_response, status_code = submission_instance.create_new_submission_envelope( - self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token - ) - if status_code in (200, 201): - self_url = submission_envelope_response['_links']['self']['href'] - submission_envelope_id = get_id_from_url(self_url) - print(f"Submission envelope for this submission is: {submission_envelope_id}") - else: - if status_code == 401: - message = "Unauthorized, refresh your access token using the config option" - return False, message - else: - return False, f"Encountered failure with status code {status_code}" - else: - submission_envelope_id = None - - # Perform the submission and get the updated dataframes - try: - parent_cell_line_id = (submission_instance. 
- use_existing_envelope_and_submit_entity('biomaterial', - get_content( - parent_cell_line_name), - submission_envelope_id, - self.access_token)) - - created_expression_alterations = create_expression_alterations(submission_instance, - submission_envelope_id, - self.access_token, - expression_alterations) - - link_cell_line_parent_cell_line_expression_alretation(submission_instance, - submission_envelope_id, - self.access_token, - parent_cell_line_id, - created_expression_alterations, - cell_lines) - # submission now - ( - updated_cell_lines_df, updated_differentiated_cell_lines_df, - updated_library_preparations_df, updated_sequencing_files_df, - message - ) = submission_instance.multi_type_submission( - cell_lines, cell_lines_df, differentiated_cell_lines_df, - library_preparations_df, sequencing_files_df, submission_envelope_id, - self.dataset, self.access_token, self.action, self.submission_errors - ) - - # Save the updated dataframes to a single Excel file with multiple sheets - if message == 'SUCCESS': - output_file = "submission-result.xlsx" - try: - # Write to Excel file - with pd.ExcelWriter(output_file, engine='openpyxl') as writer: - updated_cell_lines_df.to_excel(writer, sheet_name='Cell line', index=False) - updated_differentiated_cell_lines_df.to_excel(writer, sheet_name='Differentiated cell line', - index=False) - updated_library_preparations_df.to_excel(writer, sheet_name='Library preparation', - index=False) - updated_sequencing_files_df.to_excel(writer, sheet_name='Sequence file', index=False) - - # Confirm file was written and path exists - if os.path.exists(output_file): - # Attempt file upload - upload_instance.upload_file(self.dataset, output_file, os.path.basename(output_file)) - print(f"File {output_file} uploaded successfully.") - else: - raise FileNotFoundError( - f"The output file {output_file} was not created or cannot be found.") - - except Exception as e: - print( - f"Failed to upload file {output_file}. 
Error: {e}, Refer dataset {self.dataset} for " - f"tracing metadata") - return True, "SUBMISSION IS SUCCESSFUL." - else: - return self.delete_actions(submission_envelope_id, - submission_instance, - None) + self._process_submission(submission_instance, list_of_files_in_upload_area) + return True, "SUBMISSION IS SUCCESSFUL." except Exception as e: - return self.delete_actions(submission_envelope_id, submission_instance, e) + return self.delete_actions(self.submission_envelope_id, submission_instance, e) + + def _is_delete_action(self): + """Check if the current action is 'DELETE'.""" + return self.action.lower() == 'delete' + + def _handle_delete(self, submission_instance): + """Handle the deletion of a dataset.""" + self.file = None + submission_instance.delete_dataset(self.dataset, self.access_token) + return True, None + + def _list_files_in_upload_area(self): + """List files in the upload area.""" + list_instance = CmdList(self.aws, self.args) + return list_instance.list_bucket_contents_and_return(self.dataset, '') + + def _process_submission(self, submission_instance, list_of_files_in_upload_area): + """Process the file submission.""" + parser = SpreadsheetSubmitter(self.file) + parsed_data = self._parse_spreadsheet(parser) + self._validate_and_upload(parsed_data, submission_instance, list_of_files_in_upload_area) + + # TODO: Handle expression alterations in MODIFY + if self._is_add_action(): + self._create_submission_envelope(submission_instance) + # TODO: read parent cell line name + parent_cell_line_id = self._submit_parent_cell_line(submission_instance, "test1.1") + created_expression_alterations = self._submit_expression_alterations(submission_instance, parsed_data) + self.link_parent_cell_line_expression_alteration( + submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + ) + + updated_dfs, message = self._perform_main_submission(submission_instance, parsed_data) + + if message == 'SUCCESS': + 
self._save_and_upload_results(updated_dfs) + else: + return self.delete_actions(self.submission_envelope_id, submission_instance, None) + + def _parse_spreadsheet(self, parser): + """Parse the spreadsheet into different sections.""" + expression_alterations, expression_alterations_df = parser.get_expression_alterations( + 'Expression alteration strategy', self.action, self.validation_errors + ) + cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( + 'Cell line', self.action, self.validation_errors + ) + differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( + 'Differentiated cell line', self.action, self.validation_errors + ) + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) - def delete_actions(self, submission_envelope_id, submission_instance, e): + library_preparations, library_preparations_df = parser.get_library_preparations( + 'Library preparation', self.action, self.validation_errors + ) + merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, + self.validation_errors) + sequencing_files, sequencing_files_df = parser.get_sequencing_files( + 'Sequence file', self.action, self.validation_errors + ) + merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) + + return { + "expression_alterations": expression_alterations, + "expression_alterations_df": expression_alterations_df, + "cell_lines": cell_lines, + "cell_lines_df": cell_lines_df, + "parent_cell_line_name": parent_cell_line_name, + "differentiated_cell_lines": differentiated_cell_lines, + "differentiated_cell_lines_df": differentiated_cell_lines_df, + "library_preparations": library_preparations, + "library_preparations_df": library_preparations_df, + "sequencing_files": sequencing_files, + "sequencing_files_df": sequencing_files_df, + } + + def _validate_and_upload(self, parsed_data, 
submission_instance, list_of_files_in_upload_area): + """Validate the parsed data and upload the file.""" + # validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset) + upload_instance = CmdUpload(self.aws, self.args) + + if self.validation_errors: + raise ValidationError(self.validation_errors) + + print(f"File {self.file} is validated successfully. Initiating submission") + print(f"File {self.file} being uploaded to storage") + upload_instance.upload_file(self.dataset, self.file, os.path.basename(self.file)) + + def _is_add_action(self): + """Check if the current action is 'ADD'.""" + return self.action.lower() == 'add' + + def _is_modify_action(self): + """Check if the current action is 'ADD'.""" + return self.action.lower() == 'modify' + + def _create_submission_envelope(self, submission_instance): + """Create a new submission envelope.""" + submission_envelope_response, status_code = submission_instance.create_new_submission_envelope( + self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token + ) + if status_code in (200, 201): + self.submission_envelope_id = get_id_from_url(submission_envelope_response['_links']['self']['href']) + print(f"Submission envelope for this submission is: {self.submission_envelope_id}") + else: + raise Exception(f"Failed to create submission envelope. 
Status code: {status_code}") + + def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): + """Submit the parent cell line.""" + return submission_instance.use_existing_envelope_and_submit_entity( + 'biomaterial', get_content(parent_cell_line_name), + self.submission_envelope_id, self.access_token + ) + + def _submit_expression_alterations(self, submission_instance, parsed_data): + """Submit expression alterations.""" + return create_expression_alterations( + submission_instance, self.submission_envelope_id, self.access_token, + parsed_data['expression_alterations'] + ) + + def _perform_main_submission(self, submission_instance, parsed_data): + """Perform the main submission.""" + # Unpack the returned values into a list and the message separately + updated_dfs, message = submission_instance.multi_type_submission( + parsed_data['cell_lines'], parsed_data['expression_alterations'], parsed_data['cell_lines_df'], + parsed_data['differentiated_cell_lines_df'], parsed_data['library_preparations_df'], + parsed_data['sequencing_files_df'], self.submission_envelope_id, + self.dataset, self.access_token, self.action, self.submission_errors + ) + return updated_dfs, message + + def _save_and_upload_results(self, updated_dfs): + """Save the updated dataframes and upload the results.""" + output_file = "submission-result.xlsx" try: - print("SUBMISSION has failed, rolling back") - print("SUBMISSION ERRORS are listed below. 
Any metadata created will be deleted now, please wait until " - "the clean-up finishes") - print("\n".join(self.submission_errors)) - submission_instance.delete_submission(submission_envelope_id, self.access_token, True) - submission_instance.delete_dataset(self.dataset, self.access_token) - - if e is None: - return False, "Submission has failed, rolled back" + with pd.ExcelWriter(output_file, engine='openpyxl') as writer: + updated_dfs[0].to_excel(writer, sheet_name='Cell line', index=False) + updated_dfs[1].to_excel(writer, sheet_name='Differentiated cell line', index=False) + updated_dfs[2].to_excel(writer, sheet_name='Library preparation', index=False) + updated_dfs[3].to_excel(writer, sheet_name='Sequence file', index=False) + + if os.path.exists(output_file): + CmdUpload(self.aws, self.args).upload_file(self.dataset, output_file, os.path.basename(output_file)) + print(f"File {output_file} uploaded successfully.") else: - return False, f"An error occurred: {str(e)}" + raise FileNotFoundError(f"The output file {output_file} was not created or cannot be found.") + except Exception as e: + print(f"Failed to upload file {output_file}. Error: {e}, Refer dataset {self.dataset} for tracing metadata") + + def delete_actions(self, submission_envelope_id, submission_instance, error=None): + """Handle actions needed when a submission fails.""" + # TODO: handle validation errors here + try: + if self._is_add_action(): + self._handle_add_action_failure(submission_envelope_id, submission_instance, error) + elif self._is_modify_action(): + self._handle_modify_action_failure(error) except Exception as e: - print(f"Failed to rollback submission {submission_envelope_id}") + print(f"Failed to rollback submission {submission_envelope_id}: {str(e)}") + + def _handle_add_action_failure(self, submission_envelope_id, submission_instance, error): + """Handle failure during 'ADD' action.""" + print("SUBMISSION has failed, rolling back") + print("SUBMISSION ERRORS are listed below. 
Any metadata created will be deleted now, please wait until " + "the clean-up finishes") + print("\n".join(self.submission_errors)) + + submission_instance.delete_submission(submission_envelope_id, self.access_token, True) + submission_instance.delete_dataset(self.dataset, self.access_token) + + if error: + return False, f"An error occurred: {str(error)}" + else: + return False, "Submission has failed, rolled back" + + def _handle_modify_action_failure(self, error): + """Handle failure during 'MODIFY' action.""" + print("SUBMISSION has failed, contact the support team for next actions") + print("SUBMISSION ERRORS are listed below.") + print("\n".join(self.submission_errors)) + + if error: + return False, f"An error occurred: {str(error)}" + else: + return False, "Submission has failed, rolled back" + + def link_parent_cell_line_expression_alteration(self, submission_instance, + access_token, + parent_cell_line_id, + created_expression_alterations): + for expression_alteration in created_expression_alterations: + print(f"Linking parent cell line {parent_cell_line_id} " + f"as input to process of {expression_alteration.expression_alteration_id}") + submission_instance.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{parent_cell_line_id}/inputToProcesses", + expression_alteration.id, 'processes', access_token + ) diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index e6392fe..4b4c2e7 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -566,7 +566,7 @@ def parse_cell_lines(self, sheet_name, action, errors): biomaterial_id = row['cell_line.biomaterial_core.biomaterial_id'] derived_from_accession = row.get('cell_line.derived_cell_line_accession') cell_type = row.get('cell_line.type') - # expression_alteration_id = row.get('expression_alteration_id') + expression_alteration_id = row.get('expression_alteration_id') # Error handling for missing mandatory fields if 
pd.isnull(biomaterial_id): @@ -586,7 +586,7 @@ def parse_cell_lines(self, sheet_name, action, errors): protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), zygosity=row.get('cell_line.zygosity'), cell_type=cell_type, - expression_alteration_id=None, + expression_alteration_id=expression_alteration_id, id=row.get('Id') ) ) From 753a3ad67dde469a58178f060a8af9aa31aaeda9 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 14 Aug 2024 11:54:43 +0100 Subject: [PATCH 32/55] adding expression alteration update support and more refactoring --- ait/commons/util/command/submit.py | 515 +++++++++++++----------- ait/commons/util/command/submit_file.py | 175 +++++--- ait/commons/util/spreadsheet_util.py | 1 + 3 files changed, 391 insertions(+), 300 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index c5898be..34d1794 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -42,6 +42,127 @@ def get_process_content(name): return process_data +def update_dataframe(input_df, created_entity_id, entity_id, raw_entity_rep_column_name): + """ + Updates the DataFrame with the new or modified cell line entity ID. + Returns: + - None + """ + entity_id_column_name = "Id" + + if entity_id_column_name not in input_df.columns: + input_df[entity_id_column_name] = np.nan + + input_df[entity_id_column_name] = input_df[entity_id_column_name].astype(object) + + input_df.loc[ + input_df[raw_entity_rep_column_name] == entity_id, + entity_id_column_name + ] = created_entity_id + + +def transform(file): + """ + Transforms the input file to a JSON object. + + Parameters: + file (str): The file path. + + Returns: + dict: The JSON object. 
+ """ + if file.endswith('.tsv'): + json_data = [] + with open(file, 'r', newline='') as file: + reader = csv.DictReader(file, delimiter='\t') + for row in reader: + json_data.append(row) + return {'content': json_data} + + elif file.endswith('.csv'): + df = pd.read_csv(file) + return {'content': df.to_dict(orient='records')} + + else: + with open(file, 'r') as file: + return json.load(file) + + +def create_new_submission_envelope(url, access_token): + """ + Creates a new submission envelope. + + Parameters: + url (str): The URL to send the request to. + access_token (str): Access token for authorization. + + Returns: + tuple: A tuple containing the response data and the status code. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json={}) + status_code = response.status_code + + if status_code in {200, 201}: + response_data = response.json() + return response_data, status_code + + return None, status_code + + +def post_to_provider_api_and_get_entity_id(url, data, access_token): + """ + Sends a POST request to the specified URL and returns the entity ID from the response. + + Parameters: + url (str): The URL to send the request to. + data (dict): The data to be sent in the POST request. + access_token (str): Access token for authorization. + + Returns: + str: The entity ID extracted from the response URL. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json=data) + response_data = response.json() + entity_url = response_data['_links']['self']['href'] + + return get_id_from_url(entity_url) + + +def post_to_provider_api(url, data_type_in_hal_link, data, access_token): + """ + Sends a POST request to the specified URL. + + Parameters: + url (str): The URL to send the request to. + data_type_in_hal_link (str): The data type in the HAL link. 
+ data (dict): The data to be sent in the POST request. + access_token (str): Access token for authorization. + + Returns: + str: The URL from the response. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json=data) + response_data = response.json() + url = response_data['_links'][data_type_in_hal_link]['href'] + + return url + + class CmdSubmit: """ A class to handle submission of studies, datasets, and biomaterials to a server. @@ -105,48 +226,61 @@ def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, sub - cell_lines_df: DataFrame containing information about cell lines. - submission_envelope_id: ID of the submission envelope where the entity will be submitted. - access_token: Access token for authentication and authorization. + - action: The action to be performed, either 'create' or 'modify'. + - errors: List to accumulate any error messages encountered. Returns: - - cell_line_entity_id: Entity ID of the submitted cell line biomaterial. + - cell_line_entity_id: Entity ID of the submitted or modified cell line biomaterial. 
""" - if action == 'modify' or action == 'MODIFY': + if action.lower() == 'modify': success = self.patchEntity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) - if success: print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") + update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, + 'cell_line.biomaterial_core.biomaterial_id') else: errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") + return cell_line.id else: - cell_line_entity_id_column_name = "Id" - - if cell_line_entity_id_column_name not in cell_lines_df.columns: - cell_lines_df[cell_line_entity_id_column_name] = np.nan + cell_line_entity_id = self.create_cell_line_entity(cell_line, expression_alterations, + submission_envelope_id, dataset_id, access_token) + update_dataframe(cell_lines_df, cell_line_entity_id, cell_line.biomaterial_id, + 'cell_line.biomaterial_core.biomaterial_id') + return cell_line_entity_id - print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") + def create_cell_line_entity(self, cell_line, expression_alterations, submission_envelope_id, + dataset_id, + access_token): + """ + Creates a new cell line entity and links it with a dataset and expression alterations. - cell_line_entity_id = self.use_existing_envelope_and_submit_entity( - 'biomaterial', - cell_line.to_dict(), - submission_envelope_id, - access_token - ) + Parameters: + - cell_line: The cell line object to be created. + - expression_alterations: Any associated expression alterations. + - submission_envelope_id: ID of the submission envelope where the entity will be submitted. + - dataset_id: The dataset ID to link the cell line entity to. + - access_token: Access token for authentication and authorization. - self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, - expression_alterations) + Returns: + - cell_line_entity_id: The ID of the newly created cell line entity. 
+ """ + print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") - print(f"Linking Cell Line Biomaterial: {cell_line.biomaterial_id} to dataset {dataset_id}") + cell_line_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + cell_line.to_dict(), + submission_envelope_id, + access_token + ) - self.link_to_dataset('biomaterial', dataset_id, cell_line_entity_id, access_token) + self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, + expression_alterations) - cell_lines_df[cell_line_entity_id_column_name] = (cell_lines_df[cell_line_entity_id_column_name] - .astype(object)) + print(f"Linking Cell Line Biomaterial: {cell_line.biomaterial_id} to dataset {dataset_id}") - cell_lines_df.loc[ - cell_lines_df['cell_line.biomaterial_core.biomaterial_id'] == cell_line.biomaterial_id, - cell_line_entity_id_column_name - ] = cell_line_entity_id + self.link_to_dataset('biomaterial', dataset_id, cell_line_entity_id, access_token) - return cell_line_entity_id + return cell_line_entity_id def link_cell_line_with_expression_alterations(self, access_token, cell_line, cell_line_entity_id, expression_alterations): @@ -181,83 +315,79 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce success = self.patchEntity('biomaterial', differentiated_cell_line.id, differentiated_cell_line.to_dict(), access_token) - if success: print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") + + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, + differentiated_cell_line.biomaterial_id, + 'differentiated_cell_line.biomaterial_core.biomaterial_id') else: errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") + return differentiated_cell_line.id else: - print("Cell line has differentiated cell lines, creating differentiation process 
to link them") - - differentiation_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('differentiation'), - submission_envelope_id) - - differentiated_biomaterial_to_entity_id_map = {} - differentiated_cell_line_entity_id_column_name = "Id" - - if differentiated_cell_line_entity_id_column_name not in differentiated_cell_lines_df.columns: - differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = np.nan - - print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " - f"as a child of Cell line: {cell_line_entity_id}") - - differentiated_entity_id = self.create_child_biomaterial( - cell_line_entity_id, - differentiated_cell_line.to_dict(), - access_token - ) - - print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") - - print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"to envelope: {submission_envelope_id}") - - self.link_entity_to_envelope( - 'biomaterial', - differentiated_entity_id, - submission_envelope_id, - access_token - ) - - print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"to dataset: {dataset_id}") + differentiated_cell_line_id = self.create_differentiated_cell_line_entity(access_token, cell_line_entity_id, + dataset_id, + differentiated_cell_line, + submission_envelope_id) + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, + differentiated_cell_line.biomaterial_id, + 'differentiated_cell_line.biomaterial_core.biomaterial_id') + return differentiated_cell_line_id + + def create_differentiated_cell_line_entity(self, access_token, cell_line_entity_id, dataset_id, + differentiated_cell_line, + submission_envelope_id): + print("Cell line has differentiated cell lines, creating differentiation process to link them") + + differentiation_process_entity_id = self.create_process(access_token, + dataset_id, + 
get_process_content('differentiation'), + submission_envelope_id) + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " + f"as a child of Cell line: {cell_line_entity_id}") + + differentiated_entity_id = self.create_child_biomaterial( + cell_line_entity_id, + differentiated_cell_line.to_dict(), + access_token + ) - self.link_to_dataset('biomaterial', dataset_id, - differentiated_entity_id, access_token) + print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to envelope: {submission_envelope_id}") - print(f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " - f"input to process : {differentiation_process_entity_id}") + self.link_entity_to_envelope( + 'biomaterial', + differentiated_entity_id, + submission_envelope_id, + access_token + ) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to dataset: {dataset_id}") - print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") + self.link_to_dataset('biomaterial', dataset_id, + differentiated_entity_id, access_token) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + print(f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " + f"input to process : {differentiation_process_entity_id}") - differentiated_biomaterial_to_entity_id_map[ - differentiated_cell_line.biomaterial_id] = differentiated_entity_id + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + 
differentiation_process_entity_id, 'processes', access_token + ) - differentiated_cell_lines_df[differentiated_cell_line_entity_id_column_name] = differentiated_cell_lines_df[ - differentiated_cell_line_entity_id_column_name].astype(object) + print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") - differentiated_cell_lines_df.loc[ - differentiated_cell_lines_df[ - 'differentiated_cell_line.biomaterial_core.biomaterial_id'] == differentiated_cell_line.biomaterial_id, - differentiated_cell_line_entity_id_column_name - ] = differentiated_entity_id + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) - return differentiated_entity_id + return differentiated_entity_id def handle_library_preparation(self, differentiated_entity_id, library_preparation, library_preparations_df, submission_envelope_id, @@ -278,71 +408,78 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati success = self.patchEntity('biomaterial', library_preparation.id, library_preparation.to_dict(), access_token) - if success: print(f"Updated library preparation biomaterial: {library_preparation.id} / " f"{library_preparation.biomaterial_id}") + + update_dataframe(library_preparations_df, library_preparation.id, + library_preparation.biomaterial_id, + 'library_preparation.biomaterial_core.biomaterial_id') else: errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " f"{library_preparation.biomaterial_id}") - else: - print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " - f"{differentiated_entity_id}") - library_preparation_entity_id = self.create_child_biomaterial( - differentiated_entity_id, - library_preparation.to_dict(), - access_token - ) + return library_preparation.id + else: + 
library_preparation_entity_id = self.create_library_preparation_entity(access_token, dataset_id, + differentiated_entity_id, + library_preparation, + submission_envelope_id) + update_dataframe(library_preparations_df, library_preparation_entity_id, + library_preparation.biomaterial_id, + 'library_preparation.biomaterial_core.biomaterial_id') - print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") + return library_preparation_entity_id - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to envelope: {submission_envelope_id}") + def create_library_preparation_entity(self, access_token, dataset_id, differentiated_entity_id, library_preparation, + submission_envelope_id): + print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " + f"{differentiated_entity_id}") - self.link_entity_to_envelope( - 'biomaterial', - library_preparation_entity_id, - submission_envelope_id, - access_token - ) + library_preparation_entity_id = self.create_child_biomaterial( + differentiated_entity_id, + library_preparation.to_dict(), + access_token + ) - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to dataset: {dataset_id}") + print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"to envelope: {submission_envelope_id}") - self.link_to_dataset('biomaterial', dataset_id, - library_preparation_entity_id, access_token) + self.link_entity_to_envelope( + 'biomaterial', + library_preparation_entity_id, + submission_envelope_id, + access_token + ) - print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"as input to library preparation process") + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"to dataset: {dataset_id}") - library_preparation_process_entity_id = 
self.create_process(access_token, - dataset_id, - get_process_content('library_preparation'), - submission_envelope_id) + self.link_to_dataset('biomaterial', dataset_id, + library_preparation_entity_id, access_token) - library_preparation_entity_id_column_name = "Id" + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"as input to library preparation process") - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"as derived by library preparation process") + library_preparation_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('library_preparation'), + submission_envelope_id) + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " + f"as derived by library preparation process") - library_preparations_df.loc[ - library_preparations_df[ - 'library_preparation.biomaterial_core.biomaterial_id'] == library_preparation.biomaterial_id, - library_preparation_entity_id_column_name - ] = library_preparation_entity_id + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) - return library_preparation_entity_id + return library_preparation_entity_id def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, sequencing_file_df, submission_envelope_id, 
dataset_id, @@ -536,7 +673,7 @@ def typed_submission(self, type, file, access_token): tuple: A tuple containing a boolean indicating success and the ID of the created entity. """ if type in ['study', 'dataset', 'biomaterial', 'process', 'file']: - data = self.transform(file) if file is not None else {} + data = transform(file) if file is not None else {} entity_id = self.create_new_envelope_and_submit_entity(type, data, access_token) @@ -596,9 +733,9 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ if not hal_entity: return None - entity_create_url = self.post_to_provider_api(self.submission_envelope_create_url, hal_entity, None, - access_token) - entity_self_hal_link = self.post_to_provider_api(entity_create_url, 'self', data, access_token) + entity_create_url = post_to_provider_api(self.submission_envelope_create_url, hal_entity, data, + access_token) + entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) entity_id = get_id_from_url(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -670,7 +807,7 @@ def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submi return None entity_create_url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/{hal_entity}" - entity_self_hal_link = self.post_to_provider_api(entity_create_url, 'self', data, access_token) + entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) entity_id = get_id_from_url(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -723,30 +860,6 @@ def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): url = f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses" self.perform_hal_linkage(url, process_id, 'processes', access_token) - def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): - """ - Sends a POST 
request to the specified URL. - - Parameters: - url (str): The URL to send the request to. - data_type_in_hal_link (str): The data type in the HAL link. - data (dict): The data to be sent in the POST request. - access_token (str): Access token for authorization. - - Returns: - str: The URL from the response. - """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {access_token}' - } - - response = requests.post(url, headers=headers, json=data) - response_data = response.json() - url = response_data['_links'][data_type_in_hal_link]['href'] - - return url - def delete_submission(self, submission_envelope_id, access_token, force_delete=False): """ Sends a DELETE request to delete a submission envelope. @@ -771,54 +884,6 @@ def delete_submission(self, submission_envelope_id, access_token, force_delete=F return response.status_code // 100 == 2 - def post_to_provider_api_and_get_entity_id(self, url, data, access_token): - """ - Sends a POST request to the specified URL and returns the entity ID from the response. - - Parameters: - url (str): The URL to send the request to. - data (dict): The data to be sent in the POST request. - access_token (str): Access token for authorization. - - Returns: - str: The entity ID extracted from the response URL. - """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {access_token}' - } - - response = requests.post(url, headers=headers, json=data) - response_data = response.json() - entity_url = response_data['_links']['self']['href'] - - return get_id_from_url(entity_url) - - def create_new_submission_envelope(self, url, access_token): - """ - Creates a new submission envelope. - - Parameters: - url (str): The URL to send the request to. - access_token (str): Access token for authorization. - - Returns: - tuple: A tuple containing the response data and the status code. 
- """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {access_token}' - } - - response = requests.post(url, headers=headers, json={}) - status_code = response.status_code - - if status_code in {200, 201}: - response_data = response.json() - return response_data, status_code - - return None, status_code - def perform_hal_linkage(self, url, input_id, link_to, access_token): """ Performs HAL linkage by sending a POST request. @@ -845,36 +910,10 @@ def perform_hal_linkage(self, url, input_id, link_to, access_token): else: print("Linkage successful") - def transform(self, file): - """ - Transforms the input file to a JSON object. - - Parameters: - file (str): The file path. - - Returns: - dict: The JSON object. - """ - if file.endswith('.tsv'): - json_data = [] - with open(file, 'r', newline='') as file: - reader = csv.DictReader(file, delimiter='\t') - for row in reader: - json_data.append(row) - return {'content': json_data} - - elif file.endswith('.csv'): - df = pd.read_csv(file) - return {'content': df.to_dict(orient='records')} - - else: - with open(file, 'r') as file: - return json.load(file) - def create_child_biomaterial(self, cell_line_entity_id, body, access_token): url = f"{self.base_url}/biomaterials/{cell_line_entity_id}/childBiomaterials" - entity_id = self.post_to_provider_api_and_get_entity_id(url, body, access_token) + entity_id = post_to_provider_api_and_get_entity_id(url, body, access_token) return entity_id def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, access_token): @@ -889,10 +928,10 @@ def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, acces """ if type == 'biomaterial': url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/biomaterials/{entity_id}" + self.provider_api.put_to_provider_api(url, access_token) elif type == 'file': url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/files/{entity_id}" - - 
self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put_to_provider_api(url, access_token) def delete_dataset(self, dataset, access_token): """ diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index dc52e2e..e3e0118 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -1,10 +1,13 @@ # Import necessary modules/classes from ait.commons.util package import os +import sys +from datetime import datetime +import numpy as np import pandas as pd from ait.commons.util.aws_client import Aws from ait.commons.util.command.list import CmdList -from ait.commons.util.command.submit import CmdSubmit, get_id_from_url +from ait.commons.util.command.submit import CmdSubmit, get_id_from_url, create_new_submission_envelope from ait.commons.util.command.upload import CmdUpload from ait.commons.util.user_profile import get_profile from ait.commons.util.provider_api_util import APIProvider @@ -34,7 +37,15 @@ def get_content(unique_value): return {"content": unique_value} -def create_expression_alterations(submission_instance, submission_envelope_id, access_token, expression_alterations): +def create_expression_alterations(submission_instance, submission_envelope_id, access_token, parsed_data): + expression_alterations = parsed_data['expression_alterations'] + expression_alterations_df = parsed_data['expression_alterations_df'] + + expression_alterations_entity_id_column_name = "Id" + + if expression_alterations_entity_id_column_name not in expression_alterations_df.columns: + expression_alterations_df[expression_alterations_entity_id_column_name] = np.nan + for expression_alteration in expression_alterations: # Submit the expression alteration and retrieve the ID expression_alteration_id = submission_instance.use_existing_envelope_and_submit_entity( @@ -45,8 +56,16 @@ def create_expression_alterations(submission_instance, submission_envelope_id, a ) # Set the retrieved 
ID in the ExpressionAlterationStrategy object expression_alteration.id = expression_alteration_id + expression_alterations_df[expression_alterations_entity_id_column_name] = ( + expression_alterations_df[expression_alterations_entity_id_column_name] + .astype(object)) + expression_alterations_df.loc[ + expression_alterations_df[ + 'expression_alteration_id'] == expression_alteration.expression_alteration_id, + expression_alterations_entity_id_column_name + ] = expression_alteration_id - return expression_alterations + return expression_alterations, expression_alterations_df class CmdSubmitFile: @@ -98,6 +117,7 @@ def _get_required_arg(self, attr_name, error_message): value = getattr(self.args, attr_name, None) if value is None: print(error_message) + sys.exit(1) return value def run(self): @@ -106,17 +126,28 @@ def run(self): """ submission_instance = CmdSubmit(self) - if self._is_delete_action(): - return self._handle_delete(submission_instance) - - list_of_files_in_upload_area = self._list_files_in_upload_area() - - if self.file: - try: - self._process_submission(submission_instance, list_of_files_in_upload_area) - return True, "SUBMISSION IS SUCCESSFUL." - except Exception as e: - return self.delete_actions(self.submission_envelope_id, submission_instance, e) + try: + if self._is_delete_action(): + return self._handle_delete(submission_instance) + + list_of_files_in_upload_area = self._list_files_in_upload_area() + + if self.file: + try: + self._process_submission(submission_instance, list_of_files_in_upload_area) + return True, "SUBMISSION IS SUCCESSFUL." + except Exception as e: + return self.delete_actions(self.submission_envelope_id, submission_instance, e) + except KeyboardInterrupt: + # Handle the interruption and exit gracefully + print("\nProcess interrupted by user. 
Exiting gracefully...") + self.delete_actions(self.submission_envelope_id, submission_instance, None) + sys.exit(0) # Exit with a zero status code indicating a clean exit + except Exception as e: + # Handle any other unexpected exceptions + print(f"An unexpected error occurred: {str(e)}") + self.delete_actions(self.submission_envelope_id, submission_instance, None) + sys.exit(1) # Exit with a non-zero status code indicating an error def _is_delete_action(self): """Check if the current action is 'DELETE'.""" @@ -138,13 +169,21 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) parser = SpreadsheetSubmitter(self.file) parsed_data = self._parse_spreadsheet(parser) self._validate_and_upload(parsed_data, submission_instance, list_of_files_in_upload_area) + # original expression alteration data frame + expression_alteration_df = parsed_data['expression_alterations_df'] + parent_cell_line_name = parsed_data['parent_cell_line_name'] # TODO: Handle expression alterations in MODIFY if self._is_add_action(): self._create_submission_envelope(submission_instance) - # TODO: read parent cell line name - parent_cell_line_id = self._submit_parent_cell_line(submission_instance, "test1.1") - created_expression_alterations = self._submit_expression_alterations(submission_instance, parsed_data) + print(f"Creating parental cell line with name {parent_cell_line_name}") + + parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + + print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + + created_expression_alterations, expression_alteration_df = self._submit_expression_alterations( + submission_instance, parsed_data) self.link_parent_cell_line_expression_alteration( submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations ) @@ -152,57 +191,68 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) 
updated_dfs, message = self._perform_main_submission(submission_instance, parsed_data) if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs) + self._save_and_upload_results(updated_dfs, expression_alteration_df) else: return self.delete_actions(self.submission_envelope_id, submission_instance, None) def _parse_spreadsheet(self, parser): - """Parse the spreadsheet into different sections.""" - expression_alterations, expression_alterations_df = parser.get_expression_alterations( - 'Expression alteration strategy', self.action, self.validation_errors - ) - cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( - 'Cell line', self.action, self.validation_errors - ) - differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action, self.validation_errors - ) - merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) + try: + """Parse the spreadsheet into different sections.""" + expression_alterations, expression_alterations_df = parser.get_expression_alterations( + 'Expression alteration strategy', self.action, self.validation_errors + ) + cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( + 'Cell line', self.action, self.validation_errors + ) + differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( + 'Differentiated cell line', self.action, self.validation_errors + ) + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) - library_preparations, library_preparations_df = parser.get_library_preparations( - 'Library preparation', self.action, self.validation_errors - ) - merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, - self.validation_errors) - sequencing_files, sequencing_files_df = parser.get_sequencing_files( - 'Sequence file', 
self.action, self.validation_errors - ) - merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) - - return { - "expression_alterations": expression_alterations, - "expression_alterations_df": expression_alterations_df, - "cell_lines": cell_lines, - "cell_lines_df": cell_lines_df, - "parent_cell_line_name": parent_cell_line_name, - "differentiated_cell_lines": differentiated_cell_lines, - "differentiated_cell_lines_df": differentiated_cell_lines_df, - "library_preparations": library_preparations, - "library_preparations_df": library_preparations_df, - "sequencing_files": sequencing_files, - "sequencing_files_df": sequencing_files_df, - } + library_preparations, library_preparations_df = parser.get_library_preparations( + 'Library preparation', self.action, self.validation_errors + ) + merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, + self.validation_errors) + sequencing_files, sequencing_files_df = parser.get_sequencing_files( + 'Sequence file', self.action, self.validation_errors + ) + merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) + + return { + "expression_alterations": expression_alterations, + "expression_alterations_df": expression_alterations_df, + "cell_lines": cell_lines, + "cell_lines_df": cell_lines_df, + "parent_cell_line_name": parent_cell_line_name, + "differentiated_cell_lines": differentiated_cell_lines, + "differentiated_cell_lines_df": differentiated_cell_lines_df, + "library_preparations": library_preparations, + "library_preparations_df": library_preparations_df, + "sequencing_files": sequencing_files, + "sequencing_files_df": sequencing_files_df, + } + except Exception: + self.validation_errors.append(f"Spreadsheet is invalid {self.file}") + return None def _validate_and_upload(self, parsed_data, submission_instance, list_of_files_in_upload_area): """Validate the parsed data and 
upload the file.""" # validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset) - upload_instance = CmdUpload(self.aws, self.args) - - if self.validation_errors: - raise ValidationError(self.validation_errors) + try: + # exit now if there are validation errors in the spreadsheet + if self.validation_errors: + raise ValidationError(self.validation_errors) + except ValidationError as e: + # Print the error message + print(e) + # Exit the program with a non-zero status code to indicate an error + sys.exit(1) print(f"File {self.file} is validated successfully. Initiating submission") print(f"File {self.file} being uploaded to storage") + + upload_instance = CmdUpload(self.aws, self.args) upload_instance.upload_file(self.dataset, self.file, os.path.basename(self.file)) def _is_add_action(self): @@ -210,12 +260,12 @@ def _is_add_action(self): return self.action.lower() == 'add' def _is_modify_action(self): - """Check if the current action is 'ADD'.""" + """Check if the current action is 'MODIFY'.""" return self.action.lower() == 'modify' def _create_submission_envelope(self, submission_instance): """Create a new submission envelope.""" - submission_envelope_response, status_code = submission_instance.create_new_submission_envelope( + submission_envelope_response, status_code = create_new_submission_envelope( self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token ) if status_code in (200, 201): @@ -235,7 +285,7 @@ def _submit_expression_alterations(self, submission_instance, parsed_data): """Submit expression alterations.""" return create_expression_alterations( submission_instance, self.submission_envelope_id, self.access_token, - parsed_data['expression_alterations'] + parsed_data ) def _perform_main_submission(self, submission_instance, parsed_data): @@ -249,15 +299,17 @@ def _perform_main_submission(self, submission_instance, parsed_data): ) return updated_dfs, message - def _save_and_upload_results(self, 
updated_dfs): + def _save_and_upload_results(self, updated_dfs, expression_alteration_df): """Save the updated dataframes and upload the results.""" - output_file = "submission-result.xlsx" + current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + output_file = f"submission_result_{current_time}.xlsx" try: with pd.ExcelWriter(output_file, engine='openpyxl') as writer: updated_dfs[0].to_excel(writer, sheet_name='Cell line', index=False) updated_dfs[1].to_excel(writer, sheet_name='Differentiated cell line', index=False) updated_dfs[2].to_excel(writer, sheet_name='Library preparation', index=False) updated_dfs[3].to_excel(writer, sheet_name='Sequence file', index=False) + expression_alteration_df.to_excel(writer, sheet_name='Expression alteration strategy', index=False) if os.path.exists(output_file): CmdUpload(self.aws, self.args).upload_file(self.dataset, output_file, os.path.basename(output_file)) @@ -269,7 +321,6 @@ def _save_and_upload_results(self, updated_dfs): def delete_actions(self, submission_envelope_id, submission_instance, error=None): """Handle actions needed when a submission fails.""" - # TODO: handle validation errors here try: if self._is_add_action(): self._handle_add_action_failure(submission_envelope_id, submission_instance, error) diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 4b4c2e7..dcdc036 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -526,6 +526,7 @@ def parse_cell_lines(self, sheet_name, action, errors): """ df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) df.columns = df.columns.str.strip() + parent_cell_line_names = [] # Check if the required column exists if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: From 2a3946ada7a550a66ffc0d249c237e6d3c62ba7d Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 14 Aug 2024 12:21:10 +0100 Subject: [PATCH 33/55] file validation errors appended to 
validation errors list --- ait/commons/util/command/submit_file.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index e3e0118..e1c6ccb 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -17,7 +17,7 @@ # Define a class for handling submission of a command file -def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, dataset): +def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, dataset, errors): for sequencing_file in sequencing_files: match_found = False # Flag to indicate if a match is found @@ -27,7 +27,7 @@ def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, da break # Exit the inner loop if a match is found if not match_found: - raise Exception( + errors.append( f"No matching file found for sequencing file: {sequencing_file.file_name} " f"in the upload area for the dataset: {dataset}" ) @@ -238,7 +238,9 @@ def _parse_spreadsheet(self, parser): def _validate_and_upload(self, parsed_data, submission_instance, list_of_files_in_upload_area): """Validate the parsed data and upload the file.""" - # validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset) + validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, + self.validation_errors) + try: # exit now if there are validation errors in the spreadsheet if self.validation_errors: From 94ecc2c4a004791501ff706ce15bedc3228c22ca Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 14 Aug 2024 12:38:13 +0100 Subject: [PATCH 34/55] README.md updated --- README.md | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/README.md b/README.md index 6a0f6b6..0c4f470 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,64 @@ optional arguments: -a 
delete all files from the area -d delete upload area and contents (authorised users only) ``` +## Performing a submission +### Authenticate +```shell script +$ morphic-util config username password + +positional arguments: + username AWS Cognito username + password AWS Cognito password +``` +### Create your study +```shell script +positional arguments: +$ morphic-util submit --type study --file + + --type type of metadata being submitted (here it is study) + --file path to the file containing the metadata +``` +### Create your dataset and link it to your study +```shell script +positional arguments: +$ morphic-util submit --type dataset --file --study + + --type type of metadata being submitted (here it is dataset) + --file path to the file containing the metadata (optional) + --study STUDY_ID obtained in the last step +``` +### `select` your upload area to upload your data files (the upload area name is same as your DATASET_ID) +Show or select the data file upload area +```shell script +$ morphic-util select AREA + +positional arguments: + AREA upload area name (same as DATASET_ID obtained in the last step). +``` +### `upload` your data files +Upload files to the selected area for the dataset +```shell script +$ morphic-util upload PATH [PATH ...] 
[-o] + +positional arguments: + PATH valid file or directory + +optional arguments: + -o overwrite files with same names +``` +### `list` uploaded data files to verify that data file upload has been successful +```shell script +$ morphic-util list +``` +### `submit-file` command to submit your dataset metadata containing your biomaterials, processes, protocols and files +```shell script +positional arguments: +$ morphic-util submit-file --file --action --dataset + + --file path to the file containing the metadata + --action ADD, MODIFY or DELETE based on the type of submission + --dataset the identifier for the analysis +``` # Developers From 37c1722b00656598fc2829b5f0b38ecc86359b80 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 14 Aug 2024 12:42:16 +0100 Subject: [PATCH 35/55] update version --- ait/commons/util/settings/morphic_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 42d7519..dc856ab 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.17' +VERSION = '0.0.18' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From e7f68b442ff8c077266e10cd3bffd216f1e79e13 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 19 Aug 2024 08:48:23 +0100 Subject: [PATCH 36/55] align data format with schema and update version --- ait/commons/util/command/submit_file.py | 3 +- ait/commons/util/settings/morphic_util.py | 2 +- ait/commons/util/spreadsheet_util.py | 169 +++++++++++++--------- 3 files changed, 106 insertions(+), 68 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index e1c6ccb..0e31849 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -237,9 
+237,10 @@ def _parse_spreadsheet(self, parser): return None def _validate_and_upload(self, parsed_data, submission_instance, list_of_files_in_upload_area): - """Validate the parsed data and upload the file.""" + """Validate the parsed data and upload the file. validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, self.validation_errors) + """ try: # exit now if there are validation errors in the spreadsheet diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index dc856ab..b45f0af 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.18' +VERSION = '0.0.19' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index dcdc036..a7863d7 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -57,17 +57,26 @@ def __repr__(self): return json.dumps(self.to_dict(), indent=2) def to_dict(self): + content = { + "label": self.biomaterial_id, + "description": self.description, + "derived_from_cell_line": self.derived_from_accession, + "zygosity": self.zygosity, + "type": self.cell_type + } + + # Only add optional/custom fields if they are provided + if self.clone_id: + content["clone_id"] = self.clone_id # Not in schema, custom field + + if self.protocol_id: + content["protocol_id"] = self.protocol_id # Not in schema, custom field + + if self.expression_alteration_id: + content["expression_alteration_id"] = self.expression_alteration_id # Not in schema, custom field + return { - "content": { - "biomaterial_id": self.biomaterial_id, - "description": self.description, - "derived_from_accession": self.derived_from_accession, - "clone_id": self.clone_id, - "protocol_id": 
self.protocol_id, - "zygosity": self.zygosity, - "cell_type": self.cell_type, - "expression_alteration_id": self.expression_alteration_id - } + "content": content } @@ -94,7 +103,7 @@ def __repr__(self): def to_dict(self): return { "content": { - "expression_alteration_id": self.expression_alteration_id, + "label": self.expression_alteration_id, "protocol_id": self.protocol_id, "allele_specific": self.allele_specific, "altered_gene_symbols": self.altered_gene_symbols, @@ -131,17 +140,24 @@ def __repr__(self): return json.dumps(self.to_dict(), indent=2) def to_dict(self): + content = { + "label": self.biomaterial_id, + "description": self.description, + "timepoint_value": self.timepoint_value, + "timepoint_unit": self.timepoint_unit, + "terminally_differentiated": self.terminally_differentiated, + "model_system": self.model_system + } + + # Only add optional/custom fields if they are provided + if self.input_biomaterial_id: + content["input_biomaterial_id"] = self.input_biomaterial_id # Not in schema, custom field + + if self.protocol_id: + content["protocol_id"] = self.protocol_id # Not in schema, custom field + return { - "content": { - "biomaterial_id": self.biomaterial_id, - "description": self.description, - "input_biomaterial_id": self.input_biomaterial_id, - "protocol_id": self.protocol_id, - "timepoint_value": self.timepoint_value, - "timepoint_unit": self.timepoint_unit, - "terminally_differentiated": self.terminally_differentiated, - "model_system": self.model_system - } + "content": content } @@ -173,29 +189,35 @@ def __repr__(self): return json.dumps(self.to_dict(), indent=2) def to_dict(self): - # Replace NaN values and out-of-range float values with None - def convert_to_valid_json_value(obj): - if isinstance(obj, float): - if np.isnan(obj) or not np.isfinite(obj): - return None - return obj + # Helper function to handle invalid JSON values + def convert_to_valid_json_value(value): + if isinstance(value, float) and (np.isnan(value) or not 
np.isfinite(value)): + return None + return value + + content = { + "label": self.biomaterial_id, + "average_fragment_size": convert_to_valid_json_value(self.average_fragment_size), + "input_amount_value": convert_to_valid_json_value(self.input_amount_value), + "input_amount_unit": self.input_amount_unit, + "total_yield_value": convert_to_valid_json_value(self.final_yield_value), + "total_yield_unit": self.final_yield_unit, + "concentration_value": convert_to_valid_json_value(self.concentration_value), + "concentration_unit": self.concentration_unit, + "pcr_cycles": self.pcr_cycles, + "pcr_cycles_for_sample_index": convert_to_valid_json_value(self.pcr_cycles_for_sample_index) + } + + # Add optional/custom fields if they are provided + if self.protocol_id: + content["protocol_id"] = self.protocol_id # Not in schema, custom field + if self.dissociation_protocol_id: + content["dissociation_protocol_id"] = self.dissociation_protocol_id # Not in schema, custom field + if self.differentiated_biomaterial_id: + content["differentiated_biomaterial_id"] = self.differentiated_biomaterial_id # Not in schema, custom field return { - "content": { - "biomaterial_id": self.biomaterial_id, - "protocol_id": self.protocol_id, - "dissociation_protocol_id": self.dissociation_protocol_id, - "differentiated_biomaterial_id": self.differentiated_biomaterial_id, - "average_fragment_size": convert_to_valid_json_value(self.average_fragment_size), - "input_amount_value": convert_to_valid_json_value(self.input_amount_value), - "input_amount_unit": self.input_amount_unit, - "final_yield_value": convert_to_valid_json_value(self.final_yield_value), - "final_yield_unit": self.final_yield_unit, - "concentration_value": convert_to_valid_json_value(self.concentration_value), - "concentration_unit": self.concentration_unit, - "pcr_cycles": self.pcr_cycles, - "pcr_cycles_for_sample_index": convert_to_valid_json_value(self.pcr_cycles_for_sample_index) - } + "content": content } @@ -204,39 +226,50 @@ 
class EntityType: class SequencingFile: - def __init__(self, file_name, library_preparation_id, sequencing_protocol_id, read_index, run_id, id): + def __init__(self, file_name, extension, read_index, lane_index=None, read_length=None, checksum=None, + library_preparation_id=None, sequencing_protocol_id=None, run_id=None, id=None): self.file_name = file_name - self.library_preparation_id = library_preparation_id - self.sequencing_protocol_id = sequencing_protocol_id + self.extension = extension self.read_index = read_index - self.run_id = run_id - self.entity_type = EntityType.FILE - self.id = id - self.content = { - "file_name": self.file_name, - "library_preparation_id": self.library_preparation_id, - "sequencing_protocol_id": self.sequencing_protocol_id, - "read_index": self.read_index, - "run_id": self.run_id - } - self.set_file_name(file_name) - self.init_file() + self.lane_index = lane_index + self.read_length = read_length + self.checksum = checksum + self.library_preparation_id = library_preparation_id # Custom field + self.sequencing_protocol_id = sequencing_protocol_id # Custom field + self.run_id = run_id # Custom field + self.id = id # Custom field def __repr__(self): return json.dumps(self.to_dict(), indent=2) def to_dict(self): - return { - "content": self.content, - "fileName": self.file_name + # Helper function to handle invalid JSON values + def convert_to_valid_json_value(value): + if isinstance(value, float) and (np.isnan(value) or not np.isfinite(value)): + return None + return value + + content = { + "label": self.file_name, + "extension": self.extension, + "read_index": self.read_index, + "lane_index": convert_to_valid_json_value(self.lane_index), + "read_length": convert_to_valid_json_value(self.read_length), + "checksum": self.checksum } - def set_file_name(self, file_name): - self.file_name = file_name + # Add optional/custom fields if they are provided + if self.library_preparation_id: + content["library_preparation_id"] = 
self.library_preparation_id # Not in schema, custom field + if self.sequencing_protocol_id: + content["sequencing_protocol_id"] = self.sequencing_protocol_id # Not in schema, custom field + if self.run_id: + content["run_id"] = self.run_id # Not in schema, custom field - def init_file(self): - # Placeholder for any initialization logic required for the file - pass + return { + "content": content, + "fileName": self.file_name + } def find_orphans(source_entities, target_entities, @@ -837,9 +870,13 @@ def parse_sequencing_files(self, sheet_name, action, errors): sequencing_files.append( SequencingFile( file_name=file_name, + extension=None, + read_index=read_index, + lane_index=None, + read_length=None, + checksum=None, library_preparation_id=library_preparation_id, sequencing_protocol_id=sequencing_protocol_id, - read_index=read_index, run_id=row.get('sequence_file.run_id'), id=row.get('Id') ) From 7ff442abab3e6e7be9d0b4e8018e4c2470baa697 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 19 Aug 2024 09:03:01 +0100 Subject: [PATCH 37/55] validate sequencing files uncommented --- ait/commons/util/command/submit_file.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 0e31849..9c6df64 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -237,10 +237,9 @@ def _parse_spreadsheet(self, parser): return None def _validate_and_upload(self, parsed_data, submission_instance, list_of_files_in_upload_area): - """Validate the parsed data and upload the file. + # Validate the parsed data and upload the file. 
validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, self.validation_errors) - """ try: # exit now if there are validation errors in the spreadsheet From c6f7d53d43713866a77e2df31e4f3bff97947b24 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 19 Aug 2024 09:04:31 +0100 Subject: [PATCH 38/55] incr version --- ait/commons/util/settings/morphic_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index b45f0af..c7433af 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.19' +VERSION = '0.0.20' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From 92ad56bc43cc241f35948df9c6fd472435afc489 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 19 Aug 2024 09:23:53 +0100 Subject: [PATCH 39/55] incr version to 0.0.21 for test pypi issues --- ait/commons/util/settings/morphic_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index c7433af..fefb993 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.20' +VERSION = '0.0.21' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From 2adb8d21bae426794e2766c7b95e10f71f5ece78 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 19 Aug 2024 13:10:10 +0100 Subject: [PATCH 40/55] incr version to 1.0.0 for new major version with submission support --- ait/commons/util/settings/morphic_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index fefb993..2655adf 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '0.0.21' +VERSION = '1.0.0' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From 8001ad1117020b476d9c3333ca7ecb9db6c231a0 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Sat, 31 Aug 2024 12:32:28 +0100 Subject: [PATCH 41/55] code clean-up --- ait/commons/util/command/submit.py | 209 +++++++++++++++--------- ait/commons/util/command/submit_file.py | 176 +++++++++++++++----- ait/commons/util/spreadsheet_util.py | 36 +++- 3 files changed, 298 insertions(+), 123 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 34d1794..397290e 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -11,7 +11,7 @@ from ait.commons.util.provider_api_util import APIProvider -def equality(cell_line, expression_alteration): +def matching_expression_alteration_and_cell_line(cell_line, expression_alteration): return expression_alteration.expression_alteration_id.replace(" ", "").strip() == cell_line.expression_alteration_id.replace( " ", "").strip() @@ -273,8 +273,9 @@ def create_cell_line_entity(self, cell_line, expression_alterations, submission_ access_token ) - self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, - expression_alterations) + if expression_alterations is not None: + self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, + expression_alterations) print(f"Linking Cell Line Biomaterial: {cell_line.biomaterial_id} to dataset {dataset_id}") @@ -286,7 +287,7 @@ def link_cell_line_with_expression_alterations(self, access_token, cell_line, ce 
expression_alterations): for expression_alteration in expression_alterations: if cell_line.expression_alteration_id is not None: - if equality(cell_line, expression_alteration): + if matching_expression_alteration_and_cell_line(cell_line, expression_alteration): print(f"Linking cell line {cell_line_entity_id} " f"as derived by process of {expression_alteration.expression_alteration_id}") @@ -337,57 +338,121 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce return differentiated_cell_line_id def create_differentiated_cell_line_entity(self, access_token, cell_line_entity_id, dataset_id, - differentiated_cell_line, - submission_envelope_id): - print("Cell line has differentiated cell lines, creating differentiation process to link them") - - differentiation_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('differentiation'), - submission_envelope_id) - print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " - f"as a child of Cell line: {cell_line_entity_id}") - - differentiated_entity_id = self.create_child_biomaterial( - cell_line_entity_id, - differentiated_cell_line.to_dict(), - access_token - ) + differentiated_cell_line, submission_envelope_id): + """ + Creates a Differentiated Cell Line entity and links it to the submission envelope. - print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") - print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"to envelope: {submission_envelope_id}") + Parameters: + ----------- + access_token : str + The authentication token. + cell_line_entity_id : str + The ID of the original cell line entity. + dataset_id : str + The dataset ID to link with. + differentiated_cell_line : object + The differentiated cell line object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. 
- self.link_entity_to_envelope( - 'biomaterial', - differentiated_entity_id, - submission_envelope_id, - access_token - ) + Returns: + -------- + str + The ID of the created differentiated cell line entity. + """ + + # Create the differentiated cell line biomaterial + if cell_line_entity_id is not None: + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " + f"as a child of Cell line: {cell_line_entity_id}") + differentiated_entity_id = self.create_child_biomaterial( + cell_line_entity_id, + differentiated_cell_line.to_dict(), + access_token + ) + + print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to envelope: {submission_envelope_id}") + + # Link the differentiated cell line entity to the submission envelope + self.link_entity_to_envelope( + 'biomaterial', + differentiated_entity_id, + submission_envelope_id, + access_token + ) + else: + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id}") + differentiated_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + differentiated_cell_line.to_dict(), + submission_envelope_id, + access_token + ) print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " f"to dataset: {dataset_id}") + # Link the differentiated cell line to the dataset self.link_to_dataset('biomaterial', dataset_id, differentiated_entity_id, access_token) - print(f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " - f"input to process : {differentiation_process_entity_id}") + return differentiated_entity_id - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_entity_id, differentiated_entity_id, + 
dataset_id, submission_envelope_id, action): + """ + Creates and links the differentiation process between the original cell line and the differentiated cell line. - print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") + Parameters: + ----------- + access_token : str + The authentication token. + cell_line_entity_id : str + The ID of the original cell line entity. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + Returns: + -------- + str + The ID of the differentiation process entity created. + """ + if action.lower() != 'modify': + print("Cell line has differentiated cell lines, creating differentiation process to link them") + + # Create a differentiation process entity + differentiation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('differentiation'), + submission_envelope_id + ) - return differentiated_entity_id + print( + f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") + + # Link the cell line entity as input to the differentiation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") + + # Link the differentiated cell line entity as derived by the differentiation process + self.perform_hal_linkage( + 
f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + return differentiation_process_entity_id def handle_library_preparation(self, differentiated_entity_id, library_preparation, library_preparations_df, submission_envelope_id, @@ -577,6 +642,7 @@ def multi_type_submission(self, cell_lines, expression_alterations, cell_lines_df, + differentiated_cell_lines, differentiated_cell_lines_df, library_preparations_df, sequencing_file_df, @@ -605,45 +671,38 @@ def multi_type_submission(self, try: for cell_line in cell_lines: - cell_line_entity_id = self.handle_cell_line(cell_line, - expression_alterations, - cell_lines_df, + if cell_line.id is not None: + cell_line_entity_id = cell_line.id + + for differentiated_cell_line in cell_line.differentiated_cell_lines: + differentiated_cell_line_entity_id = differentiated_cell_line.id + + self.link_cell_line_and_differentiated_cell_line(access_token, cell_line_entity_id, + differentiated_cell_line_entity_id, + dataset_id, submission_envelope_id + , action) + + for library_preparation in differentiated_cell_line.library_preparations: + library_preparation_entity_id = self.handle_library_preparation( + differentiated_cell_line_entity_id, + library_preparation, + library_preparations_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors) + + for sequencing_file in library_preparation.sequencing_files: + self.handle_sequencing_file(library_preparation_entity_id, + sequencing_file, + sequencing_file_df, submission_envelope_id, dataset_id, access_token, action, errors) - for differentiated_cell_line in cell_line.differentiated_cell_lines: - differentiated_entity_id = self.handle_differentiated_cell_line(cell_line_entity_id, - differentiated_cell_line, - differentiated_cell_lines_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - - for library_preparation in 
differentiated_cell_line.library_preparations: - library_preparation_entity_id = self.handle_library_preparation(differentiated_entity_id, - library_preparation, - library_preparations_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - - for sequencing_file in library_preparation.sequencing_files: - self.handle_sequencing_file(library_preparation_entity_id, - sequencing_file, - sequencing_file_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - message = 'SUCCESS' except Exception as e: message = f"An error occurred: {str(e)}" diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 9c6df64..f3d7f6a 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -37,10 +37,9 @@ def get_content(unique_value): return {"content": unique_value} -def create_expression_alterations(submission_instance, submission_envelope_id, access_token, parsed_data): - expression_alterations = parsed_data['expression_alterations'] - expression_alterations_df = parsed_data['expression_alterations_df'] - +def _create_expression_alterations(submission_instance, submission_envelope_id, access_token, + expression_alterations, + expression_alterations_df): expression_alterations_entity_id_column_name = "Id" if expression_alterations_entity_id_column_name not in expression_alterations_df.columns: @@ -168,32 +167,79 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) """Process the file submission.""" parser = SpreadsheetSubmitter(self.file) parsed_data = self._parse_spreadsheet(parser) - self._validate_and_upload(parsed_data, submission_instance, list_of_files_in_upload_area) - # original expression alteration data frame - expression_alteration_df = parsed_data['expression_alterations_df'] + self._validate_and_upload(parsed_data, list_of_files_in_upload_area) + expression_alterations = 
parsed_data['expression_alterations'] + expression_alterations_df = parsed_data['expression_alterations_df'] parent_cell_line_name = parsed_data['parent_cell_line_name'] + cell_lines = parsed_data['cell_lines'] + cell_lines_df = parsed_data['cell_lines_df'] + differentiated_cell_lines = parsed_data['differentiated_cell_lines'] + differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] # TODO: Handle expression alterations in MODIFY if self._is_add_action(): self._create_submission_envelope(submission_instance) - print(f"Creating parental cell line with name {parent_cell_line_name}") - parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + parent_cell_line_id = None + created_expression_alterations = [] - print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + if parent_cell_line_name is not None: + print(f"Creating parental cell line with name {parent_cell_line_name}") + parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + # TODO: link parental cell line with dataset - created_expression_alterations, expression_alteration_df = self._submit_expression_alterations( - submission_instance, parsed_data) - self.link_parent_cell_line_expression_alteration( - submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations - ) + print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") - updated_dfs, message = self._perform_main_submission(submission_instance, parsed_data) + if expression_alterations and expression_alterations_df is not None: + created_expression_alterations, expression_alteration_df = self._submit_expression_alterations( + submission_instance, expression_alterations, expression_alterations_df) - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alteration_df) - else: - return 
self.delete_actions(self.submission_envelope_id, submission_instance, None) + if created_expression_alterations and parent_cell_line_id is not None: + self.link_parent_cell_line_expression_alteration( + submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + ) + + created_cell_lines = [] + + if cell_lines and cell_lines_df is not None: + created_cell_lines, cell_lines_df = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df) + + created_differentiated_cell_lines = [] + + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + + updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, + cell_lines_df, created_differentiated_cell_lines, + differentiated_cell_lines_df, parsed_data) + + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, expression_alterations_df) + else: + return self.delete_actions(self.submission_envelope_id, submission_instance, None) + elif self._is_modify_action(): + created_cell_lines = [] + + if cell_lines and cell_lines_df is not None: + created_cell_lines, cell_lines_df = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df) + + created_differentiated_cell_lines = [] + + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + + updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, + cell_lines_df, created_differentiated_cell_lines, + differentiated_cell_lines_df, parsed_data) + + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, 
expression_alterations_df) + else: + return self.delete_actions(self.submission_envelope_id, submission_instance, None) def _parse_spreadsheet(self, parser): try: @@ -236,20 +282,39 @@ def _parse_spreadsheet(self, parser): self.validation_errors.append(f"Spreadsheet is invalid {self.file}") return None - def _validate_and_upload(self, parsed_data, submission_instance, list_of_files_in_upload_area): + def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): + """ # Validate the parsed data and upload the file. validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, self.validation_errors) - + """ + """ + Handle validation errors, including interacting with the user in case of a missing sheet. + """ try: - # exit now if there are validation errors in the spreadsheet + # Exit now if there are validation errors in the spreadsheet if self.validation_errors: raise ValidationError(self.validation_errors) except ValidationError as e: - # Print the error message - print(e) - # Exit the program with a non-zero status code to indicate an error - sys.exit(1) + # Check if the error is related to a missing sheet + missing_sheet_errors = [msg for msg in self.validation_errors if "Missing sheet" in msg] + + if missing_sheet_errors: + # Extract the sheet name(s) from the errors + missing_sheets = ', '.join([msg.split("'")[1] for msg in missing_sheet_errors]) + # Ask the user whether to proceed + user_response = input( + f"A required sheet '{missing_sheets}' is missing. Do you want to proceed anyway? (yes/no): ").strip().lower() + if user_response == 'yes': + print("Proceeding with execution...") + else: + print("Execution terminated due to missing required sheet.") + sys.exit(1) + else: + # Print the error message + print(e) + # Exit the program with a non-zero status code to indicate an error + sys.exit(1) print(f"File {self.file} is validated successfully. 
Initiating submission") print(f"File {self.file} being uploaded to storage") @@ -283,19 +348,45 @@ def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): self.submission_envelope_id, self.access_token ) - def _submit_expression_alterations(self, submission_instance, parsed_data): + def _submit_expression_alterations(self, submission_instance, expression_alterations, expression_alterations_df): """Submit expression alterations.""" - return create_expression_alterations( + return _create_expression_alterations( submission_instance, self.submission_envelope_id, self.access_token, - parsed_data + expression_alterations, expression_alterations_df ) - def _perform_main_submission(self, submission_instance, parsed_data): + def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df): + for cell_line in cell_lines: + cell_line_entity_id = submission_instance.handle_cell_line(cell_line, None, cell_lines_df, + self.submission_envelope_id, self.dataset, + self.access_token, self.action, + self.submission_errors) + cell_line.id = cell_line_entity_id + + return cell_lines, cell_lines_df + + def _create_differentiated_cell_lines(self, submission_instance, differentiated_cell_lines, + differentiated_cell_lines_df): + for differentiated_cell_line in differentiated_cell_lines: + differentiated_cell_line_entity_id = submission_instance.handle_differentiated_cell_line(None, + differentiated_cell_line, + differentiated_cell_lines_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + differentiated_cell_line.id = differentiated_cell_line_entity_id + + return differentiated_cell_lines, differentiated_cell_lines_df + + def _perform_main_submission(self, submission_instance, created_cell_lines, cell_lines_df, + created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data): """Perform the main submission.""" # Unpack the returned values into a list and the message 
separately updated_dfs, message = submission_instance.multi_type_submission( - parsed_data['cell_lines'], parsed_data['expression_alterations'], parsed_data['cell_lines_df'], - parsed_data['differentiated_cell_lines_df'], parsed_data['library_preparations_df'], + created_cell_lines, parsed_data['expression_alterations'], cell_lines_df, + created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data['library_preparations_df'], parsed_data['sequencing_files_df'], self.submission_envelope_id, self.dataset, self.access_token, self.action, self.submission_errors ) @@ -306,13 +397,20 @@ def _save_and_upload_results(self, updated_dfs, expression_alteration_df): current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') output_file = f"submission_result_{current_time}.xlsx" try: + # List of updated DataFrames and corresponding sheet names + dataframes = [ + (updated_dfs[0], 'Cell line'), + (updated_dfs[1], 'Differentiated cell line'), + (updated_dfs[2], 'Library preparation'), + (updated_dfs[3], 'Sequence file'), + (expression_alteration_df, 'Expression alteration strategy') + ] + + # Create the Excel file and write only non-null DataFrames with pd.ExcelWriter(output_file, engine='openpyxl') as writer: - updated_dfs[0].to_excel(writer, sheet_name='Cell line', index=False) - updated_dfs[1].to_excel(writer, sheet_name='Differentiated cell line', index=False) - updated_dfs[2].to_excel(writer, sheet_name='Library preparation', index=False) - updated_dfs[3].to_excel(writer, sheet_name='Sequence file', index=False) - expression_alteration_df.to_excel(writer, sheet_name='Expression alteration strategy', index=False) - + for df, sheet_name in dataframes: + if df is not None: # Check if the DataFrame is not None + df.to_excel(writer, sheet_name=sheet_name, index=False) if os.path.exists(output_file): CmdUpload(self.aws, self.args).upload_file(self.dataset, output_file, os.path.basename(output_file)) print(f"File {output_file} uploaded successfully.") diff --git 
a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index a7863d7..28dad58 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -892,14 +892,29 @@ def parse_expression_alteration(self, sheet_name, action, errors): ----------- sheet_name : str The name of the sheet containing expression alterations data. + action : str + The action to be performed on the data. + errors : list + A list to accumulate error messages. Returns: -------- - list - A list of ExpressionAlterationStrategy objects parsed from the specified sheet. + tuple + A tuple containing: + - A list of ExpressionAlterationStrategy objects parsed from the specified sheet (if valid) + - The filtered DataFrame of the parsed data + - A boolean indicating whether the expression alteration strategy sheet exists and is valid """ - df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + # Attempt to parse the input file into a DataFrame + try: + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + except Exception as e: + errors.append(f"Missing sheet '{sheet_name}': {e}") + return [], None + + # Strip whitespace from column names df.columns = df.columns.str.strip() + # Check if the required column exists required_columns = ['expression_alteration_id'] missing_columns = [col for col in required_columns if col not in df.columns] @@ -907,23 +922,26 @@ def parse_expression_alteration(self, sheet_name, action, errors): if missing_columns: errors.append( f"The following required columns are missing in the Expression Alteration Strategy sheet: {', '.join(missing_columns)}") - return [], df # Return early if required columns are missing + return None, df, False # Return if required columns are missing # Filter rows where 'expression_alteration_id' is not null df = df[df['expression_alteration_id'].notna()] + # Replace invalid float values (e.g., NaN, infinite) with None df = df.map(lambda x: None if 
isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) - # Define unwanted patterns + + # Define unwanted patterns to filter out unwanted rows unwanted_patterns = ( 'FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the gene expression alteration instance..', 'ID should have no spaces. For example: JAXPE0001_MEIS1, MSKKI119_MEF2C, NWU_AID' ) + # Create a mask to filter out rows with unwanted starting values mask = df['expression_alteration_id'].astype(str).str.startswith(unwanted_patterns) df_filtered = df[~mask] - # Create ExpressionAlterationStrategy objects + # Initialize the list of ExpressionAlterationStrategy objects expression_alterations = [] for _, row in df_filtered.iterrows(): @@ -944,6 +962,7 @@ def parse_expression_alteration(self, sheet_name, action, errors): ) ) + # Return the list of objects, the filtered DataFrame, and a flag indicating success return expression_alterations, df_filtered def get_cell_lines(self, sheet_name, action, errors): @@ -981,9 +1000,8 @@ def get_differentiated_cell_lines(self, sheet_name, action, errors): list A list of DifferentiatedCellLine objects parsed from the specified sheet. """ - differentiated_cell_lines, differentiated_cell_lines_df = (self. 
- parse_differentiated_cell_lines - (sheet_name, action, errors)) + differentiated_cell_lines, differentiated_cell_lines_df = self.parse_differentiated_cell_lines(sheet_name, + action, errors) return differentiated_cell_lines, differentiated_cell_lines_df def get_library_preparations(self, sheet_name, action, errors): From 1940cdd040dec66b56d0cdef6a05faa93ad925d4 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 2 Sep 2024 14:42:44 +0100 Subject: [PATCH 42/55] code improvements --- ait/commons/util/command/submit.py | 415 ++++++++++++++++-------- ait/commons/util/command/submit_file.py | 138 +++++--- ait/commons/util/command/view.py | 16 +- ait/commons/util/provider_api_util.py | 22 +- 4 files changed, 389 insertions(+), 202 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 397290e..0905775 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -216,7 +216,8 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, submission_envelope_id, dataset_id, + def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, + submission_envelope_id, dataset_id, access_token, action, errors): """ Submits a cell line as a biomaterial entity to a specified submission envelope. @@ -233,7 +234,7 @@ def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, sub - cell_line_entity_id: Entity ID of the submitted or modified cell line biomaterial. 
""" if action.lower() == 'modify': - success = self.patchEntity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) + success = self.patch_entity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) if success: print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, @@ -313,9 +314,9 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce - access_token: Access token for authentication and authorization. """ if action.lower() == 'modify': - success = self.patchEntity('biomaterial', differentiated_cell_line.id, - differentiated_cell_line.to_dict(), - access_token) + success = self.patch_entity('biomaterial', differentiated_cell_line.id, + differentiated_cell_line.to_dict(), + access_token) if success: print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") @@ -424,7 +425,8 @@ def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_en The ID of the differentiation process entity created. """ if action.lower() != 'modify': - print("Cell line has differentiated cell lines, creating differentiation process to link them") + print(f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " + f"to link them") # Create a differentiation process entity differentiation_process_entity_id = self.create_process( @@ -470,9 +472,9 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati - access_token: Access token for authentication and authorization. 
""" if action.lower() == 'modify': - success = self.patchEntity('biomaterial', library_preparation.id, - library_preparation.to_dict(), - access_token) + success = self.patch_entity('biomaterial', library_preparation.id, + library_preparation.to_dict(), + access_token) if success: print(f"Updated library preparation biomaterial: {library_preparation.id} / " f"{library_preparation.biomaterial_id}") @@ -498,54 +500,128 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati def create_library_preparation_entity(self, access_token, dataset_id, differentiated_entity_id, library_preparation, submission_envelope_id): - print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " - f"{differentiated_entity_id}") + """ + Creates a Library Preparation entity for the Differentiated Cell Line and links it to the submission envelope and dataset. - library_preparation_entity_id = self.create_child_biomaterial( - differentiated_entity_id, - library_preparation.to_dict(), - access_token - ) + Parameters: + ----------- + access_token : str + The authentication token. + dataset_id : str + The dataset ID to link with. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + library_preparation : object + The library preparation object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. - print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to envelope: {submission_envelope_id}") + Returns: + -------- + str + The ID of the created library preparation entity. 
+ """ + if differentiated_entity_id is not None: + print( + f"Creating Library Preparation as child of Differentiated Cell Line Biomaterial: {differentiated_entity_id}") - self.link_entity_to_envelope( - 'biomaterial', - library_preparation_entity_id, - submission_envelope_id, - access_token - ) + # Create the library preparation biomaterial + library_preparation_entity_id = self.create_child_biomaterial( + differentiated_entity_id, + library_preparation.to_dict(), + access_token + ) - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to dataset: {dataset_id}") + print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") - self.link_to_dataset('biomaterial', dataset_id, - library_preparation_entity_id, access_token) + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} to envelope: {submission_envelope_id}") - print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"as input to library preparation process") - - library_preparation_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('library_preparation'), - submission_envelope_id) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) + # Link the library preparation to the submission envelope + self.link_entity_to_envelope( + 'biomaterial', + library_preparation_entity_id, + submission_envelope_id, + access_token + ) + else: + print(f"Creating Library preparation Biomaterial: {library_preparation.biomaterial_id}") + library_preparation_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + library_preparation.to_dict(), + submission_envelope_id, + access_token + ) - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"as derived by library preparation process") + print(f"Linking 
Library Preparation Biomaterial: {library_preparation_entity_id} to dataset: {dataset_id}") - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) + # Link the library preparation to the dataset + self.link_to_dataset('biomaterial', dataset_id, library_preparation_entity_id, access_token) return library_preparation_entity_id + def link_differentiated_and_library_preparation(self, + access_token, + differentiated_entity_id, + library_preparation_entity_id, + dataset_id, + submission_envelope_id, + action): + """ + Links the Differentiated Cell Line to the Library Preparation through a library preparation process. + + Parameters: + ----------- + access_token : str + The authentication token. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + library_preparation_entity_id : str + The ID of the library preparation entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. + + Returns: + -------- + str + The ID of the library preparation process entity created. 
+ """ + if action.lower() != 'modify': + print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " + f"preparation process to link them") + + # Create a library preparation process entity + library_preparation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('library_preparation'), + submission_envelope_id + ) + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " + f"preparation process") + + # Link the differentiated cell line entity as input to the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " + f"preparation process") + + # Link the library preparation entity as derived by the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + return library_preparation_process_entity_id + def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, sequencing_file_df, submission_envelope_id, dataset_id, access_token, action, errors): @@ -561,67 +637,131 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, - access_token: Access token for authentication and authorization. 
""" if action.lower() == 'modify': - success = self.patchEntity('file', sequencing_file.id, - sequencing_file.to_dict(), - access_token) + success = self.patch_entity('file', sequencing_file.id, + sequencing_file.to_dict(), + access_token) if success: - print(f"Updated sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + print(f"Updated sequencing file: {sequencing_file.id} / " + f"{sequencing_file.file_name}") + + update_dataframe(sequencing_file_df, sequencing_file.id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') else: errors.append(f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + + return sequencing_file.id else: - print("Creating sequencing process to link the sequencing file") + sequencing_file_entity_id = self.create_sequencing_file_entity(access_token, + dataset_id, + library_preparation_entity_id, + sequencing_file, + submission_envelope_id) + update_dataframe(sequencing_file_df, sequencing_file_entity_id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') + + return sequencing_file_entity_id + + def create_sequencing_file_entity(self, access_token, dataset_id, library_preparation_entity_id, sequencing_file, + submission_envelope_id): + """ + Creates a Sequencing File entity for the Library Preparation and links it to the submission envelope and dataset. - sequencing_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('sequencing'), - submission_envelope_id) + Parameters: + ----------- + library_preparation_entity_id : str + The ID of the library preparation entity. + sequencing_file : object + The sequencing file object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. + dataset_id : str + The dataset ID to link with. + access_token : str + The authentication token. 
- sequencing_file_entity_id_column_name = "Id" + Returns: + -------- + str + The ID of the created sequencing file entity. + """ - if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: - sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan + print( + f"Creating Sequencing file: {sequencing_file.file_name} as a result of sequencing the Library preparation " + f"biomaterial: {library_preparation_entity_id}") - print(f"Creating Sequencing file: {sequencing_file.file_name} " - f"as a result of sequencing the Library preparation biomaterial: {library_preparation_entity_id}") + sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( + 'file', + sequencing_file.to_dict(), + submission_envelope_id, + access_token + ) - sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( - 'file', - sequencing_file.to_dict(), - submission_envelope_id, - access_token - ) + print(f"Linking sequencing file: {sequencing_file_entity_id} to dataset: {dataset_id}") + + self.link_to_dataset('file', dataset_id, sequencing_file_entity_id, access_token) - print(f"Created Sequencing file: {sequencing_file_entity_id}") + return sequencing_file_entity_id + + def link_library_preparation_and_sequencing_file(self, + access_token, + library_preparation_entity_id, + sequencing_file_entity_id, + dataset_id, + submission_envelope_id, + action): + """ + Links the Library Preparation to the Sequencing File through a sequencing process. + + Parameters: + ----------- + library_preparation_entity_id : str + The ID of the library preparation entity. + sequencing_file_entity_id : str + The ID of the sequencing file entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. + access_token : str + The authentication token. 
- print(f"Linking sequencing file: {sequencing_file_entity_id} to dataset: {dataset_id}") + Returns: + -------- + str + The ID of the sequencing process entity created. + """ + if action.lower() != 'modify': + print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." + f"Creating sequencing process to link the sequencing file") - self.link_to_dataset('file', dataset_id, - sequencing_file_entity_id, access_token) + # Create a sequencing process entity + sequencing_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('sequencing'), + submission_envelope_id) - print(f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " - f"as input to process: {sequencing_process_entity_id}") + print( + f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") + # Link the library preparation entity as input to the sequencing process self.perform_hal_linkage( f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", sequencing_process_entity_id, 'processes', access_token ) - print(f"Linking Sequencing file: {sequencing_file_entity_id} " - f"as derived by process: {sequencing_process_entity_id}") + print( + f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") + # Link the sequencing file entity as derived by the sequencing process self.perform_hal_linkage( f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", sequencing_process_entity_id, 'processes', access_token ) - sequencing_file_df[sequencing_file_entity_id_column_name] = sequencing_file_df[ - sequencing_file_entity_id_column_name].astype(object) - - sequencing_file_df.loc[ - sequencing_file_df['sequence_file.file_core.file_name'] == sequencing_file.file_name, - sequencing_file_entity_id_column_name - ] = sequencing_file_entity_id + return sequencing_process_entity_id 
def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): process_entity_id = self.use_existing_envelope_and_submit_entity( @@ -638,19 +778,20 @@ def create_process(self, access_token, dataset_id, process_data, submission_enve return process_entity_id - def multi_type_submission(self, - cell_lines, - expression_alterations, - cell_lines_df, - differentiated_cell_lines, - differentiated_cell_lines_df, - library_preparations_df, - sequencing_file_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors): + def establish_links(self, + cell_lines, + cell_lines_df, + differentiated_cell_lines, + differentiated_cell_lines_df, + library_preparations, + library_preparations_df, + sequencing_files, + sequencing_files_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Handles the submission of multiple types of biomaterials (cell lines, differentiated cell lines, library preparations) @@ -670,38 +811,34 @@ def multi_type_submission(self, """ try: for cell_line in cell_lines: - - if cell_line.id is not None: - cell_line_entity_id = cell_line.id - - for differentiated_cell_line in cell_line.differentiated_cell_lines: - differentiated_cell_line_entity_id = differentiated_cell_line.id - - self.link_cell_line_and_differentiated_cell_line(access_token, cell_line_entity_id, - differentiated_cell_line_entity_id, - dataset_id, submission_envelope_id - , action) - - for library_preparation in differentiated_cell_line.library_preparations: - library_preparation_entity_id = self.handle_library_preparation( - differentiated_cell_line_entity_id, - library_preparation, - library_preparations_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - - for sequencing_file in library_preparation.sequencing_files: - self.handle_sequencing_file(library_preparation_entity_id, - sequencing_file, - sequencing_file_df, - submission_envelope_id, - dataset_id, - access_token, - action, - 
errors) + for differentiated_cell_line in differentiated_cell_lines: + if cell_line.biomaterial_id == differentiated_cell_line.input_biomaterial_id: + self.link_cell_line_and_differentiated_cell_line(access_token, + cell_line.id, + differentiated_cell_line.id, + dataset_id, + submission_envelope_id, + action) + for differentiated_cell_line in differentiated_cell_lines: + for library_preparation in library_preparations: + if differentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: + self.link_differentiated_and_library_preparation( + access_token, + differentiated_cell_line.id, + library_preparation.id, + dataset_id, + submission_envelope_id, + action) + + for library_preparation in library_preparations: + for sequencing_file in sequencing_files: + if library_preparation.biomaterial_id == sequencing_file.library_preparation_id: + self.link_library_preparation_and_sequencing_file(access_token, + library_preparation.id, + sequencing_file.id, + dataset_id, + submission_envelope_id, + action) message = 'SUCCESS' except Exception as e: @@ -712,12 +849,12 @@ def multi_type_submission(self, cell_lines_df = None differentiated_cell_lines_df = None library_preparations_df = None - sequencing_file_df = None + sequencing_files_df = None return ([cell_lines_df, differentiated_cell_lines_df, library_preparations_df, - sequencing_file_df], message) + sequencing_files_df], message) def typed_submission(self, type, file, access_token): """ @@ -801,7 +938,7 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ return entity_id - def patchEntity(self, input_entity_type, id, data, access_token): + def patch_entity(self, input_entity_type, id, data, access_token): entity_map = { 'study': 'studies', 'dataset': 'datasets', @@ -829,7 +966,7 @@ def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token return False put_url = f"{self.base_url}/datasets/{dataset_id}/{hal_entity}/{entity_id}" - return 
self.provider_api.put_to_provider_api(put_url, access_token) + return self.provider_api.put(put_url, access_token) def patch_to_provider_api(self, entity_patch_url, data, access_token): headers = { @@ -885,7 +1022,7 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): print(f"Linking dataset {dataset_id} to study {study_id}") url = f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) print(f"Dataset linked successfully to study: {study_id}") @@ -901,7 +1038,7 @@ def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): print(f"Linking biomaterial {biomaterial_id} to dataset {dataset_id}") url = f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) print(f"Biomaterial linked successfully to dataset: {dataset_id}") @@ -987,10 +1124,10 @@ def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, acces """ if type == 'biomaterial': url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/biomaterials/{entity_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) elif type == 'file': url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/files/{entity_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) def delete_dataset(self, dataset, access_token): """ @@ -1000,7 +1137,7 @@ def delete_dataset(self, dataset, access_token): dataset (str): The ID of the dataset to delete. access_token (str): Access token for authorization. 
""" - fetched_dataset = self.provider_api.get_to_provider_api(f"{self.base_url}/datasets/{dataset}", access_token) + fetched_dataset = self.provider_api.get(f"{self.base_url}/datasets/{dataset}", access_token) print(f"Dataset fetched successfully: {dataset}") print(f"Initiating delete of {dataset}") @@ -1011,17 +1148,17 @@ def delete_dataset(self, dataset, access_token): print("Deleting Biomaterials:") for biomaterial in biomaterials: print(f"Deleting {biomaterial}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/biomaterials/{biomaterial}", access_token) + self.provider_api.delete(f"{self.base_url}/biomaterials/{biomaterial}", access_token) print("\nDeleting Processes:") for process in processes: print(f"Deleting {process}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/processes/{process}", access_token) + self.provider_api.delete(f"{self.base_url}/processes/{process}", access_token) print("\nDeleting Data Files:") for data_file in data_files: print(f"Deleting {data_file}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/files/{data_file}", access_token) + self.provider_api.delete(f"{self.base_url}/files/{data_file}", access_token) print(f"\nDeleting the dataset: {dataset}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/datasets/{dataset}", access_token) + self.provider_api.delete(f"{self.base_url}/datasets/{dataset}", access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index f3d7f6a..62e986a 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -175,13 +175,18 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) cell_lines_df = parsed_data['cell_lines_df'] differentiated_cell_lines = parsed_data['differentiated_cell_lines'] differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] + library_preparations = parsed_data['library_preparations'] + 
library_preparations_df = parsed_data['library_preparations_df'] + sequencing_files = parsed_data['sequencing_files'] + sequencing_files_df = parsed_data['sequencing_files_df'] # TODO: Handle expression alterations in MODIFY + created_expression_alterations = [] + if self._is_add_action(): - self._create_submission_envelope(submission_instance) + self._create_submission_envelope() parent_cell_line_id = None - created_expression_alterations = [] if parent_cell_line_name is not None: print(f"Creating parental cell line with name {parent_cell_line_name}") @@ -199,47 +204,44 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations ) - created_cell_lines = [] + created_cell_lines = [] - if cell_lines and cell_lines_df is not None: - created_cell_lines, cell_lines_df = self._create_cell_lines( - submission_instance, cell_lines, cell_lines_df) + if cell_lines and cell_lines_df is not None: + created_cell_lines, cell_lines_df = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df, created_expression_alterations) - created_differentiated_cell_lines = [] + created_differentiated_cell_lines = [] - if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( - submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) - updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, - cell_lines_df, created_differentiated_cell_lines, - differentiated_cell_lines_df, parsed_data) + created_library_preparations = 
[] - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) - else: - return self.delete_actions(self.submission_envelope_id, submission_instance, None) - elif self._is_modify_action(): - created_cell_lines = [] - - if cell_lines and cell_lines_df is not None: - created_cell_lines, cell_lines_df = self._create_cell_lines( - submission_instance, cell_lines, cell_lines_df) + if library_preparations and library_preparations_df is not None: + created_library_preparations, library_preparations_df = self._create_library_preparations( + submission_instance, library_preparations, library_preparations_df) - created_differentiated_cell_lines = [] + created_sequencing_files = [] - if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( - submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + if sequencing_files and sequencing_files_df is not None: + created_sequencing_files, sequencing_files_df = self._create_sequencing_files( + submission_instance, sequencing_files, sequencing_files_df) - updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, - cell_lines_df, created_differentiated_cell_lines, - differentiated_cell_lines_df, parsed_data) + updated_dfs, message = self._establish_links(submission_instance, + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df) - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) - else: - return self.delete_actions(self.submission_envelope_id, submission_instance, None) + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, expression_alterations_df) + else: + return 
self.delete_actions(self.submission_envelope_id, submission_instance, None) def _parse_spreadsheet(self, parser): try: @@ -330,7 +332,7 @@ def _is_modify_action(self): """Check if the current action is 'MODIFY'.""" return self.action.lower() == 'modify' - def _create_submission_envelope(self, submission_instance): + def _create_submission_envelope(self): """Create a new submission envelope.""" submission_envelope_response, status_code = create_new_submission_envelope( self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token @@ -355,9 +357,9 @@ def _submit_expression_alterations(self, submission_instance, expression_alterat expression_alterations, expression_alterations_df ) - def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df): + def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df, expression_alterations): for cell_line in cell_lines: - cell_line_entity_id = submission_instance.handle_cell_line(cell_line, None, cell_lines_df, + cell_line_entity_id = submission_instance.handle_cell_line(cell_line, expression_alterations, cell_lines_df, self.submission_envelope_id, self.dataset, self.access_token, self.action, self.submission_errors) @@ -380,16 +382,64 @@ def _create_differentiated_cell_lines(self, submission_instance, differentiated_ return differentiated_cell_lines, differentiated_cell_lines_df - def _perform_main_submission(self, submission_instance, created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data): + def _create_library_preparations(self, submission_instance, library_preparations, + library_preparations_df): + for library_preparation in library_preparations: + library_preparation_entity_id = submission_instance.handle_library_preparation(None, + library_preparation, + library_preparations_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + library_preparation.id = 
library_preparation_entity_id + + return library_preparations, library_preparations_df + + def _create_sequencing_files(self, submission_instance, sequencing_files, + sequencing_files_df): + for sequencing_file in sequencing_files: + sequencing_file_entity_id = submission_instance.handle_sequencing_file(None, + sequencing_file, + sequencing_files_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + sequencing_file.id = sequencing_file_entity_id + + return sequencing_files, sequencing_files_df + + def _establish_links(self, + submission_instance, + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df): """Perform the main submission.""" # Unpack the returned values into a list and the message separately - updated_dfs, message = submission_instance.multi_type_submission( - created_cell_lines, parsed_data['expression_alterations'], cell_lines_df, - created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data['library_preparations_df'], - parsed_data['sequencing_files_df'], self.submission_envelope_id, - self.dataset, self.access_token, self.action, self.submission_errors + updated_dfs, message = submission_instance.establish_links( + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors ) + return updated_dfs, message def _save_and_upload_results(self, updated_dfs, expression_alteration_df): diff --git a/ait/commons/util/command/view.py b/ait/commons/util/command/view.py index adf1a89..aa8fc74 100644 --- a/ait/commons/util/command/view.py +++ 
b/ait/commons/util/command/view.py @@ -18,8 +18,8 @@ def __init__(self, args): print("Dataset is mandatory for view") def run(self): - fetched_dataset = self.provider_api.get_to_provider_api(f"{self.base_url}/datasets/{self.dataset}", - self.access_token) + fetched_dataset = self.provider_api.get(f"{self.base_url}/datasets/{self.dataset}", + self.access_token) print(f"Dataset fetched successfully: {self.dataset}") print("Getting Biomaterials") biomaterials = fetched_dataset.get('biomaterials', []) @@ -27,8 +27,8 @@ def run(self): for biomaterial in biomaterials: print(biomaterial) - fetched_biomaterial = self.provider_api.get_to_provider_api(f"{self.base_url}/biomaterials/{biomaterial}", - self.access_token) + fetched_biomaterial = self.provider_api.get(f"{self.base_url}/biomaterials/{biomaterial}", + self.access_token) print(fetched_biomaterial) print("Getting Processes") @@ -37,8 +37,8 @@ def run(self): for process in processes: print(process) - fetched_process = self.provider_api.get_to_provider_api(f"{self.base_url}/processes/{process}", - self.access_token) + fetched_process = self.provider_api.get(f"{self.base_url}/processes/{process}", + self.access_token) print(fetched_process) print("Getting Data Files") @@ -47,8 +47,8 @@ def run(self): for file in files: print(files) - fetched_file = self.provider_api.get_to_provider_api(f"{self.base_url}/files/{file}", - self.access_token) + fetched_file = self.provider_api.get(f"{self.base_url}/files/{file}", + self.access_token) print(fetched_file) return True, "FETCHED SUCCESSFULLY" diff --git a/ait/commons/util/provider_api_util.py b/ait/commons/util/provider_api_util.py index 774dad6..851b052 100644 --- a/ait/commons/util/provider_api_util.py +++ b/ait/commons/util/provider_api_util.py @@ -5,7 +5,7 @@ class APIProvider: def __init__(self, base_url): self.base_url = base_url - def send_request(self, method, url, access_token, params=None, data=None, data_type_in_hal_link=None): + def request(self, method, url, 
access_token, params=None, data=None, data_type_in_hal_link=None): """ Sends an HTTP request to the specified URL with the given method. @@ -74,18 +74,18 @@ def send_request(self, method, url, access_token, params=None, data=None, data_t # Return the JSON-parsed response data for other successful requests return response.json() - def put_to_provider_api(self, url, access_token): - return self.send_request('PUT', url, access_token) + def put(self, url, access_token): + return self.request('PUT', url, access_token) - def get_to_provider_api(self, url, access_token): - return self.send_request('GET', url, access_token) + def get(self, url, access_token): + return self.request('GET', url, access_token) - def delete_to_provider_api_including_linked_entities(self, url, access_token, delete_linked_entities=False): + def delete_with_relations(self, url, access_token, delete_linked_entities=False): params = {'deleteLinkedEntities': str(delete_linked_entities).lower()} - return self.send_request('DELETE', url, access_token, params=params) + return self.request('DELETE', url, access_token, params=params) - def delete_to_provider_api(self, url, access_token): - return self.send_request('DELETE', url, access_token) + def delete(self, url, access_token): + return self.request('DELETE', url, access_token) - def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): - return self.send_request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link) + def post(self, url, data_type_in_hal_link, data, access_token): + return self.request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link) From 03976d6d4a43025167f7d57c998ec16173f8f2ab Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 3 Sep 2024 11:35:39 +0100 Subject: [PATCH 43/55] handling md5 checksums and new type of sheet having clonal and undifferentiated --- ait/commons/util/command/list.py | 27 ++++++++++++++------ 
ait/commons/util/command/submit_file.py | 34 ++++++++++++++++++++++--- ait/commons/util/command/upload.py | 17 ++++++++++++- 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index c28c7a7..ef5261e 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -10,6 +10,10 @@ def print_area(k, area): p = area.get('perms') or '' print(p.ljust(3), end=' ') + if 'md5' in area: + p = area.get('md5') or '' + print(p.ljust(3), end=' ') + if 'name' in area: n = area.get('name') print(f'{n}' if n else '', end=' ') @@ -44,13 +48,12 @@ def run(self): def list_bucket_contents(self, selected_area, prefix=''): result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/', Prefix=prefix) - # Folders dirs = result.get('CommonPrefixes', []) for d in dirs: k = d.get('Prefix') - print_area(k, {'key': k, 'perms': 'dir'}) + print_area(k, {'key': k, 'md5': None, 'perms': 'dir'}) self.list_bucket_contents(selected_area, prefix=k) # Files @@ -58,7 +61,10 @@ def list_bucket_contents(self, selected_area, prefix=''): for f in files: k = f.get('Key') - print_area(k, {'key': k, 'perms': 'file'}) + head_object_response = self.s3_cli.head_object(Bucket=selected_area, Key=k) + metadata = head_object_response.get('Metadata', {}) + hash_md5 = metadata.get('md5', 'MD5 checksum not found') + print_area(k, {'key': k, 'md5': hash_md5, 'perms': 'file'}) def list_bucket_contents_and_return(self, selected_area, prefix=''): """ @@ -71,26 +77,31 @@ def list_bucket_contents_and_return(self, selected_area, prefix=''): Returns: - A list of file keys in the bucket. """ - file_keys = [] + file_keys = [] # Initialize an empty list to store file keys. + # Define the recursive function to list bucket contents. def _list_bucket_contents(bucket, prefix): + # Call AWS S3 API to list objects with a specific prefix. 
result = self.s3_cli.list_objects_v2(Bucket=bucket, Delimiter='/', Prefix=prefix) - # Folders + # Handle directories (folders) first. dirs = result.get('CommonPrefixes', []) for d in dirs: k = d.get('Prefix') - # print_area(k, {'key': k, 'perms': 'dir'}) + # Recursively call the function to list contents of the subdirectory. _list_bucket_contents(bucket, prefix=k) - # Files + # Handle files at the current prefix level. files = result.get('Contents', []) for f in files: k = f.get('Key') - # print_area(k, {'key': k, 'perms': 'file'}) + # Add each file key to the list. file_keys.append(k) + # Start the recursive process to list all contents from the given prefix. _list_bucket_contents(selected_area, prefix) + + # Return the final list of all file keys found in the bucket. return file_keys diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 62e986a..aaf8f7e 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -182,7 +182,7 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) # TODO: Handle expression alterations in MODIFY created_expression_alterations = [] - + if self._is_add_action(): self._create_submission_envelope() @@ -245,26 +245,54 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) def _parse_spreadsheet(self, parser): try: + tab_names = parser.list_sheets() + cell_line_sheet_name = None + differentiated_cell_line_sheet_name = None + + if "Cell line" in tab_names: + cell_line_sheet_name = "Cell line" + elif "Clonal cell line" in tab_names: + cell_line_sheet_name = "Clonal cell line" + else: + self.validation_errors.append("Spreadsheet must contain a " + "'Cell line' or 'Clonal cell line' sheet.") + + if "Differentiated cell line" in tab_names: + differentiated_cell_line_sheet_name = "Differentiated cell line" + # elif "Undifferentiated product" in tab_names: + # differentiated_cell_line_sheet_name = 
"Undifferentiated product" + elif "Differentiated product" in tab_names: + differentiated_cell_line_sheet_name = "Differentiated product" + else: + self.validation_errors.append("Spreadsheet must contain a " + "'Differentiated cell line' or 'Undifferentiated product' " + "or 'Differentiated product' sheet.") + """Parse the spreadsheet into different sections.""" expression_alterations, expression_alterations_df = parser.get_expression_alterations( 'Expression alteration strategy', self.action, self.validation_errors ) + cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( - 'Cell line', self.action, self.validation_errors + cell_line_sheet_name, self.action, self.validation_errors ) + differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action, self.validation_errors + differentiated_cell_line_sheet_name, self.action, self.validation_errors ) + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( 'Library preparation', self.action, self.validation_errors ) + merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, self.validation_errors) sequencing_files, sequencing_files_df = parser.get_sequencing_files( 'Sequence file', self.action, self.validation_errors ) + merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) return { diff --git a/ait/commons/util/command/upload.py b/ait/commons/util/command/upload.py index 50eb4ea..5285522 100755 --- a/ait/commons/util/command/upload.py +++ b/ait/commons/util/command/upload.py @@ -1,3 +1,4 @@ +import hashlib import os import filetype @@ -9,6 +10,16 @@ from ait.commons.util.progress_bar import ProgressBar +def compute_md5(file_path): + """Compute the MD5 hash of the file.""" + hash_md5 = 
hashlib.md5() + + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + class CmdUpload: """ admin and user @@ -21,6 +32,8 @@ def __init__(self, aws, args): self.files = [] def upload_file(self, selected_area, data_file, destination_file): + hash_md5 = compute_md5(data_file) + print(f"MD5 hash of {data_file} is {hash_md5}") overwrite = getattr(self.args, 'o', False) file_size = os.path.getsize(data_file) @@ -45,7 +58,9 @@ def upload_file(self, selected_area, data_file, destination_file): s3.Bucket(selected_area).upload_file(Filename=data_file, Key=destination_file, Callback=ProgressBar(target=data_file, total=file_size), - ExtraArgs={'ContentType': content_type} + ExtraArgs={'ContentType': content_type, + 'Metadata': {'md5': hash_md5} + } ) def upload_files(self, data_files, prefix): From 15b5e900ec488f4eabf5b4c3b506bd7031b207df Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 4 Sep 2024 16:01:39 +0100 Subject: [PATCH 44/55] better error handling --- ait/commons/util/command/submit.py | 493 ++++++++++++++---------- ait/commons/util/command/submit_file.py | 309 +++++++++------ ait/commons/util/spreadsheet_util.py | 285 ++++++++++++-- 3 files changed, 734 insertions(+), 353 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 0905775..6d0b5c5 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -7,6 +7,7 @@ import numpy as np from urllib.parse import urlparse +from ait.commons.util.spreadsheet_util import SubmissionError from ait.commons.util.user_profile import get_profile from ait.commons.util.provider_api_util import APIProvider @@ -216,9 +217,15 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, - submission_envelope_id, dataset_id, - access_token, action, errors): + 
def handle_cell_line(self, + cell_line, + expression_alterations, + cell_lines_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Submits a cell line as a biomaterial entity to a specified submission envelope. @@ -234,22 +241,34 @@ def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, - cell_line_entity_id: Entity ID of the submitted or modified cell line biomaterial. """ if action.lower() == 'modify': - success = self.patch_entity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) - if success: - print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") - update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, - 'cell_line.biomaterial_core.biomaterial_id') - else: + try: + success = self.patch_entity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) + if success: + print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") + update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, + 'cell_line.biomaterial_core.biomaterial_id') + return cell_line.id + else: + errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") - return cell_line.id + raise SubmissionError(errors, e) else: - cell_line_entity_id = self.create_cell_line_entity(cell_line, expression_alterations, - submission_envelope_id, dataset_id, access_token) - update_dataframe(cell_lines_df, cell_line_entity_id, cell_line.biomaterial_id, - 'cell_line.biomaterial_core.biomaterial_id') - return cell_line_entity_id - - def create_cell_line_entity(self, cell_line, expression_alterations, submission_envelope_id, + try: + cell_line_entity_id = self.create_cell_line_entity(cell_line, expression_alterations, + submission_envelope_id, dataset_id, access_token) + update_dataframe(cell_lines_df, 
cell_line_entity_id, cell_line.biomaterial_id, + 'cell_line.biomaterial_core.biomaterial_id') + return cell_line_entity_id + except Exception as e: + errors.append(f"Failed to create cell line: {cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + + def create_cell_line_entity(self, + cell_line, + expression_alterations, + submission_envelope_id, dataset_id, access_token): """ @@ -284,7 +303,10 @@ def create_cell_line_entity(self, cell_line, expression_alterations, submission_ return cell_line_entity_id - def link_cell_line_with_expression_alterations(self, access_token, cell_line, cell_line_entity_id, + def link_cell_line_with_expression_alterations(self, + access_token, + cell_line, + cell_line_entity_id, expression_alterations): for expression_alteration in expression_alterations: if cell_line.expression_alteration_id is not None: @@ -297,9 +319,15 @@ def link_cell_line_with_expression_alterations(self, access_token, cell_line, ce expression_alteration.id, 'processes', access_token ) - def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_cell_line, - differentiated_cell_lines_df, submission_envelope_id, dataset_id, - access_token, action, errors): + def handle_differentiated_cell_line(self, + cell_line_entity_id, + differentiated_cell_line, + differentiated_cell_lines_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Handles a single differentiated cell line associated with a given cell line. @@ -314,32 +342,48 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce - access_token: Access token for authentication and authorization. 
""" if action.lower() == 'modify': - success = self.patch_entity('biomaterial', differentiated_cell_line.id, - differentiated_cell_line.to_dict(), - access_token) - if success: - print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " - f"{differentiated_cell_line.biomaterial_id}") - - update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, - differentiated_cell_line.biomaterial_id, - 'differentiated_cell_line.biomaterial_core.biomaterial_id') - else: + try: + success = self.patch_entity('biomaterial', differentiated_cell_line.id, + differentiated_cell_line.to_dict(), + access_token) + if success: + print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") + + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, + differentiated_cell_line.biomaterial_id, + 'differentiated_cell_line.biomaterial_core.biomaterial_id') + return differentiated_cell_line.id + else: + errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") - return differentiated_cell_line.id + raise SubmissionError(errors, e) + else: - differentiated_cell_line_id = self.create_differentiated_cell_line_entity(access_token, cell_line_entity_id, - dataset_id, - differentiated_cell_line, - submission_envelope_id) - update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, - differentiated_cell_line.biomaterial_id, - 'differentiated_cell_line.biomaterial_core.biomaterial_id') - return differentiated_cell_line_id - - def create_differentiated_cell_line_entity(self, access_token, cell_line_entity_id, dataset_id, - differentiated_cell_line, submission_envelope_id): + try: + 
differentiated_cell_line_id = self.create_differentiated_cell_line_entity(access_token, + cell_line_entity_id, + dataset_id, + differentiated_cell_line, + submission_envelope_id) + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, + differentiated_cell_line.biomaterial_id, + 'differentiated_cell_line.biomaterial_core.biomaterial_id') + return differentiated_cell_line_id + except Exception as e: + errors.append(f"Failed to create differentiated cell line: {differentiated_cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + + def create_differentiated_cell_line_entity(self, + access_token, + cell_line_entity_id, + dataset_id, + differentiated_cell_line, + submission_envelope_id): """ Creates a Differentiated Cell Line entity and links it to the submission envelope. @@ -401,8 +445,14 @@ def create_differentiated_cell_line_entity(self, access_token, cell_line_entity_ return differentiated_entity_id - def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_entity_id, differentiated_entity_id, - dataset_id, submission_envelope_id, action): + def link_cell_line_and_differentiated_cell_line(self, + access_token, + cell_line_entity_id, + differentiated_entity_id, + dataset_id, + submission_envelope_id, + action, + errors): """ Creates and links the differentiation process between the original cell line and the differentiated cell line. @@ -425,40 +475,52 @@ def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_en The ID of the differentiation process entity created. 
""" if action.lower() != 'modify': - print(f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " - f"to link them") - - # Create a differentiation process entity - differentiation_process_entity_id = self.create_process( - access_token, - dataset_id, - get_process_content('differentiation'), - submission_envelope_id - ) - - print( - f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") - - # Link the cell line entity as input to the differentiation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", - differentiation_process_entity_id, 'processes', access_token - ) - - print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") - - # Link the differentiated cell line entity as derived by the differentiation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", - differentiation_process_entity_id, 'processes', access_token - ) - - return differentiation_process_entity_id - - def handle_library_preparation(self, differentiated_entity_id, library_preparation, - library_preparations_df, submission_envelope_id, - dataset_id, access_token, action, errors): + try: + print( + f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " + f"to link them") + + # Create a differentiation process entity + differentiation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('differentiation'), + submission_envelope_id + ) + + print( + f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") + + # Link the cell line entity as input to the differentiation process + self.perform_hal_linkage( + 
f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") + + # Link the differentiated cell line entity as derived by the differentiation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + return differentiation_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Cell line {cell_line_entity_id} and Differentiated cell line {differentiated_entity_id}") + raise SubmissionError(errors, e) + + def handle_library_preparation(self, + differentiated_entity_id, + library_preparation, + library_preparations_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Handles a single library preparation associated with a given differentiated cell line. @@ -472,33 +534,46 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati - access_token: Access token for authentication and authorization. 
""" if action.lower() == 'modify': - success = self.patch_entity('biomaterial', library_preparation.id, - library_preparation.to_dict(), - access_token) - if success: - print(f"Updated library preparation biomaterial: {library_preparation.id} / " - f"{library_preparation.biomaterial_id}") - - update_dataframe(library_preparations_df, library_preparation.id, - library_preparation.biomaterial_id, - 'library_preparation.biomaterial_core.biomaterial_id') - else: + try: + success = self.patch_entity('biomaterial', library_preparation.id, + library_preparation.to_dict(), + access_token) + if success: + print(f"Updated library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") + + update_dataframe(library_preparations_df, library_preparation.id, + library_preparation.biomaterial_id, + 'library_preparation.biomaterial_core.biomaterial_id') + return library_preparation.id + else: + errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " f"{library_preparation.biomaterial_id}") - - return library_preparation.id + raise SubmissionError(errors, e) else: - library_preparation_entity_id = self.create_library_preparation_entity(access_token, dataset_id, - differentiated_entity_id, - library_preparation, - submission_envelope_id) - update_dataframe(library_preparations_df, library_preparation_entity_id, - library_preparation.biomaterial_id, - 'library_preparation.biomaterial_core.biomaterial_id') + try: + library_preparation_entity_id = self.create_library_preparation_entity(access_token, dataset_id, + differentiated_entity_id, + library_preparation, + submission_envelope_id) + update_dataframe(library_preparations_df, library_preparation_entity_id, + library_preparation.biomaterial_id, + 
'library_preparation.biomaterial_core.biomaterial_id') - return library_preparation_entity_id + return library_preparation_entity_id + except Exception as e: + errors.append(f"Failed to create library preparation biomaterial: {library_preparation.biomaterial_id}") + raise SubmissionError(errors, e) - def create_library_preparation_entity(self, access_token, dataset_id, differentiated_entity_id, library_preparation, + def create_library_preparation_entity(self, + access_token, + dataset_id, + differentiated_entity_id, + library_preparation, submission_envelope_id): """ Creates a Library Preparation entity for the Differentiated Cell Line and links it to the submission envelope and dataset. @@ -566,7 +641,8 @@ def link_differentiated_and_library_preparation(self, library_preparation_entity_id, dataset_id, submission_envelope_id, - action): + action, + errors): """ Links the Differentiated Cell Line to the Library Preparation through a library preparation process. @@ -589,38 +665,45 @@ def link_differentiated_and_library_preparation(self, The ID of the library preparation process entity created. 
""" if action.lower() != 'modify': - print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " - f"preparation process to link them") - - # Create a library preparation process entity - library_preparation_process_entity_id = self.create_process( - access_token, - dataset_id, - get_process_content('library_preparation'), - submission_envelope_id - ) - - print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " - f"preparation process") - - # Link the differentiated cell line entity as input to the library preparation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " - f"preparation process") - - # Link the library preparation entity as derived by the library preparation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - return library_preparation_process_entity_id + try: + print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " + f"preparation process to link them") + + # Create a library preparation process entity + library_preparation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('library_preparation'), + submission_envelope_id + ) + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " + f"preparation process") + + # Link the differentiated cell line entity as input to the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + 
library_preparation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " + f"preparation process") + + # Link the library preparation entity as derived by the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + return library_preparation_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Differentiated Cell line " + f"{differentiated_entity_id} and Library preparation" + f" {library_preparation_entity_id}") + raise SubmissionError(errors, e) def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, sequencing_file_df, submission_envelope_id, dataset_id, @@ -637,32 +720,41 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, - access_token: Access token for authentication and authorization. 
""" if action.lower() == 'modify': - success = self.patch_entity('file', sequencing_file.id, - sequencing_file.to_dict(), - access_token) - - if success: - print(f"Updated sequencing file: {sequencing_file.id} / " - f"{sequencing_file.file_name}") - - update_dataframe(sequencing_file_df, sequencing_file.id, - sequencing_file.file_name, - 'sequence_file.file_core.file_name') - else: + try: + success = self.patch_entity('file', sequencing_file.id, + sequencing_file.to_dict(), + access_token) + + if success: + print(f"Updated sequencing file: {sequencing_file.id} / " + f"{sequencing_file.file_name}") + + update_dataframe(sequencing_file_df, sequencing_file.id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') + return sequencing_file.id + else: + errors.append( + f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") - - return sequencing_file.id + raise SubmissionError(errors, e) else: - sequencing_file_entity_id = self.create_sequencing_file_entity(access_token, - dataset_id, - library_preparation_entity_id, - sequencing_file, - submission_envelope_id) - update_dataframe(sequencing_file_df, sequencing_file_entity_id, - sequencing_file.file_name, - 'sequence_file.file_core.file_name') + try: + sequencing_file_entity_id = self.create_sequencing_file_entity(access_token, + dataset_id, + library_preparation_entity_id, + sequencing_file, + submission_envelope_id) + update_dataframe(sequencing_file_df, sequencing_file_entity_id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') - return sequencing_file_entity_id + return sequencing_file_entity_id + except Exception as e: + errors.append(f"Failed to create Sequencing file: {sequencing_file.file_name}") + raise SubmissionError(errors, e) def create_sequencing_file_entity(self, access_token, 
dataset_id, library_preparation_entity_id, sequencing_file, submission_envelope_id): @@ -711,7 +803,8 @@ def link_library_preparation_and_sequencing_file(self, sequencing_file_entity_id, dataset_id, submission_envelope_id, - action): + action, + errors): """ Links the Library Preparation to the Sequencing File through a sequencing process. @@ -734,34 +827,41 @@ def link_library_preparation_and_sequencing_file(self, The ID of the sequencing process entity created. """ if action.lower() != 'modify': - print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." - f"Creating sequencing process to link the sequencing file") - - # Create a sequencing process entity - sequencing_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('sequencing'), - submission_envelope_id) - - print( - f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") - - # Link the library preparation entity as input to the sequencing process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", - sequencing_process_entity_id, 'processes', access_token - ) - - print( - f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") - - # Link the sequencing file entity as derived by the sequencing process - self.perform_hal_linkage( - f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", - sequencing_process_entity_id, 'processes', access_token - ) - - return sequencing_process_entity_id + try: + print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." 
+ f"Creating sequencing process to link the sequencing file") + + # Create a sequencing process entity + sequencing_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('sequencing'), + submission_envelope_id) + + print( + f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") + + # Link the library preparation entity as input to the sequencing process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") + + # Link the sequencing file entity as derived by the sequencing process + self.perform_hal_linkage( + f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + return sequencing_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Library Preparation " + f"{library_preparation_entity_id} and Sequencing file" + f" {sequencing_file_entity_id}") + raise SubmissionError(errors, e) def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): process_entity_id = self.use_existing_envelope_and_submit_entity( @@ -818,7 +918,8 @@ def establish_links(self, differentiated_cell_line.id, dataset_id, submission_envelope_id, - action) + action, + errors) for differentiated_cell_line in differentiated_cell_lines: for library_preparation in library_preparations: if differentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: @@ -828,7 +929,8 @@ def establish_links(self, library_preparation.id, dataset_id, submission_envelope_id, - action) + action, + errors) for library_preparation in library_preparations: for sequencing_file in 
sequencing_files: @@ -838,18 +940,19 @@ def establish_links(self, sequencing_file.id, dataset_id, submission_envelope_id, - action) + action, + errors) message = 'SUCCESS' except Exception as e: message = f"An error occurred: {str(e)}" errors.append(message) - traceback.print_exc() + raise SubmissionError(message, e) # Set DataFrames to None in case of an error - cell_lines_df = None - differentiated_cell_lines_df = None - library_preparations_df = None - sequencing_files_df = None + # cell_lines_df = None + # differentiated_cell_lines_df = None + # library_preparations_df = None + # sequencing_files_df = None return ([cell_lines_df, differentiated_cell_lines_df, diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index aaf8f7e..9079a4e 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -13,11 +13,14 @@ from ait.commons.util.provider_api_util import APIProvider from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter, ValidationError, \ merge_library_preparation_sequencing_file, merge_cell_line_and_differentiated_cell_line, \ - merge_differentiated_cell_line_and_library_preparation + merge_differentiated_cell_line_and_library_preparation, SubmissionError # Define a class for handling submission of a command file -def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, dataset, errors): +def validate_sequencing_files(sequencing_files, + list_of_files_in_upload_area, + dataset, + errors): for sequencing_file in sequencing_files: match_found = False # Flag to indicate if a match is found @@ -37,7 +40,9 @@ def get_content(unique_value): return {"content": unique_value} -def _create_expression_alterations(submission_instance, submission_envelope_id, access_token, +def _create_expression_alterations(submission_instance, + submission_envelope_id, + access_token, expression_alterations, expression_alterations_df): 
expression_alterations_entity_id_column_name = "Id" @@ -64,7 +69,7 @@ def _create_expression_alterations(submission_instance, submission_envelope_id, expression_alterations_entity_id_column_name ] = expression_alteration_id - return expression_alterations, expression_alterations_df + return expression_alterations class CmdSubmitFile: @@ -136,16 +141,16 @@ def run(self): self._process_submission(submission_instance, list_of_files_in_upload_area) return True, "SUBMISSION IS SUCCESSFUL." except Exception as e: - return self.delete_actions(self.submission_envelope_id, submission_instance, e) + return self._delete_actions(self.submission_envelope_id, submission_instance, e) except KeyboardInterrupt: # Handle the interruption and exit gracefully print("\nProcess interrupted by user. Exiting gracefully...") - self.delete_actions(self.submission_envelope_id, submission_instance, None) + self._delete_actions(self.submission_envelope_id, submission_instance, None) sys.exit(0) # Exit with a zero status code indicating a clean exit except Exception as e: # Handle any other unexpected exceptions print(f"An unexpected error occurred: {str(e)}") - self.delete_actions(self.submission_envelope_id, submission_instance, None) + self._delete_actions(self.submission_envelope_id, submission_instance, None) sys.exit(1) # Exit with a non-zero status code indicating an error def _is_delete_action(self): @@ -164,137 +169,177 @@ def _list_files_in_upload_area(self): return list_instance.list_bucket_contents_and_return(self.dataset, '') def _process_submission(self, submission_instance, list_of_files_in_upload_area): - """Process the file submission.""" - parser = SpreadsheetSubmitter(self.file) - parsed_data = self._parse_spreadsheet(parser) - self._validate_and_upload(parsed_data, list_of_files_in_upload_area) - expression_alterations = parsed_data['expression_alterations'] - expression_alterations_df = parsed_data['expression_alterations_df'] - parent_cell_line_name = 
parsed_data['parent_cell_line_name'] - cell_lines = parsed_data['cell_lines'] - cell_lines_df = parsed_data['cell_lines_df'] - differentiated_cell_lines = parsed_data['differentiated_cell_lines'] - differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] - library_preparations = parsed_data['library_preparations'] - library_preparations_df = parsed_data['library_preparations_df'] - sequencing_files = parsed_data['sequencing_files'] - sequencing_files_df = parsed_data['sequencing_files_df'] - - # TODO: Handle expression alterations in MODIFY - created_expression_alterations = [] - - if self._is_add_action(): - self._create_submission_envelope() - - parent_cell_line_id = None - - if parent_cell_line_name is not None: - print(f"Creating parental cell line with name {parent_cell_line_name}") - parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) - # TODO: link parental cell line with dataset - - print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") - - if expression_alterations and expression_alterations_df is not None: - created_expression_alterations, expression_alteration_df = self._submit_expression_alterations( - submission_instance, expression_alterations, expression_alterations_df) + try: + """Process the file submission.""" + parser = SpreadsheetSubmitter(self.file) + parsed_data = self._parse_spreadsheet(parser) + self._validate_and_upload(parsed_data, list_of_files_in_upload_area) + + # Extract parsed data + expression_alterations = parsed_data['expression_alterations'] + expression_alterations_df = parsed_data['expression_alterations_df'] + parent_cell_line_name = parsed_data['parent_cell_line_name'] + cell_lines = parsed_data['cell_lines'] + cell_lines_df = parsed_data['cell_lines_df'] + differentiated_cell_lines = parsed_data['differentiated_cell_lines'] + differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] + 
library_preparations = parsed_data['library_preparations'] + library_preparations_df = parsed_data['library_preparations_df'] + sequencing_files = parsed_data['sequencing_files'] + sequencing_files_df = parsed_data['sequencing_files_df'] + + # Initialize lists for created entities + created_expression_alterations = [] + created_cell_lines = [] + created_differentiated_cell_lines = [] + created_library_preparations = [] + created_sequencing_files = [] - if created_expression_alterations and parent_cell_line_id is not None: - self.link_parent_cell_line_expression_alteration( - submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + if self._is_add_action(): + self._create_submission_envelope() + parent_cell_line_id = self._handle_parent_cell_line(submission_instance, parent_cell_line_name) + created_expression_alterations = self._handle_expression_alterations( + submission_instance, expression_alterations, expression_alterations_df, parent_cell_line_id ) - created_cell_lines = [] - - if cell_lines and cell_lines_df is not None: - created_cell_lines, cell_lines_df = self._create_cell_lines( - submission_instance, cell_lines, cell_lines_df, created_expression_alterations) - - created_differentiated_cell_lines = [] - - if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( - submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) - - created_library_preparations = [] - - if library_preparations and library_preparations_df is not None: - created_library_preparations, library_preparations_df = self._create_library_preparations( - submission_instance, library_preparations, library_preparations_df) - - created_sequencing_files = [] - - if sequencing_files and sequencing_files_df is not None: - created_sequencing_files, sequencing_files_df = self._create_sequencing_files( - 
submission_instance, sequencing_files, sequencing_files_df) - - updated_dfs, message = self._establish_links(submission_instance, - created_cell_lines, - cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, - created_library_preparations, - library_preparations_df, - created_sequencing_files, - sequencing_files_df) - - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) - else: - return self.delete_actions(self.submission_envelope_id, submission_instance, None) + if cell_lines and cell_lines_df is not None: + created_cell_lines = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df, created_expression_alterations) + + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + + if library_preparations and library_preparations_df is not None: + created_library_preparations = self._create_library_preparations( + submission_instance, library_preparations, library_preparations_df) + + if sequencing_files and sequencing_files_df is not None: + created_sequencing_files = self._create_sequencing_files( + submission_instance, sequencing_files, sequencing_files_df) + + updated_dfs, message = self._establish_links(submission_instance, + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df) + + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, expression_alterations_df) + else: + return self._delete_actions(self.submission_envelope_id, submission_instance, None) + except ValidationError as e: + print(f"Validation Error: {e.errors}") + self._delete_actions(self.submission_envelope_id, submission_instance, e) + sys.exit(1) + except 
SubmissionError as e: + print(f"Submission Error: {e.errors}") + self._delete_actions(self.submission_envelope_id, submission_instance, e) + sys.exit(1) + except Exception as e: + print(f"An unexpected error occurred during submission processing: {e}") + self._delete_actions(self.submission_envelope_id, submission_instance, e) + raise e # Re-raise the exception to propagate it upwards + + def _handle_parent_cell_line(self, submission_instance, parent_cell_line_name): + """Handles the creation of a parent cell line.""" + parent_cell_line_id = None + if parent_cell_line_name: + print(f"Creating parental cell line with name {parent_cell_line_name}") + parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + return parent_cell_line_id + + def _handle_expression_alterations(self, + submission_instance, + expression_alterations, + expression_alterations_df, + parent_cell_line_id): + """Handles the creation of expression alterations and links them to the parent cell line if needed.""" + created_expression_alterations = [] + if expression_alterations and expression_alterations_df is not None: + created_expression_alterations = self._submit_expression_alterations( + submission_instance, expression_alterations, expression_alterations_df + ) + if created_expression_alterations and parent_cell_line_id: + self._link_parent_cell_line_expression_alteration( + submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + ) + return created_expression_alterations def _parse_spreadsheet(self, parser): try: + # Determine the necessary sheet names tab_names = parser.list_sheets() - cell_line_sheet_name = None - differentiated_cell_line_sheet_name = None + cell_line_sheet_name = next( + (name for name in ["Cell line", "Clonal cell line"] if name in tab_names), None + ) + differentiated_cell_line_sheet_name = 
next( + (name for name in ["Differentiated cell line", "Differentiated product"] if name in tab_names), None + ) + undifferentiated_cell_line_sheet_name = ( + "Undifferentiated product" if "Undifferentiated product" in tab_names else None + ) - if "Cell line" in tab_names: - cell_line_sheet_name = "Cell line" - elif "Clonal cell line" in tab_names: - cell_line_sheet_name = "Clonal cell line" - else: - self.validation_errors.append("Spreadsheet must contain a " - "'Cell line' or 'Clonal cell line' sheet.") + undifferentiated_cell_lines = [] + undifferentiated_cell_lines_df = None - if "Differentiated cell line" in tab_names: - differentiated_cell_line_sheet_name = "Differentiated cell line" - # elif "Undifferentiated product" in tab_names: - # differentiated_cell_line_sheet_name = "Undifferentiated product" - elif "Differentiated product" in tab_names: - differentiated_cell_line_sheet_name = "Differentiated product" - else: + # Validate the presence of required sheets + if not cell_line_sheet_name: self.validation_errors.append("Spreadsheet must contain a " - "'Differentiated cell line' or 'Undifferentiated product' " - "or 'Differentiated product' sheet.") + "'Cell line' or 'Clonal cell line' sheet.") + if not (differentiated_cell_line_sheet_name or undifferentiated_cell_line_sheet_name): + self.validation_errors.append( + "Spreadsheet must contain a " + "'Differentiated cell line', 'Undifferentiated product', " + "or 'Differentiated product' sheet." 
+ ) - """Parse the spreadsheet into different sections.""" + # Parse different sections of the spreadsheet expression_alterations, expression_alterations_df = parser.get_expression_alterations( 'Expression alteration strategy', self.action, self.validation_errors ) - cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( cell_line_sheet_name, self.action, self.validation_errors ) - differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( differentiated_cell_line_sheet_name, self.action, self.validation_errors ) - merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) + if undifferentiated_cell_line_sheet_name: + undifferentiated_cell_lines, undifferentiated_cell_lines_df = parser.get_undifferentiated_cell_lines( + undifferentiated_cell_line_sheet_name, self.action, self.validation_errors + ) + + # Check for errors and merge data + if differentiated_cell_lines and undifferentiated_cell_lines: + self.validation_errors.append( + "A spreadsheet cannot contain rows in both differentiated and undifferentiated cell lines/products" + ) + + if differentiated_cell_lines: + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, + self.validation_errors) + if undifferentiated_cell_lines: + merge_cell_line_and_differentiated_cell_line(cell_lines, undifferentiated_cell_lines, + self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( 'Library preparation', self.action, self.validation_errors ) - - merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, - self.validation_errors) + merge_differentiated_cell_line_and_library_preparation( + differentiated_cell_lines, library_preparations, self.validation_errors + ) sequencing_files, sequencing_files_df = parser.get_sequencing_files( 'Sequence file', self.action, self.validation_errors ) - 
merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) + # Return the parsed data as a dictionary return { "expression_alterations": expression_alterations, "expression_alterations_df": expression_alterations_df, @@ -303,6 +348,8 @@ def _parse_spreadsheet(self, parser): "parent_cell_line_name": parent_cell_line_name, "differentiated_cell_lines": differentiated_cell_lines, "differentiated_cell_lines_df": differentiated_cell_lines_df, + "undifferentiated_cell_lines": undifferentiated_cell_lines, + "undifferentiated_cell_lines_df": undifferentiated_cell_lines_df, "library_preparations": library_preparations, "library_preparations_df": library_preparations_df, "sequencing_files": sequencing_files, @@ -342,7 +389,7 @@ def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): sys.exit(1) else: # Print the error message - print(e) + print(f"Validation Error: {e.errors}") # Exit the program with a non-zero status code to indicate an error sys.exit(1) @@ -369,7 +416,7 @@ def _create_submission_envelope(self): self.submission_envelope_id = get_id_from_url(submission_envelope_response['_links']['self']['href']) print(f"Submission envelope for this submission is: {self.submission_envelope_id}") else: - raise Exception(f"Failed to create submission envelope. Status code: {status_code}") + raise SubmissionError(f"Failed to create submission envelope. 
Status code: {status_code}") def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): """Submit the parent cell line.""" @@ -378,14 +425,21 @@ def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): self.submission_envelope_id, self.access_token ) - def _submit_expression_alterations(self, submission_instance, expression_alterations, expression_alterations_df): + def _submit_expression_alterations(self, + submission_instance, + expression_alterations, + expression_alterations_df): """Submit expression alterations.""" return _create_expression_alterations( submission_instance, self.submission_envelope_id, self.access_token, expression_alterations, expression_alterations_df ) - def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df, expression_alterations): + def _create_cell_lines(self, + submission_instance, + cell_lines, + cell_lines_df, + expression_alterations): for cell_line in cell_lines: cell_line_entity_id = submission_instance.handle_cell_line(cell_line, expression_alterations, cell_lines_df, self.submission_envelope_id, self.dataset, @@ -393,9 +447,11 @@ def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df, exp self.submission_errors) cell_line.id = cell_line_entity_id - return cell_lines, cell_lines_df + return cell_lines - def _create_differentiated_cell_lines(self, submission_instance, differentiated_cell_lines, + def _create_differentiated_cell_lines(self, + submission_instance, + differentiated_cell_lines, differentiated_cell_lines_df): for differentiated_cell_line in differentiated_cell_lines: differentiated_cell_line_entity_id = submission_instance.handle_differentiated_cell_line(None, @@ -408,9 +464,11 @@ def _create_differentiated_cell_lines(self, submission_instance, differentiated_ self.submission_errors) differentiated_cell_line.id = differentiated_cell_line_entity_id - return differentiated_cell_lines, differentiated_cell_lines_df + return 
differentiated_cell_lines - def _create_library_preparations(self, submission_instance, library_preparations, + def _create_library_preparations(self, + submission_instance, + library_preparations, library_preparations_df): for library_preparation in library_preparations: library_preparation_entity_id = submission_instance.handle_library_preparation(None, @@ -423,9 +481,11 @@ def _create_library_preparations(self, submission_instance, library_preparations self.submission_errors) library_preparation.id = library_preparation_entity_id - return library_preparations, library_preparations_df + return library_preparations - def _create_sequencing_files(self, submission_instance, sequencing_files, + def _create_sequencing_files(self, + submission_instance, + sequencing_files, sequencing_files_df): for sequencing_file in sequencing_files: sequencing_file_entity_id = submission_instance.handle_sequencing_file(None, @@ -438,7 +498,7 @@ def _create_sequencing_files(self, submission_instance, sequencing_files, self.submission_errors) sequencing_file.id = sequencing_file_entity_id - return sequencing_files, sequencing_files_df + return sequencing_files def _establish_links(self, submission_instance, @@ -497,7 +557,7 @@ def _save_and_upload_results(self, updated_dfs, expression_alteration_df): except Exception as e: print(f"Failed to upload file {output_file}. 
Error: {e}, Refer dataset {self.dataset} for tracing metadata") - def delete_actions(self, submission_envelope_id, submission_instance, error=None): + def _delete_actions(self, submission_envelope_id, submission_instance, error=None): """Handle actions needed when a submission fails.""" try: if self._is_add_action(): @@ -533,10 +593,11 @@ def _handle_modify_action_failure(self, error): else: return False, "Submission has failed, rolled back" - def link_parent_cell_line_expression_alteration(self, submission_instance, - access_token, - parent_cell_line_id, - created_expression_alterations): + def _link_parent_cell_line_expression_alteration(self, + submission_instance, + access_token, + parent_cell_line_id, + created_expression_alterations): for expression_alteration in created_expression_alterations: print(f"Linking parent cell line {parent_cell_line_id} " f"as input to process of {expression_alteration.expression_alteration_id}") diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 28dad58..23e4595 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -1,3 +1,5 @@ +import traceback + import pandas as pd import json import numpy as np @@ -20,10 +22,35 @@ def add_error(self, missing_type, entity_type, missing_id, errors): class ValidationError(Exception): def __init__(self, errors): self.errors = errors - super().__init__("Validation errors occurred") + super().__init__(self._format_message()) + + def _format_message(self): + # This method formats the error message that will be displayed when the exception is raised. + return "Validation errors occurred:\n" + "\n".join(self.errors) - def __str__(self): - return "\n".join(self.errors) + +class SubmissionError(Exception): + """ + Exception raised for errors during submission. + Includes a list of errors and an optional underlying exception. 
+ """ + + def __init__(self, errors, original_exception=None): + self.errors = errors + self.original_exception = original_exception # Store the original exception + super().__init__(self._format_message()) + + def _format_message(self): + """ + Format the error message to include both the list of submission errors and details of the original exception. + """ + message = "Submission errors occurred:\n" + "\n".join(self.errors) + if self.original_exception: + message += "\n\nOriginal Exception Details:\n" + message += f"Type: {type(self.original_exception).__name__}\n" + message += f"Message: {str(self.original_exception)}\n" + message += "Stack Trace:\n" + "".join(traceback.format_tb(self.original_exception.__traceback__)) + return message """ @@ -37,8 +64,16 @@ def __init__(self, type, id): class CellLine: - def __init__(self, biomaterial_id, description, derived_from_accession, - clone_id, protocol_id, zygosity, cell_type, expression_alteration_id, id): + def __init__(self, + biomaterial_id, + description, + derived_from_accession, + clone_id, + protocol_id, + zygosity, + cell_type, + expression_alteration_id, + id): self.biomaterial_id = biomaterial_id self.description = description self.derived_from_accession = derived_from_accession @@ -81,9 +116,19 @@ def to_dict(self): class ExpressionAlterationStrategy: - def __init__(self, expression_alteration_id, protocol_id, allele_specific, altered_gene_symbols, altered_gene_ids, - targeted_genomic_region, expected_alteration_type, sgrna_target, - protocol_method_text, altered_locus, guide_sequence, id): + def __init__(self, + expression_alteration_id, + protocol_id, + allele_specific, + altered_gene_symbols, + altered_gene_ids, + targeted_genomic_region, + expected_alteration_type, + sgrna_target, + protocol_method_text, + altered_locus, + guide_sequence, + id): self.expression_alteration_id = expression_alteration_id self.protocol_id = protocol_id self.allele_specific = allele_specific @@ -120,8 +165,16 @@ def 
to_dict(self): class DifferentiatedCellLine: - def __init__(self, biomaterial_id, description, input_biomaterial_id, protocol_id, timepoint_value, timepoint_unit, - terminally_differentiated, model_system, id): + def __init__(self, + biomaterial_id, + description, + input_biomaterial_id, + protocol_id, + timepoint_value, + timepoint_unit, + terminally_differentiated, + model_system, + id): self.biomaterial_id = biomaterial_id self.description = description self.input_biomaterial_id = input_biomaterial_id @@ -162,10 +215,21 @@ def to_dict(self): class LibraryPreparation: - def __init__(self, biomaterial_id, protocol_id, dissociation_protocol_id, differentiated_biomaterial_id, - average_fragment_size, input_amount_value, input_amount_unit, - final_yield_value, final_yield_unit, concentration_value, concentration_unit, - pcr_cycles, pcr_cycles_for_sample_index, id): + def __init__(self, + biomaterial_id, + protocol_id, + dissociation_protocol_id, + differentiated_biomaterial_id, + average_fragment_size, + input_amount_value, + input_amount_unit, + final_yield_value, + final_yield_unit, + concentration_value, + concentration_unit, + pcr_cycles, + pcr_cycles_for_sample_index, + id): self.biomaterial_id = biomaterial_id self.protocol_id = protocol_id self.dissociation_protocol_id = dissociation_protocol_id @@ -226,8 +290,17 @@ class EntityType: class SequencingFile: - def __init__(self, file_name, extension, read_index, lane_index=None, read_length=None, checksum=None, - library_preparation_id=None, sequencing_protocol_id=None, run_id=None, id=None): + def __init__(self, + file_name, + extension, + read_index, + lane_index=None, + read_length=None, + checksum=None, + library_preparation_id=None, + sequencing_protocol_id=None, + run_id=None, + id=None): self.file_name = file_name self.extension = extension self.read_index = read_index @@ -272,8 +345,13 @@ def convert_to_valid_json_value(value): } -def find_orphans(source_entities, target_entities, - source_attr, 
target_attr, source_type, target_type, errors): +def find_orphans(source_entities, + target_entities, + source_attr, + target_attr, + source_type, + target_type, + errors): """ Validates that each source entity has a corresponding target entity. @@ -303,7 +381,9 @@ def find_orphans(source_entities, target_entities, # print(f"VALIDATED: All {source_type.lower()}s have corresponding {target_type.lower()}s.") -def merge_library_preparation_sequencing_file(library_preparations, sequencing_files, errors): +def merge_library_preparation_sequencing_file(library_preparations, + sequencing_files, + errors): """ Merges library preparations and sequencing files based on their IDs. @@ -349,7 +429,8 @@ def merge_library_preparation_sequencing_file(library_preparations, sequencing_f library_preparation.add_sequencing_file(sequencing_file) -def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, +def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, + library_preparations, errors): """ Merges differentiated cell lines and library preparations based on their biomaterial IDs. @@ -399,7 +480,8 @@ def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_l def merge_cell_line_and_differentiated_cell_line(cell_lines, - differentiated_cell_lines, errors): + differentiated_cell_lines, + errors): """ Merges cell lines and differentiated cell lines based on their biomaterial IDs. 
@@ -537,11 +619,14 @@ def input_file_to_data_frames(self, sheet_name, action): df = pd.read_excel(self.file_path, sheet_name=sheet_names[trimmed_sheet_name], engine='openpyxl', skiprows=skip_rows) else: - raise ValueError(f"Sheet '{sheet_name}' not found in the spreadsheet.") + raise ValidationError(f"Sheet '{sheet_name}' not found in the spreadsheet.") return df - def parse_cell_lines(self, sheet_name, action, errors): + def parse_cell_lines(self, + sheet_name, + action, + errors): """ Parses data related to cell lines from a specified sheet in the Excel file. @@ -627,7 +712,10 @@ def parse_cell_lines(self, sheet_name, action, errors): return cell_lines, df_filtered, parent_cell_line_names[0] - def parse_differentiated_cell_lines(self, sheet_name, action, errors): + def parse_differentiated_cell_lines(self, + sheet_name, + action, + errors): """ Parses data related to differentiated cell lines from a specified sheet in the Excel file. @@ -651,8 +739,8 @@ def parse_differentiated_cell_lines(self, sheet_name, action, errors): # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: - errors.append("The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " - "exist.") + errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " + f"exist in {sheet_name}.") return [], df # Filter rows where biomaterial_id is not null @@ -705,7 +793,91 @@ def parse_differentiated_cell_lines(self, sheet_name, action, errors): return differentiated_cell_lines, df_filtered - def parse_library_preparations(self, sheet_name, action, errors): + def parse_undifferentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Parses data related to differentiated cell lines from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. 
+ column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. + """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + # df = df.rename(columns=column_mapping) + # Remove unnamed columns (columns without headers) + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + + # Check if the required column exists + if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: + errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " + f"exist in {sheet_name}.") + return [], df + + # Filter rows where biomaterial_id is not null + df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'differentiated_cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) + # Apply the mask to filter out rows + df_filtered = df[mask] + # Check for mandatory fields and create Differentiated CellLine objects + undifferentiated_cell_lines = [] + + for _, row in df_filtered.iterrows(): + differentiated_biomaterial_id = row['differentiated_cell_line.biomaterial_core.biomaterial_id'] + biomaterial_id = row.get('cell_line.biomaterial_core.biomaterial_id') + + # Check if biomaterial_id is null + if pd.isnull(differentiated_biomaterial_id): + errors.append("Differentiated Cell line ID cannot be null in any row of 
the Differentiated Cell line " + "sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") + + # Check if derived_accession and cell_type are present + if pd.isnull(biomaterial_id): + errors.append(f"Input Cell line ID cannot be null for Differentiated Cell line: " + f"{differentiated_biomaterial_id}") + """ + raise MissingMandatoryFieldError( + "Input Cell line ID cannot be null. " + differentiated_biomaterial_id) + """ + + # Create DifferentiatedCellLine objects from filtered DataFrame rows + undifferentiated_cell_lines.append( + DifferentiatedCellLine( + biomaterial_id=differentiated_biomaterial_id, + description=row.get('differentiated_cell_line.biomaterial_core.biomaterial_description'), + input_biomaterial_id=biomaterial_id, + protocol_id=row.get('differentiation_protocol.protocol_core.protocol_id'), + timepoint_value=row.get('differentiated_cell_line.timepoint_value'), + timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), + terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), + model_system=row.get('differentiated_cell_line.model_organ.text'), + id=row.get('Id') + ) + ) + + return undifferentiated_cell_lines, df_filtered + + def parse_library_preparations(self, + sheet_name, + action, + errors): """ Parses data related to library preparations from a specified sheet in the Excel file. @@ -795,7 +967,10 @@ def parse_library_preparations(self, sheet_name, action, errors): return library_preparations, df_filtered - def parse_sequencing_files(self, sheet_name, action, errors): + def parse_sequencing_files(self, + sheet_name, + action, + errors): """ Parses data related to sequencing files from a specified sheet in the Excel file. 
@@ -884,7 +1059,10 @@ def parse_sequencing_files(self, sheet_name, action, errors): return sequencing_files, df_filtered - def parse_expression_alteration(self, sheet_name, action, errors): + def parse_expression_alteration(self, + sheet_name, + action, + errors): """ Parses data related to expression alterations from a specified sheet in the Excel file. @@ -965,7 +1143,10 @@ def parse_expression_alteration(self, sheet_name, action, errors): # Return the list of objects, the filtered DataFrame, and a flag indicating success return expression_alterations, df_filtered - def get_cell_lines(self, sheet_name, action, errors): + def get_cell_lines(self, + sheet_name, + action, + errors): """ Retrieves parsed cell lines data from a specified sheet in the Excel file. @@ -984,7 +1165,10 @@ def get_cell_lines(self, sheet_name, action, errors): cell_lines, cell_lines_df, parent_cell_line_name = self.parse_cell_lines(sheet_name, action, errors) return cell_lines, cell_lines_df, parent_cell_line_name - def get_differentiated_cell_lines(self, sheet_name, action, errors): + def get_differentiated_cell_lines(self, + sheet_name, + action, + errors): """ Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. @@ -1004,7 +1188,34 @@ def get_differentiated_cell_lines(self, sheet_name, action, errors): action, errors) return differentiated_cell_lines, differentiated_cell_lines_df - def get_library_preparations(self, sheet_name, action, errors): + def get_undifferentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. 
+ """ + undifferentiated_cell_lines, undifferentiated_cell_lines_df = self.parse_undifferentiated_cell_lines(sheet_name, + action, + errors) + return undifferentiated_cell_lines, undifferentiated_cell_lines_df + + def get_library_preparations(self, + sheet_name, + action, + errors): """ Retrieves parsed library preparations data from a specified sheet in the Excel file. @@ -1024,7 +1235,10 @@ def get_library_preparations(self, sheet_name, action, errors): action, errors) return library_preparations, df_filtered - def get_sequencing_files(self, sheet_name, action, errors): + def get_sequencing_files(self, + sheet_name, + action, + errors): """ Retrieves parsed sequencing files data from a specified sheet in the Excel file. @@ -1043,6 +1257,9 @@ def get_sequencing_files(self, sheet_name, action, errors): sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action, errors) return sequencing_files, df_filtered - def get_expression_alterations(self, sheet_name, action, errors): + def get_expression_alterations(self, + sheet_name, + action, + errors): expression_alterations, df_filtered = self.parse_expression_alteration(sheet_name, action, errors) return expression_alterations, df_filtered From f15c6f2a691849ce20b5a7e23b5052e4d2e29619 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 5 Sep 2024 14:51:51 +0100 Subject: [PATCH 45/55] better error handling --- ait/commons/util/command/submit_file.py | 7 ++++--- ait/commons/util/spreadsheet_util.py | 15 ++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 9079a4e..405d9e1 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -234,7 +234,7 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) return self._delete_actions(self.submission_envelope_id, submission_instance, None) except ValidationError as e: 
print(f"Validation Error: {e.errors}") - self._delete_actions(self.submission_envelope_id, submission_instance, e) + # self._delete_actions(self.submission_envelope_id, submission_instance, e) sys.exit(1) except SubmissionError as e: print(f"Submission Error: {e.errors}") @@ -389,9 +389,10 @@ def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): sys.exit(1) else: # Print the error message - print(f"Validation Error: {e.errors}") + # print(f"Validation Error: {e.errors}") # Exit the program with a non-zero status code to indicate an error - sys.exit(1) + # sys.exit(1) + raise ValidationError(self.validation_errors) print(f"File {self.file} is validated successfully. Initiating submission") print(f"File {self.file} being uploaded to storage") diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 23e4595..8c050a6 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -649,8 +649,8 @@ def parse_cell_lines(self, # Check if the required column exists if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: errors.append( - "The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in the Cell line sheet. " - "The rest of the file will not be processed") + f"The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null @@ -740,7 +740,7 @@ def parse_differentiated_cell_lines(self, # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " - f"exist in {sheet_name}.") + f"exist in {sheet_name} name. 
The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null @@ -821,7 +821,7 @@ def parse_undifferentiated_cell_lines(self, # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " - f"exist in {sheet_name}.") + f"exist in {sheet_name}. The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null @@ -906,7 +906,8 @@ def parse_library_preparations(self, for col in required_columns: if col not in df.columns: - errors.append(f"The column '{col}' does not exist in the Library Preparation sheet.") + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") return [], df @@ -1001,7 +1002,8 @@ def parse_sequencing_files(self, for col in required_columns: if col not in df.columns: - errors.append(f"The column '{col}' does not exist in the Sequencing File sheet.") + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. 
" + f"The rest of the file will not be processed") return [], df @@ -1104,7 +1106,6 @@ def parse_expression_alteration(self, # Filter rows where 'expression_alteration_id' is not null df = df[df['expression_alteration_id'].notna()] - # Replace invalid float values (e.g., NaN, infinite) with None df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) From 24d946f1681d03f9cca9b8c6c6ee389196498367 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 6 Sep 2024 17:18:23 +0100 Subject: [PATCH 46/55] correct name for expression_alteration_id while object construction --- ait/commons/util/spreadsheet_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 8c050a6..ff9fec2 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -148,7 +148,7 @@ def __repr__(self): def to_dict(self): return { "content": { - "label": self.expression_alteration_id, + "expression_alteration_label": self.expression_alteration_id, "protocol_id": self.protocol_id, "allele_specific": self.allele_specific, "altered_gene_symbols": self.altered_gene_symbols, From ea8b62f7535a4e225b94238b4144890e0c04823c Mon Sep 17 00:00:00 2001 From: dgupta Date: Tue, 10 Sep 2024 15:55:35 +0100 Subject: [PATCH 47/55] increment version --- ait/commons/util/settings/morphic_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 2655adf..382b391 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '1.0.0' +VERSION = '1.0.1' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From 76ff747795e11adac8475f2eb0cad19581e120ea Mon Sep 17 00:00:00 2001 
From: dipayan1985 Date: Mon, 30 Sep 2024 16:56:40 +0100 Subject: [PATCH 48/55] improvements --- ait/commons/util/aws_client.py | 1 - ait/commons/util/command/submit.py | 164 +++++++++++++----------- ait/commons/util/command/submit_file.py | 127 +++++++++++++----- 3 files changed, 189 insertions(+), 103 deletions(-) diff --git a/ait/commons/util/aws_client.py b/ait/commons/util/aws_client.py index 60deab1..dea6c66 100755 --- a/ait/commons/util/aws_client.py +++ b/ait/commons/util/aws_client.py @@ -1,7 +1,6 @@ import json import boto3 -import botocore from ait.commons.util.aws_cognito_authenticator import AwsCognitoAuthenticator from ait.commons.util.settings import AWS_SECRET_NAME_AK_BUCKET, AWS_SECRET_NAME_SK_BUCKET, \ diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 6d0b5c5..ad17000 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -18,7 +18,7 @@ def matching_expression_alteration_and_cell_line(cell_line, expression_alteratio " ", "").strip() -def get_id_from_url(url): +def get_entity_id_from_hal_link(url): """ Extracts and returns the ID from a given URL. @@ -136,7 +136,7 @@ def post_to_provider_api_and_get_entity_id(url, data, access_token): response_data = response.json() entity_url = response_data['_links']['self']['href'] - return get_id_from_url(entity_url) + return get_entity_id_from_hal_link(entity_url) def post_to_provider_api(url, data_type_in_hal_link, data, access_token): @@ -169,9 +169,9 @@ class CmdSubmit: A class to handle submission of studies, datasets, and biomaterials to a server. Attributes: - base_url (str): The base URL for the server. - submission_envelope_create_url (str): URL for creating submission envelopes. - submission_envelope_base_url (str): Base URL for submission envelopes. + BASE_URL (str): The base URL for the server. + SUBMISSION_ENVELOPE_CREATE_URL (str): URL for creating submission envelopes. 
+ SUBMISSION_ENVELOPE_BASE_URL (str): Base URL for submission envelopes. args (Namespace): Command-line arguments. access_token (str): Access token for authorization. type (str): Type of submission (study, dataset, or biomaterial). @@ -191,9 +191,9 @@ class CmdSubmit: transform(file): Transforms the input file to a JSON object. put_to_provider_api(url, access_token): Sends a PUT request to the provider API. """ - base_url = 'https://api.ingest.dev.archive.morphic.bio' - submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" - submission_envelope_base_url = f"{base_url}/submissionEnvelopes" + BASE_URL = 'http://localhost:8080' + SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" + SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" def __init__(self, args): """ @@ -206,7 +206,7 @@ def __init__(self, args): self.access_token = get_profile('morphic-util').access_token self.type = getattr(self.args, 'type', None) self.file = getattr(self.args, 'file', None) - self.provider_api = APIProvider(self.base_url) + self.provider_api = APIProvider(self.BASE_URL) def run(self): """ @@ -311,11 +311,11 @@ def link_cell_line_with_expression_alterations(self, for expression_alteration in expression_alterations: if cell_line.expression_alteration_id is not None: if matching_expression_alteration_and_cell_line(cell_line, expression_alteration): - print(f"Linking cell line {cell_line_entity_id} " + print(f"Linking cell line {cell_line.biomaterial_id} " f"as derived by process of {expression_alteration.expression_alteration_id}") self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/derivedByProcesses", + f"{self.BASE_URL}/biomaterials/{cell_line_entity_id}/derivedByProcesses", expression_alteration.id, 'processes', access_token ) @@ -447,8 +447,8 @@ def create_differentiated_cell_line_entity(self, def link_cell_line_and_differentiated_cell_line(self, access_token, - 
cell_line_entity_id, - differentiated_entity_id, + cell_line, + differentiated_or_undifferentiated_cell_line, dataset_id, submission_envelope_id, action, @@ -475,9 +475,13 @@ def link_cell_line_and_differentiated_cell_line(self, The ID of the differentiation process entity created. """ if action.lower() != 'modify': + cell_line_biomaterial_id = cell_line.biomaterial_id + differentiated_or_undifferentiated_cell_line_biomaterial_id = differentiated_or_undifferentiated_cell_line.biomaterial_id + try: + print( - f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " + f"Cell line {cell_line_biomaterial_id} has differentiated cell lines, creating differentiation process " f"to link them") # Create a differentiation process entity @@ -489,27 +493,31 @@ def link_cell_line_and_differentiated_cell_line(self, ) print( - f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") + f"Linking Cell Line Biomaterial: {cell_line_biomaterial_id} as input to process : {differentiation_process_entity_id}") # Link the cell line entity as input to the differentiation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + f"{self.BASE_URL}/biomaterials/{cell_line.id}/inputToProcesses", differentiation_process_entity_id, 'processes', access_token ) - print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") + print( + f"Linking Differentiated cell line Biomaterial: " + f"{differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"as derived by process : {differentiation_process_entity_id}") # Link the differentiated cell line entity as derived by the differentiation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + 
f"{self.BASE_URL}/biomaterials/{differentiated_or_undifferentiated_cell_line.id}" + f"/derivedByProcesses", differentiation_process_entity_id, 'processes', access_token ) return differentiation_process_entity_id except Exception as e: errors.append( - f"Failed to update relations between Cell line {cell_line_entity_id} and Differentiated cell line {differentiated_entity_id}") + f"Failed to update relations between Cell line {cell_line_biomaterial_id} " + f"and Differentiated cell line {differentiated_or_undifferentiated_cell_line_biomaterial_id}") raise SubmissionError(errors, e) def handle_library_preparation(self, @@ -637,8 +645,8 @@ def create_library_preparation_entity(self, def link_differentiated_and_library_preparation(self, access_token, - differentiated_entity_id, - library_preparation_entity_id, + differentiated_or_undifferentiated_cell_line, + library_preparation, dataset_id, submission_envelope_id, action, @@ -665,8 +673,12 @@ def link_differentiated_and_library_preparation(self, The ID of the library preparation process entity created. 
""" if action.lower() != 'modify': + differentiated_or_undifferentiated_cell_line_biomaterial_id = differentiated_or_undifferentiated_cell_line.biomaterial_id + library_preparation_biomaterial_id = library_preparation.biomaterial_id + try: - print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " + print(f"Differentiated cell line {differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"has library preparations, creating library " f"preparation process to link them") # Create a library preparation process entity @@ -678,22 +690,22 @@ def link_differentiated_and_library_preparation(self, ) print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " - f"preparation process") + f"Linking Differentiated Cell Line Biomaterial: {differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"as input to library preparation process") # Link the differentiated cell line entity as input to the library preparation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + f"{self.BASE_URL}/biomaterials/{differentiated_or_undifferentiated_cell_line.id}/inputToProcesses", library_preparation_process_entity_id, 'processes', access_token ) print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " - f"preparation process") + f"Linking Library Preparation Biomaterial: {library_preparation_biomaterial_id} " + f"as derived by library preparation process") # Link the library preparation entity as derived by the library preparation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + f"{self.BASE_URL}/biomaterials/{library_preparation.id}/derivedByProcesses", library_preparation_process_entity_id, 'processes', access_token ) @@ -701,8 +713,8 @@ def link_differentiated_and_library_preparation(self, except 
Exception as e: errors.append( f"Failed to update relations between Differentiated Cell line " - f"{differentiated_entity_id} and Library preparation" - f" {library_preparation_entity_id}") + f"{differentiated_or_undifferentiated_cell_line_biomaterial_id} and Library preparation" + f" {library_preparation_biomaterial_id}") raise SubmissionError(errors, e) def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, @@ -799,8 +811,8 @@ def create_sequencing_file_entity(self, access_token, dataset_id, library_prepar def link_library_preparation_and_sequencing_file(self, access_token, - library_preparation_entity_id, - sequencing_file_entity_id, + library_preparation, + sequencing_file, dataset_id, submission_envelope_id, action, @@ -827,9 +839,12 @@ def link_library_preparation_and_sequencing_file(self, The ID of the sequencing process entity created. """ if action.lower() != 'modify': + library_preparation_biomaterial_id = library_preparation.biomaterial_id + sequence_file_name = sequencing_file.file_name + try: - print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." - f"Creating sequencing process to link the sequencing file") + print(f"Library preparation {library_preparation_biomaterial_id} has " + f"generated sequencing files. 
Creating sequencing process to link the sequencing file") # Create a sequencing process entity sequencing_process_entity_id = self.create_process(access_token, @@ -838,20 +853,22 @@ def link_library_preparation_and_sequencing_file(self, submission_envelope_id) print( - f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") + f"Linking Library preparation Biomaterial: {library_preparation_biomaterial_id} " + f"as input to process: {sequencing_process_entity_id}") # Link the library preparation entity as input to the sequencing process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", + f"{self.BASE_URL}/biomaterials/{library_preparation.id}/inputToProcesses", sequencing_process_entity_id, 'processes', access_token ) print( - f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") + f"Linking Sequencing file: {sequence_file_name} as derived by process: " + f"{sequencing_process_entity_id}") # Link the sequencing file entity as derived by the sequencing process self.perform_hal_linkage( - f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", + f"{self.BASE_URL}/files/{sequencing_file.id}/derivedByProcesses", sequencing_process_entity_id, 'processes', access_token ) @@ -859,8 +876,8 @@ def link_library_preparation_and_sequencing_file(self, except Exception as e: errors.append( f"Failed to update relations between Library Preparation " - f"{library_preparation_entity_id} and Sequencing file" - f" {sequencing_file_entity_id}") + f"{library_preparation_biomaterial_id} and Sequencing file" + f" {sequence_file_name}") raise SubmissionError(errors, e) def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): @@ -881,8 +898,8 @@ def create_process(self, access_token, dataset_id, process_data, submission_enve def establish_links(self, 
cell_lines, cell_lines_df, - differentiated_cell_lines, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, library_preparations, library_preparations_df, sequencing_files, @@ -911,22 +928,22 @@ def establish_links(self, """ try: for cell_line in cell_lines: - for differentiated_cell_line in differentiated_cell_lines: - if cell_line.biomaterial_id == differentiated_cell_line.input_biomaterial_id: + for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: + if cell_line.biomaterial_id == differentiated_or_undifferentiated_cell_line.input_biomaterial_id: self.link_cell_line_and_differentiated_cell_line(access_token, - cell_line.id, - differentiated_cell_line.id, + cell_line, + differentiated_or_undifferentiated_cell_line, dataset_id, submission_envelope_id, action, errors) - for differentiated_cell_line in differentiated_cell_lines: + for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: for library_preparation in library_preparations: - if differentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: + if differentiated_or_undifferentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: self.link_differentiated_and_library_preparation( access_token, - differentiated_cell_line.id, - library_preparation.id, + differentiated_or_undifferentiated_cell_line, + library_preparation, dataset_id, submission_envelope_id, action, @@ -936,8 +953,8 @@ def establish_links(self, for sequencing_file in sequencing_files: if library_preparation.biomaterial_id == sequencing_file.library_preparation_id: self.link_library_preparation_and_sequencing_file(access_token, - library_preparation.id, - sequencing_file.id, + library_preparation, + sequencing_file, dataset_id, submission_envelope_id, action, @@ -955,7 +972,7 @@ def establish_links(self, # 
sequencing_files_df = None return ([cell_lines_df, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines_df, library_preparations_df, sequencing_files_df], message) @@ -1027,15 +1044,16 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ 'biomaterial': 'biomaterials', 'process': 'processes' } + hal_entity = entity_map.get(input_entity_type) if not hal_entity: return None - entity_create_url = post_to_provider_api(self.submission_envelope_create_url, hal_entity, data, + entity_create_url = post_to_provider_api(self.SUBMISSION_ENVELOPE_CREATE_URL, hal_entity, data, access_token) entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) - entity_id = get_id_from_url(entity_self_hal_link) + entity_id = get_entity_id_from_hal_link(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -1054,7 +1072,7 @@ def patch_entity(self, input_entity_type, id, data, access_token): if not hal_entity: return False - entity_patch_url = f"{self.base_url}/{hal_entity}/{id}" + entity_patch_url = f"{self.BASE_URL}/{hal_entity}/{id}" return self.patch_to_provider_api(entity_patch_url, data, access_token) def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token): @@ -1068,7 +1086,7 @@ def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token if not hal_entity: return False - put_url = f"{self.base_url}/datasets/{dataset_id}/{hal_entity}/{entity_id}" + put_url = f"{self.BASE_URL}/datasets/{dataset_id}/{hal_entity}/{entity_id}" return self.provider_api.put(put_url, access_token) def patch_to_provider_api(self, entity_patch_url, data, access_token): @@ -1105,9 +1123,9 @@ def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submi if not hal_entity: return None - entity_create_url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/{hal_entity}" + entity_create_url = 
f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/{hal_entity}" entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) - entity_id = get_id_from_url(entity_self_hal_link) + entity_id = get_entity_id_from_hal_link(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -1124,7 +1142,7 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): """ print(f"Linking dataset {dataset_id} to study {study_id}") - url = f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}" + url = f"{self.BASE_URL}/studies/{study_id}/datasets/{dataset_id}" self.provider_api.put(url, access_token) print(f"Dataset linked successfully to study: {study_id}") @@ -1140,7 +1158,7 @@ def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): """ print(f"Linking biomaterial {biomaterial_id} to dataset {dataset_id}") - url = f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" + url = f"{self.BASE_URL}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" self.provider_api.put(url, access_token) print(f"Biomaterial linked successfully to dataset: {dataset_id}") @@ -1156,7 +1174,7 @@ def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): """ print(f"Linking biomaterial {biomaterial_id} to process {process_id}") - url = f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses" + url = f"{self.BASE_URL}/biomaterials/{biomaterial_id}/inputToProcesses" self.perform_hal_linkage(url, process_id, 'processes', access_token) def delete_submission(self, submission_envelope_id, access_token, force_delete=False): @@ -1171,7 +1189,7 @@ def delete_submission(self, submission_envelope_id, access_token, force_delete=F Returns: bool: True if the deletion was successful, False otherwise. 
""" - url = f"{self.submission_envelope_base_url}/{submission_envelope_id}" + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}" headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {access_token}' @@ -1201,7 +1219,7 @@ def perform_hal_linkage(self, url, input_id, link_to, access_token): 'Authorization': f'Bearer {access_token}' } - response = requests.post(url, headers=headers, data=f"{self.base_url}/{link_to}/{input_id}") + response = requests.post(url, headers=headers, data=f"{self.BASE_URL}/{link_to}/{input_id}") if response.status_code != 200: raise Exception(f"Failed to link biomaterial to process {input_id}. " @@ -1210,7 +1228,7 @@ def perform_hal_linkage(self, url, input_id, link_to, access_token): print("Linkage successful") def create_child_biomaterial(self, cell_line_entity_id, body, access_token): - url = f"{self.base_url}/biomaterials/{cell_line_entity_id}/childBiomaterials" + url = f"{self.BASE_URL}/biomaterials/{cell_line_entity_id}/childBiomaterials" entity_id = post_to_provider_api_and_get_entity_id(url, body, access_token) return entity_id @@ -1226,10 +1244,10 @@ def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, acces access_token (str): Access token for authorization. """ if type == 'biomaterial': - url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/biomaterials/{entity_id}" + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/biomaterials/{entity_id}" self.provider_api.put(url, access_token) elif type == 'file': - url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/files/{entity_id}" + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/files/{entity_id}" self.provider_api.put(url, access_token) def delete_dataset(self, dataset, access_token): @@ -1240,7 +1258,7 @@ def delete_dataset(self, dataset, access_token): dataset (str): The ID of the dataset to delete. access_token (str): Access token for authorization. 
""" - fetched_dataset = self.provider_api.get(f"{self.base_url}/datasets/{dataset}", access_token) + fetched_dataset = self.provider_api.get(f"{self.BASE_URL}/datasets/{dataset}", access_token) print(f"Dataset fetched successfully: {dataset}") print(f"Initiating delete of {dataset}") @@ -1251,17 +1269,17 @@ def delete_dataset(self, dataset, access_token): print("Deleting Biomaterials:") for biomaterial in biomaterials: print(f"Deleting {biomaterial}") - self.provider_api.delete(f"{self.base_url}/biomaterials/{biomaterial}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/biomaterials/{biomaterial}", access_token) print("\nDeleting Processes:") for process in processes: print(f"Deleting {process}") - self.provider_api.delete(f"{self.base_url}/processes/{process}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/processes/{process}", access_token) print("\nDeleting Data Files:") for data_file in data_files: print(f"Deleting {data_file}") - self.provider_api.delete(f"{self.base_url}/files/{data_file}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/files/{data_file}", access_token) print(f"\nDeleting the dataset: {dataset}") - self.provider_api.delete(f"{self.base_url}/datasets/{dataset}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/datasets/{dataset}", access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 405d9e1..2ac893f 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -7,7 +7,7 @@ import pandas as pd from ait.commons.util.aws_client import Aws from ait.commons.util.command.list import CmdList -from ait.commons.util.command.submit import CmdSubmit, get_id_from_url, create_new_submission_envelope +from ait.commons.util.command.submit import CmdSubmit, get_entity_id_from_hal_link, create_new_submission_envelope from ait.commons.util.command.upload import CmdUpload from ait.commons.util.user_profile import 
get_profile from ait.commons.util.provider_api_util import APIProvider @@ -73,7 +73,7 @@ def _create_expression_alterations(submission_instance, class CmdSubmitFile: - BASE_URL = 'https://api.ingest.dev.archive.morphic.bio' + BASE_URL = 'http://localhost:8080' SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" @@ -98,7 +98,7 @@ def __init__(self, args): self.dataset = self._get_required_arg('dataset', ( "Dataset is mandatory to be registered before submitting dataset metadata. " "Please submit your study using the submit option, register your dataset using " - "the same option, and link your dataset to your study before proceeding with this submission." + "the submit option, and link your dataset to your study before proceeding with this submission." )) # Validate file argument only if action is not DELETE @@ -183,23 +183,39 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) cell_lines_df = parsed_data['cell_lines_df'] differentiated_cell_lines = parsed_data['differentiated_cell_lines'] differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] + undifferentiated_cell_lines = parsed_data['undifferentiated_cell_lines'] + undifferentiated_cell_lines_df = parsed_data['undifferentiated_cell_lines_df'] library_preparations = parsed_data['library_preparations'] library_preparations_df = parsed_data['library_preparations_df'] sequencing_files = parsed_data['sequencing_files'] sequencing_files_df = parsed_data['sequencing_files_df'] + differentiated = parsed_data['differentiated'] + cell_line_sheet_name = parsed_data['cell_line_sheet_name'] + + if differentiated: + differentiated_or_undifferentiated_cell_line_sheet_name = parsed_data[ + 'differentiated_cell_line_sheet_name'] + else: + differentiated_or_undifferentiated_cell_line_sheet_name = parsed_data[ + 'undifferentiated_cell_line_sheet_name'] # Initialize lists for created 
entities created_expression_alterations = [] created_cell_lines = [] - created_differentiated_cell_lines = [] + created_differentiated_or_undifferentiated_cell_lines = [] created_library_preparations = [] created_sequencing_files = [] if self._is_add_action(): self._create_submission_envelope() - parent_cell_line_id = self._handle_parent_cell_line(submission_instance, parent_cell_line_name) + parent_cell_line_id = self._handle_parent_cell_line(submission_instance, + parent_cell_line_name) created_expression_alterations = self._handle_expression_alterations( - submission_instance, expression_alterations, expression_alterations_df, parent_cell_line_id + submission_instance, + expression_alterations, + expression_alterations_df, + parent_cell_line_name, + parent_cell_line_id ) if cell_lines and cell_lines_df is not None: @@ -207,9 +223,14 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) submission_instance, cell_lines, cell_lines_df, created_expression_alterations) if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines = self._create_differentiated_cell_lines( + created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + if (undifferentiated_cell_lines and undifferentiated_cell_lines_df is not None + and not differentiated): + created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( + submission_instance, undifferentiated_cell_lines, undifferentiated_cell_lines_df) + if library_preparations and library_preparations_df is not None: created_library_preparations = self._create_library_preparations( submission_instance, library_preparations, library_preparations_df) @@ -221,17 +242,22 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) updated_dfs, message = 
self._establish_links(submission_instance, created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, + created_differentiated_or_undifferentiated_cell_lines, + differentiated_cell_lines_df if differentiated_cell_lines_df is not None else undifferentiated_cell_lines_df, created_library_preparations, library_preparations_df, created_sequencing_files, sequencing_files_df) if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) + self._save_and_upload_results(updated_dfs, + expression_alterations_df, + cell_line_sheet_name, + differentiated_or_undifferentiated_cell_line_sheet_name) else: - return self._delete_actions(self.submission_envelope_id, submission_instance, None) + return self._delete_actions(self.submission_envelope_id, + submission_instance, + None) except ValidationError as e: print(f"Validation Error: {e.errors}") # self._delete_actions(self.submission_envelope_id, submission_instance, e) @@ -248,39 +274,52 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) def _handle_parent_cell_line(self, submission_instance, parent_cell_line_name): """Handles the creation of a parent cell line.""" parent_cell_line_id = None + if parent_cell_line_name: print(f"Creating parental cell line with name {parent_cell_line_name}") parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + return parent_cell_line_id def _handle_expression_alterations(self, submission_instance, expression_alterations, expression_alterations_df, + parent_cell_line_name, parent_cell_line_id): """Handles the creation of expression alterations and links them to the parent cell line if needed.""" created_expression_alterations = [] + if expression_alterations and expression_alterations_df is not None: created_expression_alterations = 
self._submit_expression_alterations( submission_instance, expression_alterations, expression_alterations_df ) + if created_expression_alterations and parent_cell_line_id: self._link_parent_cell_line_expression_alteration( - submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + submission_instance, + self.access_token, + parent_cell_line_name, + parent_cell_line_id, + created_expression_alterations ) + return created_expression_alterations def _parse_spreadsheet(self, parser): try: # Determine the necessary sheet names tab_names = parser.list_sheets() + cell_line_sheet_name = next( (name for name in ["Cell line", "Clonal cell line"] if name in tab_names), None ) + differentiated_cell_line_sheet_name = next( (name for name in ["Differentiated cell line", "Differentiated product"] if name in tab_names), None ) + undifferentiated_cell_line_sheet_name = ( "Undifferentiated product" if "Undifferentiated product" in tab_names else None ) @@ -288,10 +327,16 @@ def _parse_spreadsheet(self, parser): undifferentiated_cell_lines = [] undifferentiated_cell_lines_df = None + differentiated_cell_lines = [] + differentiated_cell_lines_df = None + + differentiated = False + # Validate the presence of required sheets if not cell_line_sheet_name: self.validation_errors.append("Spreadsheet must contain a " "'Cell line' or 'Clonal cell line' sheet.") + if not (differentiated_cell_line_sheet_name or undifferentiated_cell_line_sheet_name): self.validation_errors.append( "Spreadsheet must contain a " @@ -303,12 +348,15 @@ def _parse_spreadsheet(self, parser): expression_alterations, expression_alterations_df = parser.get_expression_alterations( 'Expression alteration strategy', self.action, self.validation_errors ) + cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( cell_line_sheet_name, self.action, self.validation_errors ) - differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 
differentiated_cell_line_sheet_name, self.action, self.validation_errors - ) + + if differentiated_cell_line_sheet_name: + differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( + differentiated_cell_line_sheet_name, self.action, self.validation_errors + ) if undifferentiated_cell_line_sheet_name: undifferentiated_cell_lines, undifferentiated_cell_lines_df = parser.get_undifferentiated_cell_lines( @@ -318,25 +366,36 @@ def _parse_spreadsheet(self, parser): # Check for errors and merge data if differentiated_cell_lines and undifferentiated_cell_lines: self.validation_errors.append( - "A spreadsheet cannot contain rows in both differentiated and undifferentiated cell lines/products" + "A spreadsheet cannot contain rows in both differentiated and undifferentiated cell lines/ products" ) if differentiated_cell_lines: + differentiated = True merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) - if undifferentiated_cell_lines: + + if undifferentiated_cell_lines and not differentiated: merge_cell_line_and_differentiated_cell_line(cell_lines, undifferentiated_cell_lines, self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( 'Library preparation', self.action, self.validation_errors ) - merge_differentiated_cell_line_and_library_preparation( - differentiated_cell_lines, library_preparations, self.validation_errors - ) + + if differentiated_cell_lines: + merge_differentiated_cell_line_and_library_preparation( + differentiated_cell_lines, library_preparations, self.validation_errors + ) + + if undifferentiated_cell_lines and not differentiated: + merge_differentiated_cell_line_and_library_preparation( + undifferentiated_cell_lines, library_preparations, self.validation_errors + ) + sequencing_files, sequencing_files_df = parser.get_sequencing_files( 'Sequence file', self.action, self.validation_errors ) + 
merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) # Return the parsed data as a dictionary @@ -354,6 +413,10 @@ def _parse_spreadsheet(self, parser): "library_preparations_df": library_preparations_df, "sequencing_files": sequencing_files, "sequencing_files_df": sequencing_files_df, + "differentiated": differentiated, + "cell_line_sheet_name": cell_line_sheet_name, + "differentiated_cell_line_sheet_name": differentiated_cell_line_sheet_name, + "undifferentiated_cell_line_sheet_name": undifferentiated_cell_line_sheet_name } except Exception: self.validation_errors.append(f"Spreadsheet is invalid {self.file}") @@ -414,7 +477,8 @@ def _create_submission_envelope(self): self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token ) if status_code in (200, 201): - self.submission_envelope_id = get_id_from_url(submission_envelope_response['_links']['self']['href']) + self.submission_envelope_id = get_entity_id_from_hal_link( + submission_envelope_response['_links']['self']['href']) print(f"Submission envelope for this submission is: {self.submission_envelope_id}") else: raise SubmissionError(f"Failed to create submission envelope. 
Status code: {status_code}") @@ -505,8 +569,8 @@ def _establish_links(self, submission_instance, created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, created_library_preparations, library_preparations_df, created_sequencing_files, @@ -516,8 +580,8 @@ def _establish_links(self, updated_dfs, message = submission_instance.establish_links( created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, created_library_preparations, library_preparations_df, created_sequencing_files, @@ -531,15 +595,19 @@ def _establish_links(self, return updated_dfs, message - def _save_and_upload_results(self, updated_dfs, expression_alteration_df): + def _save_and_upload_results(self, + updated_dfs, + expression_alteration_df, + cell_line_sheet_name, + differentiated_or_undifferentiated_cell_line_sheet_name): """Save the updated dataframes and upload the results.""" current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') output_file = f"submission_result_{current_time}.xlsx" try: # List of updated DataFrames and corresponding sheet names dataframes = [ - (updated_dfs[0], 'Cell line'), - (updated_dfs[1], 'Differentiated cell line'), + (updated_dfs[0], cell_line_sheet_name), + (updated_dfs[1], differentiated_or_undifferentiated_cell_line_sheet_name), (updated_dfs[2], 'Library preparation'), (updated_dfs[3], 'Sequence file'), (expression_alteration_df, 'Expression alteration strategy') @@ -597,10 +665,11 @@ def _handle_modify_action_failure(self, error): def _link_parent_cell_line_expression_alteration(self, submission_instance, access_token, + parent_cell_line_name, parent_cell_line_id, created_expression_alterations): for expression_alteration in created_expression_alterations: - 
print(f"Linking parent cell line {parent_cell_line_id} " + print(f"Linking parent cell line {parent_cell_line_name} " f"as input to process of {expression_alteration.expression_alteration_id}") submission_instance.perform_hal_linkage( f"{self.BASE_URL}/biomaterials/{parent_cell_line_id}/inputToProcesses", From b4c1f97c5b342e4268476b2969f88cc0f5d5c550 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 3 Oct 2024 12:26:28 +0100 Subject: [PATCH 49/55] check if valid dataset is provided --- ait/commons/util/command/submit_file.py | 8 ++++++++ ait/commons/util/spreadsheet_util.py | 7 ++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 2ac893f..c50c0de 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -101,6 +101,14 @@ def __init__(self, args): "the submit option, and link your dataset to your study before proceeding with this submission." )) + if self.dataset: + try: + self.provider_api.get(f"{self.BASE_URL}/datasets/{self.dataset}", + self.access_token) + except Exception as e: + print(f"Dataset does not exist {self.dataset}") + sys.exit(1) + # Validate file argument only if action is not DELETE if self.action != 'DELETE': self.file = self._get_required_arg('file', "File is mandatory") diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index ff9fec2..3220385 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -590,15 +590,16 @@ def __init__(self, file_path): def list_sheets(self): """ - Retrieves the names of all sheets present in the Excel file. + Retrieves the names of all sheets present in the Excel file, + trimming any leading or trailing spaces. Returns: -------- list - A list of sheet names present in the Excel file. + A list of trimmed sheet names present in the Excel file. 
""" xls = pd.ExcelFile(self.file_path, engine='openpyxl') - return xls.sheet_names + return [sheet_name.strip() for sheet_name in xls.sheet_names] def input_file_to_data_frames(self, sheet_name, action): if action.upper() == 'MODIFY': From fed85787ffdb45a62523579b59d2480eafb78167 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 8 Oct 2024 09:52:30 +0100 Subject: [PATCH 50/55] upgrade version --- ait/commons/util/command/submit.py | 2 +- ait/commons/util/command/submit_file.py | 2 +- ait/commons/util/settings/morphic_util.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index ad17000..b29d00b 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -191,7 +191,7 @@ class CmdSubmit: transform(file): Transforms the input file to a JSON object. put_to_provider_api(url, access_token): Sends a PUT request to the provider API. """ - BASE_URL = 'http://localhost:8080' + BASE_URL = 'https://api.ingest.dev.archive.morphic.bio/' SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index c50c0de..f84c361 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -73,7 +73,7 @@ def _create_expression_alterations(submission_instance, class CmdSubmitFile: - BASE_URL = 'http://localhost:8080' + BASE_URL = 'https://api.ingest.dev.archive.morphic.bio/' SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 382b391..8d54814 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py 
@@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '1.0.1' +VERSION = '1.0.3' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From b84ccf9d283e0953d287312080681247a8a42879 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 18 Oct 2024 17:01:31 +0100 Subject: [PATCH 51/55] adapt as per v7 of spreadsheet --- ait/commons/util/command/submit.py | 21 +- ait/commons/util/command/submit_file.py | 14 +- ait/commons/util/settings/morphic_util.py | 2 +- ait/commons/util/spreadsheet_util.py | 349 +++++++++++----------- 4 files changed, 201 insertions(+), 185 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index b29d00b..9e3cccd 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -246,7 +246,7 @@ def handle_cell_line(self, if success: print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, - 'cell_line.biomaterial_core.biomaterial_id') + 'clonal_cell_line.label') return cell_line.id else: errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") @@ -259,7 +259,7 @@ def handle_cell_line(self, cell_line_entity_id = self.create_cell_line_entity(cell_line, expression_alterations, submission_envelope_id, dataset_id, access_token) update_dataframe(cell_lines_df, cell_line_entity_id, cell_line.biomaterial_id, - 'cell_line.biomaterial_core.biomaterial_id') + 'clonal_cell_line.label') return cell_line_entity_id except Exception as e: errors.append(f"Failed to create cell line: {cell_line.biomaterial_id}") @@ -352,7 +352,7 @@ def handle_differentiated_cell_line(self, update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, differentiated_cell_line.biomaterial_id, - 'differentiated_cell_line.biomaterial_core.biomaterial_id') + 'differentiated_product.label') return 
differentiated_cell_line.id else: errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " @@ -372,7 +372,7 @@ def handle_differentiated_cell_line(self, submission_envelope_id) update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, differentiated_cell_line.biomaterial_id, - 'differentiated_cell_line.biomaterial_core.biomaterial_id') + 'differentiated_product.label') return differentiated_cell_line_id except Exception as e: errors.append(f"Failed to create differentiated cell line: {differentiated_cell_line.biomaterial_id}") @@ -552,7 +552,7 @@ def handle_library_preparation(self, update_dataframe(library_preparations_df, library_preparation.id, library_preparation.biomaterial_id, - 'library_preparation.biomaterial_core.biomaterial_id') + 'library_preparation.label') return library_preparation.id else: errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " @@ -570,7 +570,7 @@ def handle_library_preparation(self, submission_envelope_id) update_dataframe(library_preparations_df, library_preparation_entity_id, library_preparation.biomaterial_id, - 'library_preparation.biomaterial_core.biomaterial_id') + 'library_preparation.label') return library_preparation_entity_id except Exception as e: @@ -743,7 +743,7 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, update_dataframe(sequencing_file_df, sequencing_file.id, sequencing_file.file_name, - 'sequence_file.file_core.file_name') + 'sequence_file.label') return sequencing_file.id else: errors.append( @@ -761,7 +761,7 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, submission_envelope_id) update_dataframe(sequencing_file_df, sequencing_file_entity_id, sequencing_file.file_name, - 'sequence_file.file_core.file_name') + 'sequence_file.label') return sequencing_file_entity_id except Exception as e: @@ -929,7 +929,7 @@ def establish_links(self, try: for cell_line 
in cell_lines: for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: - if cell_line.biomaterial_id == differentiated_or_undifferentiated_cell_line.input_biomaterial_id: + if cell_line.biomaterial_id == differentiated_or_undifferentiated_cell_line.cell_line_biomaterial_id: self.link_cell_line_and_differentiated_cell_line(access_token, cell_line, differentiated_or_undifferentiated_cell_line, @@ -939,7 +939,8 @@ def establish_links(self, errors) for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: for library_preparation in library_preparations: - if differentiated_or_undifferentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: + if (differentiated_or_undifferentiated_cell_line.biomaterial_id == + library_preparation.differentiated_biomaterial_id): self.link_differentiated_and_library_preparation( access_token, differentiated_or_undifferentiated_cell_line, diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index f84c361..4670db4 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -65,7 +65,7 @@ def _create_expression_alterations(submission_instance, .astype(object)) expression_alterations_df.loc[ expression_alterations_df[ - 'expression_alteration_id'] == expression_alteration.expression_alteration_id, + 'expression_alteration.label'] == expression_alteration.expression_alteration_id, expression_alterations_entity_id_column_name ] = expression_alteration_id @@ -354,7 +354,7 @@ def _parse_spreadsheet(self, parser): # Parse different sections of the spreadsheet expression_alterations, expression_alterations_df = parser.get_expression_alterations( - 'Expression alteration strategy', self.action, self.validation_errors + 'Expression alteration', self.action, self.validation_errors ) cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( 
@@ -426,7 +426,7 @@ def _parse_spreadsheet(self, parser): "differentiated_cell_line_sheet_name": differentiated_cell_line_sheet_name, "undifferentiated_cell_line_sheet_name": undifferentiated_cell_line_sheet_name } - except Exception: + except Exception as e: self.validation_errors.append(f"Spreadsheet is invalid {self.file}") return None @@ -443,7 +443,7 @@ def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): # Exit now if there are validation errors in the spreadsheet if self.validation_errors: raise ValidationError(self.validation_errors) - except ValidationError as e: + except ValidationError: # Check if the error is related to a missing sheet missing_sheet_errors = [msg for msg in self.validation_errors if "Missing sheet" in msg] @@ -451,13 +451,15 @@ def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): # Extract the sheet name(s) from the errors missing_sheets = ', '.join([msg.split("'")[1] for msg in missing_sheet_errors]) # Ask the user whether to proceed + """ user_response = input( f"A required sheet '{missing_sheets}' is missing. Do you want to proceed anyway? 
(yes/no): ").strip().lower() if user_response == 'yes': print("Proceeding with execution...") else: - print("Execution terminated due to missing required sheet.") - sys.exit(1) + """ + print("Execution terminated due to missing required sheet.") + sys.exit(1) else: # Print the error message # print(f"Validation Error: {e.errors}") diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 8d54814..d75163b 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '1.0.3' +VERSION = '1.0.4' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 3220385..3b0a05e 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -3,6 +3,7 @@ import pandas as pd import json import numpy as np +import json """ class MissingMandatoryFieldError(Exception): @@ -67,20 +68,24 @@ class CellLine: def __init__(self, biomaterial_id, description, - derived_from_accession, + parental_cell_line_name, clone_id, protocol_id, zygosity, cell_type, + treatment_condition, + wt_control_status, expression_alteration_id, id): self.biomaterial_id = biomaterial_id self.description = description - self.derived_from_accession = derived_from_accession + self.parental_cell_line_name = parental_cell_line_name self.clone_id = clone_id self.protocol_id = protocol_id self.zygosity = zygosity self.cell_type = cell_type + self.treatment_condition = treatment_condition + self.wt_control_status = wt_control_status self.differentiated_cell_lines = [] self.expression_alteration_id = expression_alteration_id self.id = id @@ -93,22 +98,28 @@ def __repr__(self): def to_dict(self): content = { - "label": self.biomaterial_id, - "description": self.description, - 
"derived_from_cell_line": self.derived_from_accession, - "zygosity": self.zygosity, - "type": self.cell_type + "label": self.biomaterial_id, # matches 'label' in schema + "description": self.description, # matches 'description' in schema + "zygosity": self.zygosity, # matches 'zygosity' in schema + "type": self.cell_type, # matches 'type' in schema + "parental_cell_line_name": self.parental_cell_line_name # matches 'parental_cell_line_name' in schema } - # Only add optional/custom fields if they are provided + # Optional fields - add them only if they are provided if self.clone_id: - content["clone_id"] = self.clone_id # Not in schema, custom field + content["clone_id"] = self.clone_id # matches 'clone_id' in schema if self.protocol_id: - content["protocol_id"] = self.protocol_id # Not in schema, custom field + content[ + "cell_line_generation_protocol"] = self.protocol_id # matches 'cell_line_generation_protocol' in schema + + if self.treatment_condition: + content[ + "treatment_condition"] = self.treatment_condition # matches 'cell_line_generation_protocol' in schema - if self.expression_alteration_id: - content["expression_alteration_id"] = self.expression_alteration_id # Not in schema, custom field + if self.wt_control_status: + content[ + "wt_control_status"] = self.wt_control_status # matches 'cell_line_generation_protocol' in schema return { "content": content @@ -118,28 +129,28 @@ def to_dict(self): class ExpressionAlterationStrategy: def __init__(self, expression_alteration_id, - protocol_id, + parent_protocol_id, allele_specific, - altered_gene_symbols, - altered_gene_ids, + altered_gene_symbol, + target_gene_hgnc_id, targeted_genomic_region, expected_alteration_type, - sgrna_target, - protocol_method_text, + editing_strategy, altered_locus, guide_sequence, + method, id): self.expression_alteration_id = expression_alteration_id - self.protocol_id = protocol_id + self.parent_protocol_id = parent_protocol_id self.allele_specific = allele_specific - 
self.altered_gene_symbols = altered_gene_symbols - self.altered_gene_ids = altered_gene_ids + self.altered_gene_symbol = altered_gene_symbol + self.target_gene_hgnc_id = target_gene_hgnc_id self.targeted_genomic_region = targeted_genomic_region self.expected_alteration_type = expected_alteration_type - self.sgrna_target = sgrna_target - self.protocol_method_text = protocol_method_text + self.editing_strategy = editing_strategy self.altered_locus = altered_locus self.guide_sequence = guide_sequence + self.method = method self.id = id def __repr__(self): @@ -148,43 +159,50 @@ def __repr__(self): def to_dict(self): return { "content": { - "expression_alteration_label": self.expression_alteration_id, - "protocol_id": self.protocol_id, - "allele_specific": self.allele_specific, - "altered_gene_symbols": self.altered_gene_symbols, - "altered_gene_ids": self.altered_gene_ids, - "targeted_genomic_region": self.targeted_genomic_region, - "expected_alteration_type": self.expected_alteration_type, - "sgrna_target": self.sgrna_target, - "protocol_method_text": self.protocol_method_text, - "altered_locus": self.altered_locus, - "guide_sequence": self.guide_sequence, - "id": self.id + "expression_alteration_id": self.expression_alteration_id, + "parent_protocol_id": self.parent_protocol_id, + "genes": [ + { + "allele_specific": self.allele_specific, + "altered_gene_symbol": self.altered_gene_symbol, + "target_gene_hgnc_id": self.target_gene_hgnc_id, + "targeted_genomic_region": self.targeted_genomic_region, + "expected_alteration_type": self.expected_alteration_type, + "editing_strategy": self.editing_strategy, + "altered_locus": self.altered_locus, + "guide_sequence": self.guide_sequence + } + ], + "method": self.method, } } class DifferentiatedCellLine: def __init__(self, - biomaterial_id, + biomaterial_id, # Maps to 'label' description, - input_biomaterial_id, - protocol_id, - timepoint_value, - timepoint_unit, + cell_line_biomaterial_id, # Maps to 'clonal_cell_line_label' + 
differentiated_product_protocol_id, terminally_differentiated, model_system, - id): - self.biomaterial_id = biomaterial_id + timepoint_value, + timepoint_unit, + treatment_condition=None, # New field as per schema + wt_control_status=None, # New field as per schema + id=None): # Optional, custom field + self.biomaterial_id = biomaterial_id # This maps to 'label' in the schema self.description = description - self.input_biomaterial_id = input_biomaterial_id - self.protocol_id = protocol_id - self.timepoint_value = timepoint_value - self.timepoint_unit = timepoint_unit + self.cell_line_biomaterial_id = cell_line_biomaterial_id # Maps to 'clonal_cell_line_label' + self.differentiated_product_protocol_id = differentiated_product_protocol_id self.terminally_differentiated = terminally_differentiated self.model_system = model_system + self.timepoint_value = timepoint_value + self.timepoint_unit = timepoint_unit + self.treatment_condition = treatment_condition # Added to match schema + self.wt_control_status = wt_control_status # Added to match schema self.library_preparations = [] - self.id = id + self.id = id # Custom field not in the schema def add_library_preparation(self, library_preparation): self.library_preparations.append(library_preparation) @@ -196,18 +214,20 @@ def to_dict(self): content = { "label": self.biomaterial_id, "description": self.description, + "clonal_cell_line_id": self.cell_line_biomaterial_id, + "differentiated_product_protocol_id": self.differentiated_product_protocol_id, + "terminally_differentiated": self.terminally_differentiated, + "model_system": self.model_system, "timepoint_value": self.timepoint_value, "timepoint_unit": self.timepoint_unit, - "terminally_differentiated": self.terminally_differentiated, - "model_system": self.model_system } - # Only add optional/custom fields if they are provided - if self.input_biomaterial_id: - content["input_biomaterial_id"] = self.input_biomaterial_id # Not in schema, custom field + # Add optional 
fields only if they are provided + if self.treatment_condition: + content["treatment_condition"] = self.treatment_condition - if self.protocol_id: - content["protocol_id"] = self.protocol_id # Not in schema, custom field + if self.wt_control_status: + content["wt_control_status"] = self.wt_control_status return { "content": content @@ -218,7 +238,6 @@ class LibraryPreparation: def __init__(self, biomaterial_id, protocol_id, - dissociation_protocol_id, differentiated_biomaterial_id, average_fragment_size, input_amount_value, @@ -232,7 +251,6 @@ def __init__(self, id): self.biomaterial_id = biomaterial_id self.protocol_id = protocol_id - self.dissociation_protocol_id = dissociation_protocol_id self.differentiated_biomaterial_id = differentiated_biomaterial_id self.average_fragment_size = average_fragment_size self.input_amount_value = input_amount_value @@ -253,7 +271,7 @@ def __repr__(self): return json.dumps(self.to_dict(), indent=2) def to_dict(self): - # Helper function to handle invalid JSON values + # Helper function to handle invalid JSON values (e.g., NaN, infinite) def convert_to_valid_json_value(value): if isinstance(value, float) and (np.isnan(value) or not np.isfinite(value)): return None @@ -261,6 +279,7 @@ def convert_to_valid_json_value(value): content = { "label": self.biomaterial_id, + "library_preparation_protocol_id": self.protocol_id, "average_fragment_size": convert_to_valid_json_value(self.average_fragment_size), "input_amount_value": convert_to_valid_json_value(self.input_amount_value), "input_amount_unit": self.input_amount_unit, @@ -273,12 +292,8 @@ def convert_to_valid_json_value(value): } # Add optional/custom fields if they are provided - if self.protocol_id: - content["protocol_id"] = self.protocol_id # Not in schema, custom field - if self.dissociation_protocol_id: - content["dissociation_protocol_id"] = self.dissociation_protocol_id # Not in schema, custom field if self.differentiated_biomaterial_id: - 
content["differentiated_biomaterial_id"] = self.differentiated_biomaterial_id # Not in schema, custom field + content["differentiated_biomaterial_id"] = self.differentiated_biomaterial_id return { "content": content @@ -506,7 +521,7 @@ def merge_cell_line_and_differentiated_cell_line(cell_lines, source_entities=cell_lines, target_entities=differentiated_cell_lines, source_attr="biomaterial_id", - target_attr="input_biomaterial_id", + target_attr="cell_line_biomaterial_id", source_type="Cell line", target_type="Differentiated Cell line", errors=errors @@ -516,15 +531,15 @@ def merge_cell_line_and_differentiated_cell_line(cell_lines, cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} for differentiated_cell_line in differentiated_cell_lines: - if differentiated_cell_line.input_biomaterial_id not in cell_line_ids: + if differentiated_cell_line.cell_line_biomaterial_id not in cell_line_ids: missing_parent_entity_error.add_error("Cell Line", "Differentiated Cell line", - differentiated_cell_line.biomaterial_id, + differentiated_cell_line.label, errors) for cell_line in cell_lines: for differentiated_cell_line in differentiated_cell_lines: - if differentiated_cell_line.input_biomaterial_id == cell_line.biomaterial_id: + if differentiated_cell_line.cell_line_biomaterial_id == cell_line.biomaterial_id: cell_line.add_differentiated_cell_line(differentiated_cell_line) @@ -648,18 +663,18 @@ def parse_cell_lines(self, parent_cell_line_names = [] # Check if the required column exists - if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: + if 'clonal_cell_line.label' not in df.columns: errors.append( - f"The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in the {sheet_name} sheet. " + f"The column 'clonal_cell_line.label' does not exist in the {sheet_name} sheet. 
" f"The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null - df = df[df['cell_line.biomaterial_core.biomaterial_id'].notna()] + df = df[df['clonal_cell_line.label'].notna()] # Replace invalid float values with None df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for invalid starting values - cols_to_check = ['cell_line.biomaterial_core.biomaterial_id'] + cols_to_check = ['clonal_cell_line.label'] invalid_start_values = ( 'FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', 'cell_line.biomaterial_core.biomaterial_id' @@ -668,7 +683,7 @@ def parse_cell_lines(self, mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith(invalid_start_values)).all(axis=1) df_filtered = df[mask] # Check for a unique value in 'cell_line.derived_cell_line_accession' - derived_col = 'cell_line.derived_cell_line_accession' + derived_col = 'clonal_cell_line.parental_cell_line_name' if derived_col in df_filtered.columns: parent_cell_line_names = df_filtered[derived_col].dropna().unique() @@ -683,30 +698,32 @@ def parse_cell_lines(self, cell_lines = [] for _, row in df_filtered.iterrows(): - biomaterial_id = row['cell_line.biomaterial_core.biomaterial_id'] - derived_from_accession = row.get('cell_line.derived_cell_line_accession') - cell_type = row.get('cell_line.type') - expression_alteration_id = row.get('expression_alteration_id') + label = row['clonal_cell_line.label'] + parental_cell_line_name = row.get('clonal_cell_line.parental_cell_line_name') + cell_type = row.get('clonal_cell_line.type') + expression_alteration_id = row.get('expression_alteration.label') # Error handling for missing mandatory fields - if pd.isnull(biomaterial_id): - errors.append("Biomaterial ID cannot be null in any row of the Cell line sheet.") + if pd.isnull(label): + errors.append("Biomaterial ID cannot be null in any row of the Cell line/ Clonal cell 
line sheet.") - if any(pd.isnull(field) for field in [derived_from_accession, cell_type]): + if any(pd.isnull(field) for field in [parental_cell_line_name, cell_type]): errors.append( - f"Mandatory fields (derived_accession, cell_type, expression_alteration_id) are required for Cell " - f"line entity: {biomaterial_id}") + f"Mandatory fields (parental_cell_line_name, clonal_cell_line.type, expression_alteration.label) are required for Cell " + f"line/ Clonal cell line entity: {label}") cell_lines.append( CellLine( - biomaterial_id=biomaterial_id, - description=row.get('cell_line.biomaterial_core.biomaterial_description'), - derived_from_accession=derived_from_accession, - clone_id=row.get('cell_line.clone_id'), - protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), - zygosity=row.get('cell_line.zygosity'), + biomaterial_id=label, + description=row.get('clonal_cell_line.description'), + parental_cell_line_name=parental_cell_line_name, + clone_id=row.get('clonal_cell_line.clone_id'), + protocol_id=row.get('clonal_cell_line.cell_line_generation_protocol'), + zygosity=row.get('clonal_cell_line.zygosity'), cell_type=cell_type, expression_alteration_id=expression_alteration_id, + wt_control_status=row.get('clonal_cell_line.wt_control_status'), + treatment_condition=row.get('clonal_cell_line.treatment_condition'), id=row.get('Id') ) ) @@ -739,16 +756,16 @@ def parse_differentiated_cell_lines(self, # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists - if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: - errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " + if 'differentiated_product.label' not in df.columns: + errors.append(f"The column 'differentiated_product.label' does not " f"exist in {sheet_name} name. 
The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null - df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] + df = df[df['differentiated_product.label'].notna()] df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' - cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] + cols_to_check = ['differentiated_product.label'] # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', @@ -759,19 +776,19 @@ def parse_differentiated_cell_lines(self, differentiated_cell_lines = [] for _, row in df_filtered.iterrows(): - differentiated_biomaterial_id = row['differentiated_cell_line.biomaterial_core.biomaterial_id'] - biomaterial_id = row.get('cell_line.biomaterial_core.biomaterial_id') + label = row['differentiated_product.label'] + parent_biomaterial_id = row.get('clonal_cell_line.label') # Check if biomaterial_id is null - if pd.isnull(differentiated_biomaterial_id): + if pd.isnull(label): errors.append("Differentiated Cell line ID cannot be null in any row of the Differentiated Cell line " "sheet.") # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") # Check if derived_accession and cell_type are present - if pd.isnull(biomaterial_id): + if pd.isnull(parent_biomaterial_id): errors.append(f"Input Cell line ID cannot be null for Differentiated Cell line: " - f"{differentiated_biomaterial_id}") + f"{label}") """ raise MissingMandatoryFieldError( "Input Cell line ID cannot be null. 
" + differentiated_biomaterial_id) @@ -780,20 +797,24 @@ def parse_differentiated_cell_lines(self, # Create DifferentiatedCellLine objects from filtered DataFrame rows differentiated_cell_lines.append( DifferentiatedCellLine( - biomaterial_id=differentiated_biomaterial_id, - description=row.get('differentiated_cell_line.biomaterial_core.biomaterial_description'), - input_biomaterial_id=biomaterial_id, - protocol_id=row.get('differentiation_protocol.protocol_core.protocol_id'), - timepoint_value=row.get('differentiated_cell_line.timepoint_value'), - timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), - terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), - model_system=row.get('differentiated_cell_line.model_organ.text'), + biomaterial_id=label, + description=row.get('differentiated_product.biomaterial_core.biomaterial_description'), + cell_line_biomaterial_id=parent_biomaterial_id, + differentiated_product_protocol_id=row.get( + 'differentiated_product.differentiated_product_protocol_id'), + treatment_condition=row.get('differentiated_product.treatment_condition'), + wt_control_status=row.get('differentiated_product.wt_control_status'), + timepoint_value=row.get('differentiated_product.timepoint_value'), + timepoint_unit=row.get('differentiated_product.timepoint_unit'), + terminally_differentiated=row.get('differentiated_product.terminally_differentiated'), + model_system=row.get('differentiated_product.model_system'), id=row.get('Id') ) ) return differentiated_cell_lines, df_filtered + # TODO: review def parse_undifferentiated_cell_lines(self, sheet_name, action, @@ -820,16 +841,16 @@ def parse_undifferentiated_cell_lines(self, # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists - if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: - errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " - f"exist 
in {sheet_name}. The rest of the file will not be processed") + if 'differentiated_product.label' not in df.columns: + errors.append(f"The column 'differentiated_product.label' does not " + f"exist in {sheet_name} name. The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null - df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] + df = df[df['differentiated_product.label'].notna()] df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' - cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] + cols_to_check = ['differentiated_product.label'] # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', @@ -840,19 +861,19 @@ def parse_undifferentiated_cell_lines(self, undifferentiated_cell_lines = [] for _, row in df_filtered.iterrows(): - differentiated_biomaterial_id = row['differentiated_cell_line.biomaterial_core.biomaterial_id'] - biomaterial_id = row.get('cell_line.biomaterial_core.biomaterial_id') + label = row['differentiated_product.label'] + parent_biomaterial_id = row.get('differentiated_product.differentiated_product_protocol_id') # Check if biomaterial_id is null - if pd.isnull(differentiated_biomaterial_id): + if pd.isnull(label): errors.append("Differentiated Cell line ID cannot be null in any row of the Differentiated Cell line " "sheet.") # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") # Check if derived_accession and cell_type are present - if pd.isnull(biomaterial_id): + if pd.isnull(parent_biomaterial_id): errors.append(f"Input Cell line ID cannot be null for Differentiated Cell line: " - 
f"{differentiated_biomaterial_id}") + f"{label}") """ raise MissingMandatoryFieldError( "Input Cell line ID cannot be null. " + differentiated_biomaterial_id) @@ -861,14 +882,17 @@ def parse_undifferentiated_cell_lines(self, # Create DifferentiatedCellLine objects from filtered DataFrame rows undifferentiated_cell_lines.append( DifferentiatedCellLine( - biomaterial_id=differentiated_biomaterial_id, - description=row.get('differentiated_cell_line.biomaterial_core.biomaterial_description'), - input_biomaterial_id=biomaterial_id, - protocol_id=row.get('differentiation_protocol.protocol_core.protocol_id'), - timepoint_value=row.get('differentiated_cell_line.timepoint_value'), - timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), - terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), - model_system=row.get('differentiated_cell_line.model_organ.text'), + biomaterial_id=label, + description=row.get('differentiated_product.biomaterial_core.biomaterial_description'), + cell_line_biomaterial_id=parent_biomaterial_id, + differentiated_product_protocol_id=row.get( + 'differentiated_product.differentiated_product_protocol_id'), + treatment_condition=row.get('differentiated_product.treatment_condition'), + wt_control_status=row.get('differentiated_product.wt_control_status'), + timepoint_value=row.get('differentiated_product.timepoint_value'), + timepoint_unit=row.get('differentiated_product.timepoint_unit'), + terminally_differentiated=row.get('differentiated_product.terminally_differentiated'), + model_system=row.get('differentiated_product.model_system'), id=row.get('Id') ) ) @@ -899,10 +923,9 @@ def parse_library_preparations(self, # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists required_columns = [ - 'library_preparation.biomaterial_core.biomaterial_id', - 'dissociation_protocol.protocol_core.protocol_id', - 'differentiated_cell_line.biomaterial_core.biomaterial_id', - 
'library_preparation_protocol.protocol_core.protocol_id' + 'library_preparation.label', + 'differentiated_product.label', + 'library_preparation.library_preparation_protocol_id' ] for col in required_columns: @@ -913,10 +936,10 @@ def parse_library_preparations(self, return [], df # Filter rows where biomaterial_id is not null - df = df[df['library_preparation.biomaterial_core.biomaterial_id'].notna()] + df = df[df['library_preparation.label'].notna()] df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' - cols_to_check = ['library_preparation.biomaterial_core.biomaterial_id'] + cols_to_check = ['library_preparation.label'] # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', @@ -927,19 +950,15 @@ def parse_library_preparations(self, library_preparations = [] for _, row in df_filtered.iterrows(): - library_preparation_id = row['library_preparation.biomaterial_core.biomaterial_id'] - dissociation_protocol_id = row.get('dissociation_protocol.protocol_core.protocol_id') - differentiated_biomaterial_id = row.get('differentiated_cell_line.biomaterial_core.biomaterial_id') - library_preparation_protocol_id = row.get('library_preparation_protocol.protocol_core.protocol_id') + label = row['library_preparation.label'] + differentiated_biomaterial_label = row.get('differentiated_product.label') + library_preparation_protocol_id = row.get('library_preparation.library_preparation_protocol_id') # Check if required fields are null - if pd.isnull(library_preparation_id): + if pd.isnull(label): errors.append("Library Preparation ID cannot be null in any row of the Library Preparation sheet.") # raise MissingMandatoryFieldError("Library Preparation ID cannot be null in any row.") - if 
pd.isnull(dissociation_protocol_id): - errors.append("Dissociation Protocol ID cannot be null in any row of the Library Preparation sheet.") - # raise MissingMandatoryFieldError("Dissociation Protocol ID cannot be null in any row.") - if pd.isnull(differentiated_biomaterial_id): + if pd.isnull(differentiated_biomaterial_label): errors.append("Differentiated Cell Line ID cannot be null in any row of the Library Preparation sheet.") # raise MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null in any row.") if pd.isnull(library_preparation_protocol_id): @@ -950,10 +969,9 @@ def parse_library_preparations(self, # Create LibraryPreparation objects from filtered DataFrame rows library_preparations.append( LibraryPreparation( - biomaterial_id=library_preparation_id, + biomaterial_id=label, protocol_id=library_preparation_protocol_id, - dissociation_protocol_id=dissociation_protocol_id, - differentiated_biomaterial_id=differentiated_biomaterial_id, + differentiated_biomaterial_id=differentiated_biomaterial_label, average_fragment_size=row.get('library_preparation.average_fragment_size'), input_amount_value=row.get('library_preparation.input_amount_value'), input_amount_unit=row.get('library_preparation.input_amount_unit'), @@ -995,9 +1013,9 @@ def parse_sequencing_files(self, # Check if the required column exists required_columns = [ - 'sequence_file.file_core.file_name', - 'library_preparation.biomaterial_core.biomaterial_id', - 'sequencing_protocol.protocol_core.protocol_id', + 'sequence_file.label', + 'library_preparation.label', + 'sequence_file.extension', 'sequence_file.read_index' ] @@ -1009,15 +1027,15 @@ def parse_sequencing_files(self, return [], df # Filter rows where file_name is not null - df = df[df['sequence_file.file_core.file_name'].notna()] + df = df[df['sequence_file.label'].notna()] df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' 
or 'XYZ' - cols_to_check = ['sequence_file.file_core.file_name'] + cols_to_check = ['sequence_file.label'] # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'The name of the file.', 'Include the file extension in the file name. For example: R1.fastq.gz; codebook.json', - 'sequence_file.file_core.file_name'))).all(axis=1) + 'sequence_file.label'))).all(axis=1) # Apply the mask to filter out rows df_filtered = df[mask] @@ -1025,9 +1043,8 @@ def parse_sequencing_files(self, sequencing_files = [] for _, row in df_filtered.iterrows(): - file_name = row['sequence_file.file_core.file_name'] - library_preparation_id = row.get('library_preparation.biomaterial_core.biomaterial_id') - sequencing_protocol_id = row.get('sequencing_protocol.protocol_core.protocol_id') + file_name = row['sequence_file.label'] + library_preparation_id = row.get('library_preparation.label') read_index = row.get('sequence_file.read_index') # Check if required fields are null @@ -1037,9 +1054,6 @@ def parse_sequencing_files(self, if pd.isnull(library_preparation_id): errors.append("Library Preparation ID cannot be null in any row of the Sequencing File sheet..") # raise MissingMandatoryFieldError("Library Preparation ID cannot be null in any row.") - if pd.isnull(sequencing_protocol_id): - errors.append("Sequencing Protocol ID cannot be null in any row of the Sequencing File sheet..") - # raise MissingMandatoryFieldError("Sequencing Protocol ID cannot be null in any row.") if pd.isnull(read_index): errors.append("Read Index cannot be null in any row of the Sequencing File sheet..") # raise MissingMandatoryFieldError("Read Index cannot be null in any row.") @@ -1054,7 +1068,6 @@ def parse_sequencing_files(self, read_length=None, checksum=None, library_preparation_id=library_preparation_id, - sequencing_protocol_id=sequencing_protocol_id, 
run_id=row.get('sequence_file.run_id'), id=row.get('Id') ) @@ -1091,22 +1104,22 @@ def parse_expression_alteration(self, df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) except Exception as e: errors.append(f"Missing sheet '{sheet_name}': {e}") - return [], None + return [], None, False # Strip whitespace from column names df.columns = df.columns.str.strip() # Check if the required column exists - required_columns = ['expression_alteration_id'] + required_columns = ['expression_alteration.label'] missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: errors.append( f"The following required columns are missing in the Expression Alteration Strategy sheet: {', '.join(missing_columns)}") - return None, df, False # Return if required columns are missing + return [], df, False # Return if required columns are missing - # Filter rows where 'expression_alteration_id' is not null - df = df[df['expression_alteration_id'].notna()] + # Filter rows where 'expression_alteration.label' is not null + df = df[df['expression_alteration.label'].notna()] # Replace invalid float values (e.g., NaN, infinite) with None df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) @@ -1118,7 +1131,7 @@ def parse_expression_alteration(self, ) # Create a mask to filter out rows with unwanted starting values - mask = df['expression_alteration_id'].astype(str).str.startswith(unwanted_patterns) + mask = df['expression_alteration.label'].astype(str).str.startswith(unwanted_patterns) df_filtered = df[~mask] # Initialize the list of ExpressionAlterationStrategy objects @@ -1127,17 +1140,17 @@ def parse_expression_alteration(self, for _, row in df_filtered.iterrows(): expression_alterations.append( ExpressionAlterationStrategy( - expression_alteration_id=row.get('expression_alteration_id'), - protocol_id=row.get('gene_expression_alteration_protocol.protocol_core.protocol_id'), - 
allele_specific=row.get('gene_expression_alteration_protocol.allele_specific'), - altered_gene_symbols=row.get('gene_expression_alteration_protocol.altered_gene_symbols'), - altered_gene_ids=row.get('gene_expression_alteration_protocol.altered_gene_ids'), - targeted_genomic_region=row.get('gene_expression_alteration_protocol.targeted_genomic_region'), - expected_alteration_type=row.get('gene_expression_alteration_protocol.expected_alteration_type'), - sgrna_target=row.get('gene_expression_alteration_protocol.crispr.sgrna_target'), - protocol_method_text=row.get('gene_expression_alteration_protocol.method.text'), - altered_locus=None, # Placeholder if required - guide_sequence=None, # Placeholder if required + expression_alteration_id=row.get('expression_alteration.label'), + parent_protocol_id=row.get('expression_alteration.parent_protocol_id'), + allele_specific=row.get('expression_alteration.genes.allele_specific'), + altered_gene_symbol=row.get('expression_alteration.genes.altered_gene_symbol'), + target_gene_hgnc_id=row.get('expression_alteration.genes.target_gene_hgnc_id'), + targeted_genomic_region=row.get('expression_alteration.genes.targeted_genomic_region'), + expected_alteration_type=row.get('expression_alteration.genes.expected_alteration_type'), + editing_strategy=row.get('expression_alteration.genes.editing_strategy'), + altered_locus=row.get('expression_alteration.genes.altered_locus'), # No longer a placeholder + guide_sequence=row.get('expression_alteration.genes.guide_sequence'), # No longer a placeholder + method=row.get('expression_alteration.method'), id=row.get('Id') ) ) From 18b808ab41702520e25cc7f0843fdedce682544f Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 8 Nov 2024 16:34:12 +0000 Subject: [PATCH 52/55] prod --- ait/commons/util/command/submit.py | 2 +- ait/commons/util/command/submit_file.py | 8 ++++---- ait/commons/util/settings/morphic_util.py | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git 
a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 9e3cccd..8dc7130 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -191,7 +191,7 @@ class CmdSubmit: transform(file): Transforms the input file to a JSON object. put_to_provider_api(url, access_token): Sends a PUT request to the provider API. """ - BASE_URL = 'https://api.ingest.dev.archive.morphic.bio/' + BASE_URL = 'https://api.ingest.archive.morphic.bio/' SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 4670db4..90c129f 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -73,7 +73,7 @@ def _create_expression_alterations(submission_instance, class CmdSubmitFile: - BASE_URL = 'https://api.ingest.dev.archive.morphic.bio/' + BASE_URL = 'https://api.ingest.archive.morphic.bio/' SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" @@ -427,18 +427,18 @@ def _parse_spreadsheet(self, parser): "undifferentiated_cell_line_sheet_name": undifferentiated_cell_line_sheet_name } except Exception as e: + print(f"Exception occurred:", e) + self.validation_errors.append(f"Spreadsheet is invalid {self.file}") return None def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): - """ # Validate the parsed data and upload the file. validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, self.validation_errors) - """ """ Handle validation errors, including interacting with the user in case of a missing sheet. 
- """ + """ try: # Exit now if there are validation errors in the spreadsheet if self.validation_errors: diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index d75163b..0683ef4 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '1.0.4' +VERSION = '1.0.5-PROD' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' @@ -32,11 +32,11 @@ LOCAL_STATE_FILE = USER_HOME + '/.hca-util' # Cognito and IAM -COGNITO_MORPHIC_UTIL_ADMIN = 'morphic-dev-admin' -COGNITO_CLIENT_ID = '1rfis94rvnden5elmocospd256' -COGNITO_IDENTITY_POOL_ID = 'eu-west-2:d6531e9c-020d-4ee8-bf3b-255393c500e9' -COGNITO_USER_POOL_ID = 'eu-west-2_Aqtqtg7u7' -IAM_USER = 'morphic-dev-admin' +COGNITO_MORPHIC_UTIL_ADMIN = 'morphic-admin' +COGNITO_CLIENT_ID = '6poq2i04qt3pj5rkpg51patcrk' +COGNITO_IDENTITY_POOL_ID = 'eu-west-2:87ba188b-51fc-42e0-9172-a1a01cda8ed0' +COGNITO_USER_POOL_ID = 'eu-west-2_2BpGQDRSU' +IAM_USER = 'morphic-admin' AWS_SECRET_NAME_AK_BUCKET = 'AK-bucket' AWS_SECRET_NAME_SK_BUCKET = 'SK-bucket' From 010f7d019be7b0da18ff97490706d34eec01cb0a Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 2 Dec 2024 14:28:30 +0000 Subject: [PATCH 53/55] md5 sums computation while listing files --- ait/commons/util/__main__.py | 6 +- ait/commons/util/command/list.py | 96 ++++++++++++++++++++++--- ait/commons/util/command/submit.py | 4 +- ait/commons/util/command/submit_file.py | 4 +- ait/commons/util/command/view.py | 4 +- ait/commons/util/provider_api_util.py | 2 +- 6 files changed, 98 insertions(+), 18 deletions(-) diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index 987197b..efa2d21 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -113,7 +113,8 @@ def parse_args(args): # parser_clear.add_argument('-a', 
action='store_true', help='clear all - selection and known dirs') parser_list = cmd_parser.add_parser('list', help='list contents of the area') - parser_list.add_argument('-b', action='store_true', help='list all areas in the S3 bucket (authorised users only)') + parser_list.add_argument('-processing', action='store_true', help='access the processed data (authorised users ' + 'only)') # parser_upload = cmd_parser.add_parser('upload', help='upload files to the area') # group_upload = parser_upload.add_mutually_exclusive_group(required=True) @@ -143,7 +144,8 @@ def parse_args(args): group_delete.add_argument('-d', action='store_true', help='delete upload area and contents (authorised users only)') parser_sync = cmd_parser.add_parser('sync', - help='copy data from selected upload area to ingest upload area (authorised users only)') + help='copy data from selected upload area to ingest upload area (authorised ' + 'users only)') parser_sync.add_argument('INGEST_UPLOAD_AREA', help='Ingest upload area', type=valid_ingest_upload_area) ps = [parser] diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index ef5261e..cf79917 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -1,5 +1,10 @@ +import hashlib +import csv + from ait.commons.util.common import format_err from ait.commons.util.local_state import get_selected_area +from ait.commons.util.user_profile import get_profile +from urllib.parse import urlparse def print_area(k, area): @@ -20,6 +25,34 @@ def print_area(k, area): print() +def get_s3_path(): + while True: + s3_path = input("Enter the S3 path (e.g., s3://bucket-name/folder/): ").strip() + parsed_url = urlparse(s3_path) + + if parsed_url.scheme == 's3' and parsed_url.netloc: + return s3_path + else: + print("Invalid S3 path. 
Please enter a valid S3 path starting with 's3://'.") + + +def calculate_md5(s3_client, bucket_name, key): + md5_hash = hashlib.md5() + + try: + # Stream the object in chunks + response = s3_client.get_object(Bucket=bucket_name, Key=key) + + for chunk in response['Body'].iter_chunks(chunk_size=8192): + md5_hash.update(chunk) + + return md5_hash.hexdigest() + except Exception as e: + print(f"Failed to compute MD5 for {key}: {e}") + + return None + + class CmdList: """ admin and user @@ -29,22 +62,67 @@ class CmdList: def __init__(self, aws, args): self.aws = aws self.args = args + self.user = get_profile('morphic-util').username + self.processing = getattr(self.args, 'processing', None) self.s3_cli = self.aws.common_session.client('s3') def run(self): - selected_area = get_selected_area() # select area is a S3 bucket + if self.processing: + if self.user != 'morphic-admin': + return False, "Admin function only" + else: + print("Access granted") + + s3_path = get_s3_path() + self.list_s3_files(s3_path) + + return True, None + + else: + selected_area = get_selected_area() # select area is a S3 bucket + + if not selected_area: + return False, 'No area selected' + + try: + self.list_bucket_contents(selected_area) + # print_count(folder_count + files_count) + return True, None + + except Exception as e: + return False, format_err(e, 'list') + + def list_s3_files(self, s3_path): + parsed_url = urlparse(s3_path) + bucket_name = parsed_url.netloc + prefix = parsed_url.path.lstrip('/') + output_file = 's3_file_md5s.tsv' + + with open(output_file, 'w', newline='') as csvfile: + tsv_writer = csv.writer(csvfile, delimiter=',') + tsv_writer.writerow(['File Name', 'MD5 Hash']) # Write header row + + try: + response = self.s3_cli.list_objects_v2(Bucket=bucket_name, Prefix=prefix) + + if 'Contents' in response: + print(f"\nFiles in '{s3_path}'") - if not selected_area: - return False, 'No area selected' + for obj in response['Contents']: + file_key = obj['Key'] + if not 
file_key.endswith('/'): # Skip folders + md5_hash = calculate_md5(self.s3_cli, bucket_name, file_key) - try: - self.list_bucket_contents(selected_area) - # print_count(folder_count + files_count) - return True, None + if md5_hash: + print(f"{file_key} - MD5: {md5_hash}") + tsv_writer.writerow([file_key, md5_hash]) # Write to file + else: + print("\nNo files found.") + except Exception as e: + print(f"\nError: {e}") - except Exception as e: - return False, format_err(e, 'list') + print(f"\nResults saved to {output_file}") def list_bucket_contents(self, selected_area, prefix=''): result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/', Prefix=prefix) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 8dc7130..678a6be 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -9,7 +9,7 @@ from ait.commons.util.spreadsheet_util import SubmissionError from ait.commons.util.user_profile import get_profile -from ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.provider_api_util import ProviderApi def matching_expression_alteration_and_cell_line(cell_line, expression_alteration): @@ -206,7 +206,7 @@ def __init__(self, args): self.access_token = get_profile('morphic-util').access_token self.type = getattr(self.args, 'type', None) self.file = getattr(self.args, 'file', None) - self.provider_api = APIProvider(self.BASE_URL) + self.provider_api = ProviderApi(self.BASE_URL) def run(self): """ diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 90c129f..0eb2e35 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -10,7 +10,7 @@ from ait.commons.util.command.submit import CmdSubmit, get_entity_id_from_hal_link, create_new_submission_envelope from ait.commons.util.command.upload import CmdUpload from ait.commons.util.user_profile import get_profile -from 
ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.provider_api_util import ProviderApi from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter, ValidationError, \ merge_library_preparation_sequencing_file, merge_cell_line_and_differentiated_cell_line, \ merge_differentiated_cell_line_and_library_preparation, SubmissionError @@ -88,7 +88,7 @@ def __init__(self, args): self.user_profile = get_profile('morphic-util') self.access_token = self.user_profile.access_token self.aws = Aws(self.user_profile) - self.provider_api = APIProvider(self.BASE_URL) + self.provider_api = ProviderApi(self.BASE_URL) self.validation_errors = [] self.submission_errors = [] self.submission_envelope_id = None diff --git a/ait/commons/util/command/view.py b/ait/commons/util/command/view.py index aa8fc74..3ba3533 100644 --- a/ait/commons/util/command/view.py +++ b/ait/commons/util/command/view.py @@ -1,5 +1,5 @@ from ait.commons.util.aws_client import Aws -from ait.commons.util.provider_api_util import APIProvider +from ait.commons.util.provider_api_util import ProviderApi from ait.commons.util.user_profile import get_profile @@ -10,7 +10,7 @@ def __init__(self, args): self.args = args self.access_token = get_profile('morphic-util').access_token self.user_profile = get_profile('morphic-util') - self.provider_api = APIProvider(self.base_url) + self.provider_api = ProviderApi(self.base_url) if hasattr(self.args, 'dataset') and self.args.dataset is not None: self.dataset = self.args.dataset diff --git a/ait/commons/util/provider_api_util.py b/ait/commons/util/provider_api_util.py index 851b052..24a15fe 100644 --- a/ait/commons/util/provider_api_util.py +++ b/ait/commons/util/provider_api_util.py @@ -1,7 +1,7 @@ import requests -class APIProvider: +class ProviderApi: def __init__(self, base_url): self.base_url = base_url From fa13fd04ac400e2937b9cdeaa3f488f3398c56b4 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 9 Dec 2024 09:47:04 +0000 Subject: [PATCH 
54/55] don't delete the dataset object --- ait/commons/util/command/submit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 678a6be..e1c972d 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -1282,5 +1282,5 @@ def delete_dataset(self, dataset, access_token): print(f"Deleting {data_file}") self.provider_api.delete(f"{self.BASE_URL}/files/{data_file}", access_token) - print(f"\nDeleting the dataset: {dataset}") - self.provider_api.delete(f"{self.BASE_URL}/datasets/{dataset}", access_token) + # print(f"\nDeleting the dataset: {dataset}") + # self.provider_api.delete(f"{self.BASE_URL}/datasets/{dataset}", access_token) From f789643415c356e8f6b8b0c30115d31146829a7d Mon Sep 17 00:00:00 2001 From: dgupta Date: Wed, 19 Mar 2025 12:07:34 +0000 Subject: [PATCH 55/55] prod recording related changes --- ait/commons/util/command/submit.py | 28 +++-- ait/commons/util/command/submit_file.py | 80 ++++++++------ ait/commons/util/spreadsheet_util.py | 135 +++++++++++++++--------- 3 files changed, 151 insertions(+), 92 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index e1c972d..694cec2 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -323,6 +323,7 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_cell_line, differentiated_cell_lines_df, + differentiated, submission_envelope_id, dataset_id, access_token, @@ -350,9 +351,15 @@ def handle_differentiated_cell_line(self, print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") - update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, - differentiated_cell_line.biomaterial_id, - 'differentiated_product.label') + if differentiated: + update_dataframe(differentiated_cell_lines_df, 
differentiated_cell_line.id, + differentiated_cell_line.biomaterial_id, + 'differentiated_product.label') + else: + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, + differentiated_cell_line.biomaterial_id, + 'undifferentiated_product.label') + return differentiated_cell_line.id else: errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " @@ -370,12 +377,19 @@ def handle_differentiated_cell_line(self, dataset_id, differentiated_cell_line, submission_envelope_id) - update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, - differentiated_cell_line.biomaterial_id, - 'differentiated_product.label') + + if differentiated: + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, + differentiated_cell_line.biomaterial_id, + 'differentiated_product.label') + else: + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, + differentiated_cell_line.biomaterial_id, + 'undifferentiated_product.label') return differentiated_cell_line_id except Exception as e: - errors.append(f"Failed to create differentiated cell line: {differentiated_cell_line.biomaterial_id}") + errors.append( + f"Failed to create differentiated/undifferentiated cell line: {differentiated_cell_line.biomaterial_id}") raise SubmissionError(errors, e) def create_differentiated_cell_line_entity(self, diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 0eb2e35..dfb8e40 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -186,7 +186,7 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) # Extract parsed data expression_alterations = parsed_data['expression_alterations'] expression_alterations_df = parsed_data['expression_alterations_df'] - parent_cell_line_name = parsed_data['parent_cell_line_name'] + parent_cell_line_names = 
parsed_data['parent_cell_line_names'] cell_lines = parsed_data['cell_lines'] cell_lines_df = parsed_data['cell_lines_df'] differentiated_cell_lines = parsed_data['differentiated_cell_lines'] @@ -216,28 +216,28 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) if self._is_add_action(): self._create_submission_envelope() - parent_cell_line_id = self._handle_parent_cell_line(submission_instance, - parent_cell_line_name) - created_expression_alterations = self._handle_expression_alterations( - submission_instance, - expression_alterations, - expression_alterations_df, - parent_cell_line_name, - parent_cell_line_id - ) if cell_lines and cell_lines_df is not None: + if self._is_add_action(): + created_expression_alterations = self._handle_expression_alterations( + submission_instance, + expression_alterations, + expression_alterations_df, + parent_cell_line_names, + cell_lines + ) + created_cell_lines = self._create_cell_lines( submission_instance, cell_lines, cell_lines_df, created_expression_alterations) if differentiated_cell_lines and differentiated_cell_lines_df is not None: created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( - submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df, differentiated) if (undifferentiated_cell_lines and undifferentiated_cell_lines_df is not None and not differentiated): created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( - submission_instance, undifferentiated_cell_lines, undifferentiated_cell_lines_df) + submission_instance, undifferentiated_cell_lines, undifferentiated_cell_lines_df, differentiated) if library_preparations and library_preparations_df is not None: created_library_preparations = self._create_library_preparations( @@ -294,8 +294,8 @@ def _handle_expression_alterations(self, submission_instance, 
expression_alterations, expression_alterations_df, - parent_cell_line_name, - parent_cell_line_id): + parent_cell_line_names, + cell_lines): """Handles the creation of expression alterations and links them to the parent cell line if needed.""" created_expression_alterations = [] @@ -304,14 +304,15 @@ def _handle_expression_alterations(self, submission_instance, expression_alterations, expression_alterations_df ) - if created_expression_alterations and parent_cell_line_id: - self._link_parent_cell_line_expression_alteration( - submission_instance, - self.access_token, - parent_cell_line_name, - parent_cell_line_id, - created_expression_alterations - ) + if created_expression_alterations: + for parent_cell_line_name in parent_cell_line_names: + self._link_parent_cell_line_expression_alteration( + submission_instance, + self.access_token, + parent_cell_line_name, + cell_lines, + created_expression_alterations + ) return created_expression_alterations @@ -357,7 +358,7 @@ def _parse_spreadsheet(self, parser): 'Expression alteration', self.action, self.validation_errors ) - cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( + cell_lines, cell_lines_df, parent_cell_line_names = parser.get_cell_lines( cell_line_sheet_name, self.action, self.validation_errors ) @@ -387,7 +388,7 @@ def _parse_spreadsheet(self, parser): self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( - 'Library preparation', self.action, self.validation_errors + 'Library preparation', differentiated, self.action, self.validation_errors ) if differentiated_cell_lines: @@ -412,7 +413,7 @@ def _parse_spreadsheet(self, parser): "expression_alterations_df": expression_alterations_df, "cell_lines": cell_lines, "cell_lines_df": cell_lines_df, - "parent_cell_line_name": parent_cell_line_name, + "parent_cell_line_names": parent_cell_line_names, "differentiated_cell_lines": differentiated_cell_lines, "differentiated_cell_lines_df": 
differentiated_cell_lines_df, "undifferentiated_cell_lines": undifferentiated_cell_lines, @@ -428,14 +429,16 @@ def _parse_spreadsheet(self, parser): } except Exception as e: print(f"Exception occurred:", e) - + self.validation_errors.append(f"Spreadsheet is invalid {self.file}") return None def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): # Validate the parsed data and upload the file. + """ validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, self.validation_errors) + """ """ Handle validation errors, including interacting with the user in case of a missing sheet. """ @@ -527,11 +530,13 @@ def _create_cell_lines(self, def _create_differentiated_cell_lines(self, submission_instance, differentiated_cell_lines, - differentiated_cell_lines_df): + differentiated_cell_lines_df, + differentiated): for differentiated_cell_line in differentiated_cell_lines: differentiated_cell_line_entity_id = submission_instance.handle_differentiated_cell_line(None, differentiated_cell_line, differentiated_cell_lines_df, + differentiated, self.submission_envelope_id, self.dataset, self.access_token, @@ -676,12 +681,17 @@ def _link_parent_cell_line_expression_alteration(self, submission_instance, access_token, parent_cell_line_name, - parent_cell_line_id, + cell_lines, created_expression_alterations): - for expression_alteration in created_expression_alterations: - print(f"Linking parent cell line {parent_cell_line_name} " - f"as input to process of {expression_alteration.expression_alteration_id}") - submission_instance.perform_hal_linkage( - f"{self.BASE_URL}/biomaterials/{parent_cell_line_id}/inputToProcesses", - expression_alteration.id, 'processes', access_token - ) + parent_cell_line_id = self._handle_parent_cell_line(submission_instance, parent_cell_line_name) + + for cell_line in cell_lines: + if cell_line.parental_cell_line_name == parent_cell_line_name: + for expression_alteration in 
created_expression_alterations: + if cell_line.expression_alteration_id == expression_alteration.expression_alteration_id: + print(f"Expression alteration match found, Linking parent cell line {parent_cell_line_name} " + f"as input to process of {expression_alteration.expression_alteration_id}") + submission_instance.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{parent_cell_line_id}/inputToProcesses", + expression_alteration.id, 'processes', access_token + ) diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 3b0a05e..46a6abd 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -184,6 +184,7 @@ def __init__(self, description, cell_line_biomaterial_id, # Maps to 'clonal_cell_line_label' differentiated_product_protocol_id, + undifferentiated_product_protocol_id, terminally_differentiated, model_system, timepoint_value, @@ -195,6 +196,7 @@ def __init__(self, self.description = description self.cell_line_biomaterial_id = cell_line_biomaterial_id # Maps to 'clonal_cell_line_label' self.differentiated_product_protocol_id = differentiated_product_protocol_id + self.undifferentiated_product_protocol_id = undifferentiated_product_protocol_id self.terminally_differentiated = terminally_differentiated self.model_system = model_system self.timepoint_value = timepoint_value @@ -216,6 +218,7 @@ def to_dict(self): "description": self.description, "clonal_cell_line_id": self.cell_line_biomaterial_id, "differentiated_product_protocol_id": self.differentiated_product_protocol_id, + "undifferentiated_product_protocol_id": self.undifferentiated_product_protocol_id, "terminally_differentiated": self.terminally_differentiated, "model_system": self.model_system, "timepoint_value": self.timepoint_value, @@ -517,30 +520,33 @@ def merge_cell_line_and_differentiated_cell_line(cell_lines, If a differentiated cell line does not have a corresponding cell line. 
""" - find_orphans( - source_entities=cell_lines, - target_entities=differentiated_cell_lines, - source_attr="biomaterial_id", - target_attr="cell_line_biomaterial_id", - source_type="Cell line", - target_type="Differentiated Cell line", - errors=errors - ) - - missing_parent_entity_error = MissingParentEntityError() - cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} + try: + find_orphans( + source_entities=cell_lines, + target_entities=differentiated_cell_lines, + source_attr="biomaterial_id", + target_attr="cell_line_biomaterial_id", + source_type="Cell line", + target_type="Differentiated Cell line", + errors=errors + ) - for differentiated_cell_line in differentiated_cell_lines: - if differentiated_cell_line.cell_line_biomaterial_id not in cell_line_ids: - missing_parent_entity_error.add_error("Cell Line", - "Differentiated Cell line", - differentiated_cell_line.label, - errors) + missing_parent_entity_error = MissingParentEntityError() + cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines} - for cell_line in cell_lines: for differentiated_cell_line in differentiated_cell_lines: - if differentiated_cell_line.cell_line_biomaterial_id == cell_line.biomaterial_id: - cell_line.add_differentiated_cell_line(differentiated_cell_line) + if differentiated_cell_line.cell_line_biomaterial_id not in cell_line_ids: + missing_parent_entity_error.add_error("Cell Line", + "Differentiated Cell line", + differentiated_cell_line.biomaterial_id, + errors) + + for cell_line in cell_lines: + for differentiated_cell_line in differentiated_cell_lines: + if differentiated_cell_line.cell_line_biomaterial_id == cell_line.biomaterial_id: + cell_line.add_differentiated_cell_line(differentiated_cell_line) + except Exception as e: + print(f"Exception occurred here:", e) class SpreadsheetSubmitter: @@ -688,11 +694,13 @@ def parse_cell_lines(self, if derived_col in df_filtered.columns: parent_cell_line_names = df_filtered[derived_col].dropna().unique() + 
""" if len(parent_cell_line_names) != 1: errors.append( f"The column '{derived_col}' must have the same value across all rows. Found values: {parent_cell_line_names}") return [], df + """ # Process rows to create CellLine objects cell_lines = [] @@ -728,7 +736,7 @@ def parse_cell_lines(self, ) ) - return cell_lines, df_filtered, parent_cell_line_names[0] + return cell_lines, df_filtered, parent_cell_line_names def parse_differentiated_cell_lines(self, sheet_name, @@ -798,10 +806,11 @@ def parse_differentiated_cell_lines(self, differentiated_cell_lines.append( DifferentiatedCellLine( biomaterial_id=label, - description=row.get('differentiated_product.biomaterial_core.biomaterial_description'), + description=row.get('differentiated_product.description'), cell_line_biomaterial_id=parent_biomaterial_id, differentiated_product_protocol_id=row.get( 'differentiated_product.differentiated_product_protocol_id'), + undifferentiated_product_protocol_id=None, treatment_condition=row.get('differentiated_product.treatment_condition'), wt_control_status=row.get('differentiated_product.wt_control_status'), timepoint_value=row.get('differentiated_product.timepoint_value'), @@ -841,16 +850,16 @@ def parse_undifferentiated_cell_lines(self, # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] # Check if the required column exists - if 'differentiated_product.label' not in df.columns: - errors.append(f"The column 'differentiated_product.label' does not " + if 'undifferentiated_product.label' not in df.columns: + errors.append(f"The column 'undifferentiated_product.label' does not " f"exist in {sheet_name} name. 
The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null - df = df[df['differentiated_product.label'].notna()] + df = df[df['undifferentiated_product.label'].notna()] df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) # Define columns to check for values starting with 'ABC' or 'XYZ' - cols_to_check = ['differentiated_product.label'] + cols_to_check = ['undifferentiated_product.label'] # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', @@ -861,18 +870,19 @@ def parse_undifferentiated_cell_lines(self, undifferentiated_cell_lines = [] for _, row in df_filtered.iterrows(): - label = row['differentiated_product.label'] - parent_biomaterial_id = row.get('differentiated_product.differentiated_product_protocol_id') + label = row['undifferentiated_product.label'] + parent_biomaterial_id = row.get('clonal_cell_line.label') # Check if biomaterial_id is null if pd.isnull(label): - errors.append("Differentiated Cell line ID cannot be null in any row of the Differentiated Cell line " - "sheet.") + errors.append( + "Undifferentiated Cell line ID cannot be null in any row of the Undifferentiated Cell line " + "sheet.") # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") # Check if derived_accession and cell_type are present if pd.isnull(parent_biomaterial_id): - errors.append(f"Input Cell line ID cannot be null for Differentiated Cell line: " + errors.append(f"Input Cell line ID cannot be null for Undifferentiated Cell line: " f"{label}") """ raise MissingMandatoryFieldError( @@ -883,16 +893,17 @@ def parse_undifferentiated_cell_lines(self, undifferentiated_cell_lines.append( DifferentiatedCellLine( biomaterial_id=label, - 
description=row.get('differentiated_product.biomaterial_core.biomaterial_description'), + description=row.get('undifferentiated_product.description'), cell_line_biomaterial_id=parent_biomaterial_id, - differentiated_product_protocol_id=row.get( - 'differentiated_product.differentiated_product_protocol_id'), - treatment_condition=row.get('differentiated_product.treatment_condition'), - wt_control_status=row.get('differentiated_product.wt_control_status'), - timepoint_value=row.get('differentiated_product.timepoint_value'), - timepoint_unit=row.get('differentiated_product.timepoint_unit'), - terminally_differentiated=row.get('differentiated_product.terminally_differentiated'), - model_system=row.get('differentiated_product.model_system'), + differentiated_product_protocol_id=None, + undifferentiated_product_protocol_id=row.get( + 'undifferentiated_product.undifferentiated_product_protocol_id'), + treatment_condition=row.get('undifferentiated_product.treatment_condition'), + wt_control_status=row.get('undifferentiated_product.wt_control_status'), + timepoint_value=row.get('undifferentiated_product.timepoint_value'), + timepoint_unit=row.get('undifferentiated_product.timepoint_unit'), + terminally_differentiated=row.get('undifferentiated_product.terminally_differentiated'), + model_system=row.get('undifferentiated_product.model_system'), id=row.get('Id') ) ) @@ -901,6 +912,7 @@ def parse_undifferentiated_cell_lines(self, def parse_library_preparations(self, sheet_name, + differentiated, action, errors): """ @@ -925,15 +937,28 @@ def parse_library_preparations(self, required_columns = [ 'library_preparation.label', 'differentiated_product.label', + 'undifferentiated_product.label', 'library_preparation.library_preparation_protocol_id' ] for col in required_columns: if col not in df.columns: - errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. 
" - f"The rest of the file will not be processed") + if col == 'differentiated_product.label' and differentiated: + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") - return [], df + return [], df + elif col == 'undifferentiated_product.label' and not differentiated: + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") + + return [], df + else: + if col not in ('differentiated_product.label', 'undifferentiated_product.label'): + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") + + return [], df # Filter rows where biomaterial_id is not null df = df[df['library_preparation.label'].notna()] @@ -951,7 +976,10 @@ def parse_library_preparations(self, for _, row in df_filtered.iterrows(): label = row['library_preparation.label'] - differentiated_biomaterial_label = row.get('differentiated_product.label') + if differentiated: + differentiated_biomaterial_label = row.get('differentiated_product.label') + else: + differentiated_biomaterial_label = row.get('undifferentiated_product.label') library_preparation_protocol_id = row.get('library_preparation.library_preparation_protocol_id') # Check if required fields are null @@ -959,8 +987,14 @@ def parse_library_preparations(self, errors.append("Library Preparation ID cannot be null in any row of the Library Preparation sheet.") # raise MissingMandatoryFieldError("Library Preparation ID cannot be null in any row.") if pd.isnull(differentiated_biomaterial_label): - errors.append("Differentiated Cell Line ID cannot be null in any row of the Library Preparation sheet.") - # raise MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null in any row.") + if differentiated: + errors.append( + "Differentiated Cell Line ID cannot be null in any row of the Library Preparation sheet.") + # raise 
MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null in any row.") + else: + errors.append( + "Undifferentiated Cell Line ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Undifferentiated Cell Line ID cannot be null in any row.") if pd.isnull(library_preparation_protocol_id): errors.append( "Library Preparation Protocol ID cannot be null in any row of the Library Preparation sheet.") @@ -1177,8 +1211,8 @@ def get_cell_lines(self, list A list of CellLine objects parsed from the specified sheet. """ - cell_lines, cell_lines_df, parent_cell_line_name = self.parse_cell_lines(sheet_name, action, errors) - return cell_lines, cell_lines_df, parent_cell_line_name + cell_lines, cell_lines_df, parent_cell_line_names = self.parse_cell_lines(sheet_name, action, errors) + return cell_lines, cell_lines_df, parent_cell_line_names def get_differentiated_cell_lines(self, sheet_name, @@ -1229,6 +1263,7 @@ def get_undifferentiated_cell_lines(self, def get_library_preparations(self, sheet_name, + differentiated, action, errors): """ @@ -1246,7 +1281,7 @@ def get_library_preparations(self, list A list of LibraryPreparation objects parsed from the specified sheet. """ - library_preparations, df_filtered = self.parse_library_preparations(sheet_name, + library_preparations, df_filtered = self.parse_library_preparations(sheet_name, differentiated, action, errors) return library_preparations, df_filtered