diff --git a/README.md b/README.md index a846744..0c4f470 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # morphic-util -CLI tool for uploading data to the Morphic AWS S3 buckets. +CLI tool for submitting analysis data and metadata # Users @@ -9,8 +9,8 @@ CLI tool for uploading data to the Morphic AWS S3 buckets. Users need to have 1. Basic command-line knowledge -2. Python3.x installed on their machine -3. AWS Cognito username and password +2. Python 3.10 installed on their machine +3. AWS Cognito username or email and password ## Install @@ -35,8 +35,10 @@ optional arguments: --version, -v show program's version number and exit command: - {config,create,select,list,upload,download,delete} + {config,submit,submit-file,create,select,list,upload,download,delete} config configure AWS credentials + submit submit your study, dataset or biomaterials metadata (incomplete as all metadata types is not supported yet, expected to be completed on August 2024) + submit-file submit your metadata file containing your cell lines, differentiated cell lines, library preparations and sequencing files create create an upload area (authorised users only) select select or show the active upload area list list contents of the area @@ -79,18 +81,40 @@ positional arguments: password AWS Cognito password ``` -The tool uses the profile name _hca-util_ in local AWS config files. +The tool uses the profile name _morphic-util_ in local AWS config files. + +## `submit` command +Submit your study and dataset metadata and create your AWS upload area for uploading data files + +```shell script +positional arguments: +$ morphic-util submit --type --file + + --type type of metadata being submitted (e.g. 
study or dataset) + --file path to the file containing the metadata +``` + +## `submit-file` command +Submit your study and dataset metadata and create your AWS upload area for uploading data files + +```shell script +positional arguments: +$ morphic-util submit-file --file --action --dataset + + --file path to the file containing the metadata + --action ADD, MODIFY or DELETE based on the type of submission + --dataset the identifier for the analysis +``` ## `create` command Create an upload area/ project folder **(authorised users only)** ```shell script -$ morphic-util create NAME DPC [-p {u,ud,ux,udx}] +$ morphic-util create NAME [-p {u,ud,ux,udx}] positional arguments: NAME name for the new area/ project folder - DPC center name of the submitter optional arguments: -p {u,ud,ux,udx} allowed actions (permissions) on new area. u for @@ -161,6 +185,64 @@ optional arguments: -a delete all files from the area -d delete upload area and contents (authorised users only) ``` +## Performing a submission +### Authenticate +```shell script +$ morphic-util config username password + +positional arguments: + username AWS Cognito username + password AWS Cognito password +``` +### Create your study +```shell script +positional arguments: +$ morphic-util submit --type study --file + + --type type of metadata being submitted (here it is study) + --file path to the file containing the metadata +``` +### Create your dataset and link it to your study +```shell script +positional arguments: +$ morphic-util submit --type dataset --file --study + + --type type of metadata being submitted (here it is dataset) + --file path to the file containing the metadata (optional) + --study STUDY_ID obtained in the last step +``` +### `select` your upload area to upload your data files (the upload area name is same as your DATASET_ID) +Show or select the data file upload area +```shell script +$ morphic-util select AREA + +positional arguments: + AREA upload area name (same as DATASET_ID obtained 
in the last step). +``` +### `upload` your data files +Upload files to the selected area for the dataset +```shell script +$ morphic-util upload PATH [PATH ...] [-o] + +positional arguments: + PATH valid file or directory + +optional arguments: + -o overwrite files with same names +``` +### `list` uploaded data files to verify that data file upload has been successful +```shell script +$ morphic-util list +``` +### `submit-file` command to submit your dataset metadata containing your biomaterials, processes, protocols and files +```shell script +positional arguments: +$ morphic-util submit-file --file --action --dataset + + --file path to the file containing the metadata + --action ADD, MODIFY or DELETE based on the type of submission + --dataset the identifier for the analysis +``` # Developers diff --git a/ait/commons/util/__main__.py b/ait/commons/util/__main__.py index 8034ae4..efa2d21 100755 --- a/ait/commons/util/__main__.py +++ b/ait/commons/util/__main__.py @@ -79,6 +79,21 @@ def parse_args(args): parser_config.add_argument('PASSWORD', help='AWS Cognito password', nargs='?') parser_config.add_argument('--bucket', help='use BUCKET instead of default bucket') + parser_config = cmd_parser.add_parser('submit', help='submit your metadata') + parser_config.add_argument('--type', help='data type you are submitting, e.g. 
study, dataset') + parser_config.add_argument('--file', help='your metadata') + parser_config.add_argument('--study', help='your study reference') + parser_config.add_argument('--dataset', help='your dataset reference') + parser_config.add_argument('--process', help='your process/analysis reference') + + parser_config = cmd_parser.add_parser('submit-file', help='submit your file containing your dataset metadata') + parser_config.add_argument('--file', help='spreadsheet containing your dataset metadata') + parser_config.add_argument('--action', help='action you want to perform (ADD/MODIFY/DELETE') + parser_config.add_argument('--dataset', help='your dataset reference') + + parser_config = cmd_parser.add_parser('view', help='view your dataset') + parser_config.add_argument('--dataset', help='your dataset reference') + parser_create = cmd_parser.add_parser('create', help='create an upload area (authorised users only)') parser_create.add_argument('NAME', help='name for the new area', type=valid_project_name) parser_create.add_argument('DPC', help='center name of the submitter', type=valid_project_name) @@ -98,7 +113,8 @@ def parse_args(args): # parser_clear.add_argument('-a', action='store_true', help='clear all - selection and known dirs') parser_list = cmd_parser.add_parser('list', help='list contents of the area') - parser_list.add_argument('-b', action='store_true', help='list all areas in the S3 bucket (authorised users only)') + parser_list.add_argument('-processing', action='store_true', help='access the processed data (authorised users ' + 'only)') # parser_upload = cmd_parser.add_parser('upload', help='upload files to the area') # group_upload = parser_upload.add_mutually_exclusive_group(required=True) @@ -128,7 +144,8 @@ def parse_args(args): group_delete.add_argument('-d', action='store_true', help='delete upload area and contents (authorised users only)') parser_sync = cmd_parser.add_parser('sync', - help='copy data from selected upload area to ingest 
upload area (authorised users only)') + help='copy data from selected upload area to ingest upload area (authorised ' + 'users only)') parser_sync.add_argument('INGEST_UPLOAD_AREA', help='Ingest upload area', type=valid_ingest_upload_area) ps = [parser] diff --git a/ait/commons/util/aws_client.py b/ait/commons/util/aws_client.py index 38b069a..dea6c66 100755 --- a/ait/commons/util/aws_client.py +++ b/ait/commons/util/aws_client.py @@ -4,7 +4,7 @@ from ait.commons.util.aws_cognito_authenticator import AwsCognitoAuthenticator from ait.commons.util.settings import AWS_SECRET_NAME_AK_BUCKET, AWS_SECRET_NAME_SK_BUCKET, \ - AWS_SECRET_NAME_MORPHIC_BUCKET, COGNITO_MORPHIC_UTIL_ADMIN, S3_REGION + COGNITO_MORPHIC_UTIL_ADMIN, S3_REGION def static_bucket_name(): @@ -14,7 +14,7 @@ def static_bucket_name(): class Aws: def __init__(self, user_profile): - self.is_user = False # not admin + self.is_user = True # not admin self.user_dir_list = None self.center_name = None self.secret_key = None @@ -42,21 +42,23 @@ def get_bucket_name(self, secret_mgr_client): """ # access policy can't be attached to a secret # GetSecretValue action should be allowed for user - resp = secret_mgr_client.get_secret_value(SecretId=AWS_SECRET_NAME_MORPHIC_BUCKET) + resp = secret_mgr_client.get_secret_value(SecretId='') secret_str = resp['SecretString'] self.bucket_name = json.loads(secret_str)['s3-bucket'] return self.bucket_name def new_session(self): aws_cognito_authenticator = AwsCognitoAuthenticator(self) - secret_manager_client = aws_cognito_authenticator.get_secret_manager_client(self.user_profile.username, - self.user_profile.password) + secret_manager_client = aws_cognito_authenticator.secret_manager_client_instance(self.user_profile.username, + self.user_profile.password) if secret_manager_client is None: - print('Failure while re-establishing Amazon Web Services session, report this error to the DRACC admin') + print( + 'Failure while re-establishing Amazon Web Services session, report this 
error to the MorPhiC DRACC ' + 'admin') raise Exception else: - self.is_user = aws_cognito_authenticator.is_valid_user() + self.is_user = aws_cognito_authenticator.is_user self.user_dir_list = aws_cognito_authenticator.get_user_dir_list() self.center_name = aws_cognito_authenticator.get_center_name() @@ -85,22 +87,35 @@ def is_valid_credentials(self): def is_valid_user(self): return self.is_user - def obj_exists(self, key): + def s3_bucket_exists(self, key): """ - return true if key exists, else false - A folder/directory is an s3 object with key / - Note: s3://my-bucket/folder != s3://my-bucket/folder/ - Refer to https://www.peterbe.com/plog/fastest-way-to-find-out-if-a-file-exists-in-s3 - for comparison between client.list_objects_v2 and client.head_object to make this check. - Also check https://stackoverflow.com/questions/33842944/check-if-a-key-exists-in-a-bucket-in-s3-using-boto3 - which suggests using Object.load() - which does a HEAD request, however, user doesn't have - s3:GetObject permission by default, so this will fail for them. + Returns True if the bucket exists, else False. """ - response = self.new_session().client('s3').list_objects_v2( - Bucket=self.bucket_name, - Prefix=key, - ) - for obj in response.get('Contents', []): - if obj['Key'] == key: - return True - return False + client = self.common_session.client('s3') + try: + client.head_bucket( + Bucket=key + ) + return True + except client.exceptions.NoSuchBucket as e: + print(f"The bucket '{key}' does not exist. Reason: {e}") + return False + + def data_file_exists(self, bucket_name, key): + """ + Check if an object exists in the specified S3 bucket. + + Parameters: + - bucket_name (str): The name of the S3 bucket. + - key (str): The key of the object in the bucket. + + Returns: + - bool: True if the object exists, False otherwise. 
+ """ + client = self.common_session.client('s3') + + try: + client.head_object(Bucket=bucket_name, Key=key) + return True + except client.exceptions.ClientError: + return False diff --git a/ait/commons/util/aws_cognito_authenticator.py b/ait/commons/util/aws_cognito_authenticator.py index 7527a97..3ae21e0 100644 --- a/ait/commons/util/aws_cognito_authenticator.py +++ b/ait/commons/util/aws_cognito_authenticator.py @@ -1,7 +1,5 @@ import sys - import boto3 - from ait.commons.util.settings import DEFAULT_PROFILE, DEFAULT_REGION, COGNITO_CLIENT_ID, COGNITO_IDENTITY_POOL_ID, \ COGNITO_USER_POOL_ID from ait.commons.util.user_profile import set_profile @@ -14,12 +12,11 @@ class AwsCognitoAuthenticator: def __init__(self, args): self.args = args - self.is_user = False # not admin + self.is_user = True # not admin self.user_dir_list = None self.center_name = None # custom attribute DPC - def validate_cognito_identity(self, profile, username, password): - + def is_registered_user(self, profile, username, password): try: profile = profile if profile else DEFAULT_PROFILE @@ -64,7 +61,7 @@ def validate_cognito_identity(self, profile, username, password): if session_token: set_profile(profile, DEFAULT_REGION, aws_cred['AccessKeyId'], aws_cred['SecretKey'], - session_token, username, password) + session_token, access_token, username, password) return True else: @@ -74,8 +71,7 @@ def validate_cognito_identity(self, profile, username, password): except Exception as e: return False - def get_secret_manager_client(self, username, password): - + def secret_manager_client_instance(self, username, password): try: if username and password: client = boto3.client("cognito-idp", region_name=DEFAULT_REGION, aws_access_key_id="NONE", @@ -90,40 +86,14 @@ def get_secret_manager_client(self, username, password): # Getting the user details. 
access_token = response["AuthenticationResult"]["AccessToken"] id_token = response["AuthenticationResult"]["IdToken"] - response = client.get_user(AccessToken=access_token) - username = response['Username'] - user_attribute_list = response['UserAttributes'] if username.endswith('Admin') or username.endswith('admin'): self.is_user = False else: self.is_user = True - for attr in user_attribute_list: - if attr['Name'] == 'custom:DPC': - self.center_name = attr['Value'].lower() - - if attr['Name'] == 'custom:directory_access': - self.user_dir_list = attr['Value'].replace(" ", "").split(',') - - if self.user_dir_list is not None: - self.user_dir_list = ['morphic-' + self.center_name + '/' + dataset_dir for dataset_dir in - self.user_dir_list] - - if self.is_user: - if self.center_name is None: - print('User does not have an assigned center name and therefore cannot perform any operations ' - 'with this system') - sys.exit(1) - - if self.user_dir_list is None: - if self.is_user: - print('User does not have access to any upload areas or to perform any operations with this' - 'system') - sys.exit(1) - identity = boto3.client('cognito-identity', region_name=DEFAULT_REGION) identity_id = identity.get_id( @@ -156,7 +126,7 @@ def get_secret_manager_client(self, username, password): except Exception as e: return None - def is_valid_user(self): + def is_user(self): return self.is_user def get_user_dir_list(self): diff --git a/ait/commons/util/bucket_policy.py b/ait/commons/util/bucket_policy.py index 2ea4afa..8734783 100755 --- a/ait/commons/util/bucket_policy.py +++ b/ait/commons/util/bucket_policy.py @@ -1,4 +1,4 @@ -from ait.commons.util.settings import AWS_ACCOUNT, IAM_USER +from ait.commons.util.settings import IAM_USER """ User groups: @@ -50,6 +50,7 @@ ALLOWED_PERMS = ['u', 'ud', 'ux', 'udx'] DEFAULT_PERMS = 'ux' + # constraints - in bucket policy # ux denyDelete -> u # ux allowDownload -> udx @@ -57,18 +58,19 @@ def allowDownloadStmt(): return { - "Sid": "AllowDownload", 
- "Effect": "Allow", - "Action": "s3:GetObject", - "Resource": [], - "Principal": { "AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} -} + "Sid": "AllowDownload", + "Effect": "Allow", + "Action": "s3:GetObject", + "Resource": [], + "Principal": {"AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} + } + def denyDeleteStmt(): return { - "Sid": "DenyDelete", - "Effect": "Deny", - "Action": "s3:DeleteObject", - "Resource": [], - "Principal": { "AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} -} + "Sid": "DenyDelete", + "Effect": "Deny", + "Action": "s3:DeleteObject", + "Resource": [], + "Principal": {"AWS": [f"arn:aws:iam::{AWS_ACCOUNT}:user/{IAM_USER}"]} + } diff --git a/ait/commons/util/cmd.py b/ait/commons/util/cmd.py index dfff6ea..31f0167 100644 --- a/ait/commons/util/cmd.py +++ b/ait/commons/util/cmd.py @@ -5,13 +5,16 @@ from ait.commons.util.aws_client import Aws, static_bucket_name from ait.commons.util.command.config import CmdConfig -from ait.commons.util.command.create import CmdCreate +from ait.commons.util.command.create import run from ait.commons.util.command.delete import CmdDelete from ait.commons.util.command.download import CmdDownload from ait.commons.util.command.list import CmdList from ait.commons.util.command.select import CmdSelect +from ait.commons.util.command.submit import CmdSubmit +from ait.commons.util.command.submit_file import CmdSubmitFile from ait.commons.util.command.sync import CmdSync from ait.commons.util.command.upload import CmdUpload +from ait.commons.util.command.view import CmdView from ait.commons.util.local_state import get_bucket, set_attr, get_attr from ait.commons.util.settings import NAME, VERSION from ait.commons.util.user_profile import profile_exists, get_profile @@ -37,6 +40,18 @@ def __init__(self, args): success, msg = CmdConfig(args).run() print(msg) + elif args.command == 'submit': + success, msg = CmdSubmit(args).run() + print(msg) + + elif args.command == 'submit-file': + success, msg = 
CmdSubmitFile(args).run() + print(msg) + + elif args.command == 'view': + success, msg = CmdView(args).run() + print(msg) + else: if profile_exists(args.profile): self.user_profile = get_profile(args.profile) @@ -82,7 +97,7 @@ def check_version(self): def execute(self, args): if args.command == 'create': - success, msg = CmdCreate(self.aws, args).run() + success, msg = run() self.exit(success, msg) elif args.command == 'select': diff --git a/ait/commons/util/command/config.py b/ait/commons/util/command/config.py index 43ac758..87119e4 100755 --- a/ait/commons/util/command/config.py +++ b/ait/commons/util/command/config.py @@ -15,22 +15,26 @@ def __init__(self, args): def run(self): + global valid_user + try: profile = self.args.profile if self.args.profile else DEFAULT_PROFILE + aws_cognito_authenticator = AwsCognitoAuthenticator(self) + # TODO: review the below bucket in args if self.args.bucket: set_bucket(self.args.bucket) if self.args.USERNAME and self.args.PASSWORD: - aws_cognito_authenticator = AwsCognitoAuthenticator(self) - - valid_user = aws_cognito_authenticator.validate_cognito_identity(profile, self.args.USERNAME, - self.args.PASSWORD) - - # check if valid user - if valid_user: - return True, 'Valid credentials' - else: - return False, 'Invalid credentials' + valid_user = aws_cognito_authenticator.is_registered_user(profile, self.args.USERNAME, + self.args.PASSWORD) + else: + print("No credentials provided!") + + # check if valid user + if valid_user: + return True, 'Valid credentials' + else: + return False, 'Invalid credentials' except Exception as e: return False, format_err(e, 'config') diff --git a/ait/commons/util/command/create.py b/ait/commons/util/command/create.py index e2e5da6..7a0bf9b 100644 --- a/ait/commons/util/command/create.py +++ b/ait/commons/util/command/create.py @@ -1,10 +1,10 @@ -import json +from ait.commons.util.aws_client import Aws -from botocore.exceptions import ClientError -from ait.commons.util.aws_client import Aws -from 
ait.commons.util.bucket_policy import DEFAULT_PERMS, allowDownloadStmt, denyDeleteStmt -from ait.commons.util.common import format_err +# TODO: review +def run(): + return False, ('create is no longer supported as upload areas (buckets) ' + 'are created while metadata submission') class CmdCreate: @@ -16,73 +16,3 @@ class CmdCreate: def __init__(self, aws: Aws, args): self.aws = aws self.args = args - - def run(self): - if not self.aws: - return False, 'You need configure your profile first' - - if self.aws.is_user: - return False, 'You don\'t have permission to use this command' - - area_name = self.args.NAME # S3 bucket folder name - perms = self.args.p # optional str, default 'ux' - center_name = self.args.DPC # morphic DPC - - try: - s3_client = self.aws.common_session.client('s3') - # new upload areas to be created with tagging instead of metadata - # upload area format - morphic-DPC/area_name/ - s3_client.put_object(Bucket=self.aws.bucket_name, - Key=('morphic-' + center_name.lower() + '/' + area_name + '/'), - Tagging=f'name={area_name}&perms={perms}') - - if perms == DEFAULT_PERMS: - pass # default perms as set in user policy (ux) applies - no need for further actions (deny or allow) - else: - # get bucket policy - bucket_policy = self.aws.common_session.resource('s3').BucketPolicy(self.aws.bucket_name) - try: - policy_str = bucket_policy.policy - except ClientError: - policy_str = '' - - if policy_str: - policy_json = json.loads(policy_str) - else: # no bucket policy - policy_json = json.loads('{ "Version": "2012-10-17", "Statement": [] }') - - allow_stmt = None - deny_stmt = None - - for stmt in policy_json['Statement']: - if stmt['Sid'] == 'AllowDownload': - allow_stmt = stmt - elif stmt['Sid'] == 'DenyDelete': - deny_stmt = stmt - - if 'd' in perms: # e.g 'ud' or 'udx' - # allow download - self.update_perms(policy_json, allow_stmt, allowDownloadStmt(), area_name) - - if 'x' not in perms: # e.g. 
'u' or 'ud' - # deny delete - self.update_perms(policy_json, deny_stmt, denyDeleteStmt(), area_name) - - try: - bucket_policy.put(Policy=json.dumps(policy_json)) - except ClientError: - pass - - return True, 'Created upload area with name ' + area_name + ' for ' + center_name + ' DPC' - - except Exception as e: - return False, format_err(e, 'create') - - def update_perms(self, policy, stmt, template, area): - if not stmt: - stmt = template - policy['Statement'].append(stmt) - if isinstance(stmt['Resource'], str): - stmt['Resource'] = [stmt['Resource']] + [f'arn:aws:s3:::{self.aws.bucket_name}/{area}/*'] - elif isinstance(stmt['Resource'], list): - stmt['Resource'].append(f'arn:aws:s3:::{self.aws.bucket_name}/{area}/*') diff --git a/ait/commons/util/command/delete.py b/ait/commons/util/command/delete.py index 637ca49..4f139e3 100644 --- a/ait/commons/util/command/delete.py +++ b/ait/commons/util/command/delete.py @@ -1,8 +1,3 @@ -import json - -from botocore.exceptions import ClientError - -from ait.commons.util.command.area import CmdArea from ait.commons.util.common import format_err from ait.commons.util.local_state import get_selected_area @@ -13,6 +8,7 @@ ''' + class CmdDelete: """ both admin and user, though user can't delete folder @@ -31,35 +27,15 @@ def run(self): return False, 'No area selected' try: - if self.args.d: # delete area - if self.aws.is_user: - return False, 'You don\'t have permission to use this command' + if self.args.a: # delete all files - confirm = input(f'Confirm delete upload area {selected_area}? Y/y to proceed: ') + confirm = input(f'Confirm delete all contents from {selected_area}? 
Y/y to proceed: ') if confirm.lower() == 'y': print('Deleting...') - deleted_keys = self.delete_upload_area(selected_area, incl_selected_area=True) - for k in deleted_keys: - print(k) - - # delete bucket policy for user-folder permissions - # only admin who has perms to set policy can do this - self.clear_area_perms_from_bucket_policy(selected_area) - - # clear selected area - CmdArea.clear(False) - return True, None - - if self.args.a: # delete all files - - confirm = input(f'Confirm delete all contents from {selected_area}? Y/y to proceed: ') - - if confirm.lower() == 'y': - print('Deleting...') + deleted_keys = self.delete_all_files_from_s3_bucket(selected_area, incl_selected_area=False) - deleted_keys = self.delete_upload_area(selected_area, incl_selected_area=False) for k in deleted_keys: print(k) @@ -67,17 +43,18 @@ def run(self): if self.args.PATH: # list of files and dirs to delete print('Deleting...') + for p in self.args.PATH: # you may have perm x but not d (to load or even do a head object) # so use obj_exists - prefix = selected_area + p - keys = self.all_keys(prefix) + prefix = p + keys = self.all_keys(selected_area, prefix) if keys: for k in keys: try: - self.delete_s3_object(k) + self.delete_single_file_from_s3_bucket(selected_area, k) print(k + ' Done.') except Exception as ex: if 'AccessDenied' in str(ex): @@ -94,73 +71,31 @@ def run(self): return False, format_err(e, 'delete') # based on obj_exists method - def all_keys(self, prefix): + def all_keys(self, selected_area, prefix): keys = [] response = self.aws.common_session.client('s3').list_objects_v2( - Bucket=self.aws.bucket_name, + Bucket=selected_area, Prefix=prefix, ) for obj in response.get('Contents', []): keys.append(obj['Key']) - + return keys - def delete_s3_object(self, key): + def delete_single_file_from_s3_bucket(self, selected_area, key): s3_resource = self.aws.common_session.resource('s3') - s3_obj = s3_resource.ObjectSummary(self.aws.bucket_name, key) + s3_obj = 
s3_resource.ObjectSummary(selected_area, key) s3_obj.delete() return key - def delete_upload_area(self, selected_area, incl_selected_area=False): + def delete_all_files_from_s3_bucket(self, selected_area, incl_selected_area=False): s3_resource = self.aws.common_session.resource('s3') - bucket = s3_resource.Bucket(self.aws.bucket_name) + bucket = s3_resource.Bucket(selected_area) deleted_keys = [] - objs_to_delete = bucket.objects.filter(Prefix=selected_area) if incl_selected_area else filter(lambda obj: obj.key != selected_area, bucket.objects.filter(Prefix=selected_area)) + objs_to_delete = bucket.objects.filter() if incl_selected_area else filter( + lambda obj: obj.key != selected_area, bucket.objects.filter()) for obj in objs_to_delete: obj.delete() deleted_keys.append(obj.key) return deleted_keys - - def clear_area_perms_from_bucket_policy(self, selected_area): - s3_resource = self.aws.common_session.resource('s3') - return CmdDelete.delete_dir_perms_from_bucket_policy(s3_resource, self.aws.bucket_name, selected_area) - - @staticmethod - def delete_dir_perms_from_bucket_policy(s3_res, bucket_name, area_name): - bucket_policy = s3_res.BucketPolicy(bucket_name) - try: - policy_str = bucket_policy.policy # throws NoSuchBucketPolicy - except ClientError: - policy_str = '' - - if policy_str: - policy = json.loads(policy_str) - policy_updated = False - - # remove any statement affecting single resource - # (this also maintains backward compatibility with the previous way of adding - # a statement per upload area) - for stmt in policy['Statement']: - - if isinstance(stmt['Resource'], str) and area_name in stmt['Resource']: - policy_updated = True - policy['Statement'].remove(stmt) # cannot modify if removing item while iterating over list - - # now check statement with resource list - for stmt in policy['Statement']: - if isinstance(stmt['Resource'], list): - # remove resource from resource list of statement but not statement - for res in stmt['Resource']: - if 
area_name in res: - policy_updated = True - stmt['Resource'].remove(res) - - if policy_updated: - try: - if policy['Statement']: - bucket_policy.put(Policy=json.dumps(policy)) # throws MalformedPolicy (policy document exceeds the maximum allowed size of 20480 bytes) - else: - bucket_policy.delete() - except ClientError: - pass diff --git a/ait/commons/util/command/download.py b/ait/commons/util/command/download.py index 983dbaf..f537ff8 100755 --- a/ait/commons/util/command/download.py +++ b/ait/commons/util/command/download.py @@ -30,7 +30,7 @@ def run(self): try: s3_resource = self.aws.common_session.resource('s3') - bucket = s3_resource.Bucket(self.aws.bucket_name) + bucket = s3_resource.Bucket(selected_area) # choice 1 all_files = self.args.a # optional bool diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index 853319e..cf79917 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -1,5 +1,56 @@ +import hashlib +import csv + from ait.commons.util.common import format_err from ait.commons.util.local_state import get_selected_area +from ait.commons.util.user_profile import get_profile +from urllib.parse import urlparse + + +def print_area(k, area): + print(k, end=' ') + p = '' + + if 'perms' in area: + p = area.get('perms') or '' + print(p.ljust(3), end=' ') + + if 'md5' in area: + p = area.get('md5') or '' + print(p.ljust(3), end=' ') + + if 'name' in area: + n = area.get('name') + print(f'{n}' if n else '', end=' ') + print() + + +def get_s3_path(): + while True: + s3_path = input("Enter the S3 path (e.g., s3://bucket-name/folder/): ").strip() + parsed_url = urlparse(s3_path) + + if parsed_url.scheme == 's3' and parsed_url.netloc: + return s3_path + else: + print("Invalid S3 path. 
Please enter a valid S3 path starting with 's3://'.") + + +def calculate_md5(s3_client, bucket_name, key): + md5_hash = hashlib.md5() + + try: + # Stream the object in chunks + response = s3_client.get_object(Bucket=bucket_name, Key=key) + + for chunk in response['Body'].iter_chunks(chunk_size=8192): + md5_hash.update(chunk) + + return md5_hash.hexdigest() + except Exception as e: + print(f"Failed to compute MD5 for {key}: {e}") + + return None class CmdList: @@ -11,111 +62,125 @@ class CmdList: def __init__(self, aws, args): self.aws = aws self.args = args + self.user = get_profile('morphic-util').username + self.processing = getattr(self.args, 'processing', None) self.s3_cli = self.aws.common_session.client('s3') def run(self): + if self.processing: + if self.user != 'morphic-admin': + return False, "Admin function only" + else: + print("Access granted") - if self.args.b: # list all areas in bucket - if self.aws.is_user: - return False, 'You don\'t have permission to use this command' + s3_path = get_s3_path() + self.list_s3_files(s3_path) - try: - folder_count = 0 - for area in self.list_bucket_areas(): - k = area["key"] - self.print_area(k, area) - folder_count += 1 - print_count(folder_count) return True, None - except Exception as e: - return False, format_err(e, 'list') - - else: # list selected area contents - selected_area = get_selected_area() + else: + selected_area = get_selected_area() # select area is a S3 bucket if not selected_area: return False, 'No area selected' - else: - if self.aws.is_user: - dir_prefix = 'morphic-' + self.aws.center_name + '/' - if dir_prefix not in selected_area: - selected_area = dir_prefix + selected_area + try: + self.list_bucket_contents(selected_area) + # print_count(folder_count + files_count) + return True, None + + except Exception as e: + return False, format_err(e, 'list') + + def list_s3_files(self, s3_path): + parsed_url = urlparse(s3_path) + bucket_name = parsed_url.netloc + prefix = parsed_url.path.lstrip('/') + 
output_file = 's3_file_md5s.tsv' - if selected_area.rstrip(selected_area[-1]) not in self.aws.user_dir_list: - return False, "Upload area does not exist or you don't have access to this area" + with open(output_file, 'w', newline='') as csvfile: + tsv_writer = csv.writer(csvfile, delimiter=',') + tsv_writer.writerow(['File Name', 'MD5 Hash']) # Write header row try: - selected_area += '' if selected_area.endswith('/') else '/' - n, p = self.get_name_and_perms(selected_area) - self.print_area(selected_area, dict(name=n, perms=p)) + response = self.s3_cli.list_objects_v2(Bucket=bucket_name, Prefix=prefix) - file_count = 0 - for k in self.list_area_contents(selected_area): - print(k) - if not k.endswith('/'): - file_count += 1 + if 'Contents' in response: + print(f"\nFiles in '{s3_path}'") - print_count(file_count) - return True, None + for obj in response['Contents']: + file_key = obj['Key'] + if not file_key.endswith('/'): # Skip folders + md5_hash = calculate_md5(self.s3_cli, bucket_name, file_key) + + if md5_hash: + print(f"{file_key} - MD5: {md5_hash}") + tsv_writer.writerow([file_key, md5_hash]) # Write to file + else: + print("\nNo files found.") except Exception as e: - return False, format_err(e, 'list') + print(f"\nError: {e}") - def print_area(self, k, area): - print(k, end=' ') - p = '' - if 'perms' in area: - p = area.get('perms') or '' - print(p.ljust(3), end=' ') - if 'name' in area: - n = area.get('name') - print(f'{n}' if n else '', end=' ') - print() - - def get_name_and_perms(self, k): - n, p = None, None - try: - tagSet = self.s3_cli.get_object_tagging(Bucket=self.aws.bucket_name, Key=k) - - if tagSet and tagSet['TagSet']: - kv = dict((tag['Key'], tag['Value']) for tag in tagSet['TagSet']) - n = kv.get('name', None) - p = kv.get('perms', None) - else: # for backward compatibility get name and perms from metadata - if not self.aws.is_user: # only admin can retrieve metadata (head_object) - resp = self.s3_cli.head_object(Bucket=self.aws.bucket_name, 
Key=k) - if resp and resp['Metadata']: - meta = resp['Metadata'] - n = meta.get('name', None) - p = meta.get('perms', None) - except: - pass - return n, p - - def list_bucket_areas(self): - areas = [] - result = self.s3_cli.list_objects_v2(Bucket=self.aws.bucket_name, Delimiter='/') + print(f"\nResults saved to {output_file}") + + def list_bucket_contents(self, selected_area, prefix=''): + result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/', Prefix=prefix) + # Folders dirs = result.get('CommonPrefixes', []) + for d in dirs: k = d.get('Prefix') - n, p = self.get_name_and_perms(k) - areas.append(dict(key=k, name=n, perms=p)) - return areas - - def list_area_contents(self, selected_area): - contents = [] - - s3_resource = self.aws.common_session.resource('s3') - bucket = s3_resource.Bucket(self.aws.bucket_name) - - for obj in bucket.objects.filter(Prefix=selected_area): - k = obj.key - if k != selected_area: - contents.append(k) - - return contents + print_area(k, {'key': k, 'md5': None, 'perms': 'dir'}) + self.list_bucket_contents(selected_area, prefix=k) + + # Files + files = result.get('Contents', []) + + for f in files: + k = f.get('Key') + head_object_response = self.s3_cli.head_object(Bucket=selected_area, Key=k) + metadata = head_object_response.get('Metadata', {}) + hash_md5 = metadata.get('md5', 'MD5 checksum not found') + print_area(k, {'key': k, 'md5': hash_md5, 'perms': 'file'}) + + def list_bucket_contents_and_return(self, selected_area, prefix=''): + """ + Lists the contents of an S3 bucket and returns a list of file keys. + + Parameters: + - selected_area: The S3 bucket name. + - prefix: The prefix to filter objects by (default is empty string, which lists all objects). + + Returns: + - A list of file keys in the bucket. + """ + file_keys = [] # Initialize an empty list to store file keys. + + # Define the recursive function to list bucket contents. 
+ def _list_bucket_contents(bucket, prefix): + # Call AWS S3 API to list objects with a specific prefix. + result = self.s3_cli.list_objects_v2(Bucket=bucket, Delimiter='/', Prefix=prefix) + + # Handle directories (folders) first. + dirs = result.get('CommonPrefixes', []) + for d in dirs: + k = d.get('Prefix') + # Recursively call the function to list contents of the subdirectory. + _list_bucket_contents(bucket, prefix=k) + + # Handle files at the current prefix level. + files = result.get('Contents', []) + for f in files: + k = f.get('Key') + # Add each file key to the list. + file_keys.append(k) + + # Start the recursive process to list all contents from the given prefix. + _list_bucket_contents(selected_area, prefix) + + # Return the final list of all file keys found in the bucket. + return file_keys def print_count(count): diff --git a/ait/commons/util/command/select.py b/ait/commons/util/command/select.py index 60f3ff5..833fd52 100644 --- a/ait/commons/util/command/select.py +++ b/ait/commons/util/command/select.py @@ -15,21 +15,11 @@ def __init__(self, aws, args): def run(self): try: if self.args.AREA: - key = self.args.AREA if self.args.AREA.endswith('/') else f'{self.args.AREA}/' + key = self.args.AREA - if self.aws.is_user: - key = 'morphic-' + self.aws.center_name + '/' + key - - if self.aws.obj_exists(key): - if not self.aws.is_user: - set_selected_area(key) - return True, f'Selected upload area is {key}' - else: - if key.rstrip(key[-1]) in self.aws.user_dir_list: - set_selected_area(key) - return True, f'Selected upload area is {key}' - else: - return False, f'Upload area does not exist or you do not have access to this area - {key}' + if self.aws.s3_bucket_exists(key): + set_selected_area(key) + return True, f'Selected upload area is {key}' else: return False, f'Upload area does not exist - {key}' else: diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py new file mode 100644 index 0000000..694cec2 --- /dev/null +++ 
b/ait/commons/util/command/submit.py @@ -0,0 +1,1300 @@ +import csv +import traceback + +import requests +import json +import pandas as pd +import numpy as np +from urllib.parse import urlparse + +from ait.commons.util.spreadsheet_util import SubmissionError +from ait.commons.util.user_profile import get_profile +from ait.commons.util.provider_api_util import ProviderApi + + +def matching_expression_alteration_and_cell_line(cell_line, expression_alteration): + return expression_alteration.expression_alteration_id.replace(" ", + "").strip() == cell_line.expression_alteration_id.replace( + " ", "").strip() + + +def get_entity_id_from_hal_link(url): + """ + Extracts and returns the ID from a given URL. + + Parameters: + url (str): The URL string. + + Returns: + str: The ID extracted from the URL. + """ + parsed_url = urlparse(url) + path_parts = parsed_url.path.split('/') + return path_parts[2] + + +def get_process_content(name): + process_data = { + "content": { + "type": name + } + } + + return process_data + + +def update_dataframe(input_df, created_entity_id, entity_id, raw_entity_rep_column_name): + """ + Updates the DataFrame with the new or modified cell line entity ID. + Returns: + - None + """ + entity_id_column_name = "Id" + + if entity_id_column_name not in input_df.columns: + input_df[entity_id_column_name] = np.nan + + input_df[entity_id_column_name] = input_df[entity_id_column_name].astype(object) + + input_df.loc[ + input_df[raw_entity_rep_column_name] == entity_id, + entity_id_column_name + ] = created_entity_id + + +def transform(file): + """ + Transforms the input file to a JSON object. + + Parameters: + file (str): The file path. + + Returns: + dict: The JSON object. 
+ """ + if file.endswith('.tsv'): + json_data = [] + with open(file, 'r', newline='') as file: + reader = csv.DictReader(file, delimiter='\t') + for row in reader: + json_data.append(row) + return {'content': json_data} + + elif file.endswith('.csv'): + df = pd.read_csv(file) + return {'content': df.to_dict(orient='records')} + + else: + with open(file, 'r') as file: + return json.load(file) + + +def create_new_submission_envelope(url, access_token): + """ + Creates a new submission envelope. + + Parameters: + url (str): The URL to send the request to. + access_token (str): Access token for authorization. + + Returns: + tuple: A tuple containing the response data and the status code. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json={}) + status_code = response.status_code + + if status_code in {200, 201}: + response_data = response.json() + return response_data, status_code + + return None, status_code + + +def post_to_provider_api_and_get_entity_id(url, data, access_token): + """ + Sends a POST request to the specified URL and returns the entity ID from the response. + + Parameters: + url (str): The URL to send the request to. + data (dict): The data to be sent in the POST request. + access_token (str): Access token for authorization. + + Returns: + str: The entity ID extracted from the response URL. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json=data) + response_data = response.json() + entity_url = response_data['_links']['self']['href'] + + return get_entity_id_from_hal_link(entity_url) + + +def post_to_provider_api(url, data_type_in_hal_link, data, access_token): + """ + Sends a POST request to the specified URL. + + Parameters: + url (str): The URL to send the request to. + data_type_in_hal_link (str): The data type in the HAL link. 
+ data (dict): The data to be sent in the POST request. + access_token (str): Access token for authorization. + + Returns: + str: The URL from the response. + """ + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, json=data) + response_data = response.json() + url = response_data['_links'][data_type_in_hal_link]['href'] + + return url + + +class CmdSubmit: + """ + A class to handle submission of studies, datasets, and biomaterials to a server. + + Attributes: + BASE_URL (str): The base URL for the server. + SUBMISSION_ENVELOPE_CREATE_URL (str): URL for creating submission envelopes. + SUBMISSION_ENVELOPE_BASE_URL (str): Base URL for submission envelopes. + args (Namespace): Command-line arguments. + access_token (str): Access token for authorization. + type (str): Type of submission (study, dataset, or biomaterial). + + Methods: + run(): Executes the submission process based on the type. + multi_type_submission(cell_lines, submission_envelope_id, access_token): Submits multiple cell lines. + typed_submission(type, file, access_token): Submits a single entity based on its type. + create_new_envelope_and_submit_entity(input_entity_type, data, access_token): Creates and submits a new entity. + use_existing_envelope_and_submit_entity(input_entity_type, data, submission_envelope_id, access_token): Submits an entity using an existing envelope. + link_dataset_to_study(dataset_id, study_id, access_token): Links a dataset to a study. + link_biomaterial_to_dataset(biomaterial_id, dataset_id, access_token): Links a biomaterial to a dataset. + link_biomaterial_to_process(biomaterial_id, process_id, access_token): Links a biomaterial to a process. + post_to_provider_api(url, data_type_in_hal_link, data, access_token): Sends a POST request to the provider API. + create_new_submission_envelope(url, access_token): Creates a new submission envelope. 
+ perform_hal_linkage(url, input_id, link_this, link_to, access_token): Performs HAL linkage. + transform(file): Transforms the input file to a JSON object. + put_to_provider_api(url, access_token): Sends a PUT request to the provider API. + """ + BASE_URL = 'https://api.ingest.archive.morphic.bio/' + SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" + SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" + + def __init__(self, args): + """ + Initializes the CmdSubmit class with command-line arguments. + + Parameters: + args (Namespace): Command-line arguments. + """ + self.args = args + self.access_token = get_profile('morphic-util').access_token + self.type = getattr(self.args, 'type', None) + self.file = getattr(self.args, 'file', None) + self.provider_api = ProviderApi(self.BASE_URL) + + def run(self): + """ + Executes the submission process based on the type of submission. + + Returns: + tuple: A tuple containing a boolean indicating success and the ID of the created entity. + """ + return self.typed_submission(self.type, self.file, self.access_token) + + def handle_cell_line(self, + cell_line, + expression_alterations, + cell_lines_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): + """ + Submits a cell line as a biomaterial entity to a specified submission envelope. + + Parameters: + - cell_line: The cell line object to be submitted. + - cell_lines_df: DataFrame containing information about cell lines. + - submission_envelope_id: ID of the submission envelope where the entity will be submitted. + - access_token: Access token for authentication and authorization. + - action: The action to be performed, either 'create' or 'modify'. + - errors: List to accumulate any error messages encountered. + + Returns: + - cell_line_entity_id: Entity ID of the submitted or modified cell line biomaterial. 
+ """ + if action.lower() == 'modify': + try: + success = self.patch_entity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) + if success: + print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") + update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, + 'clonal_cell_line.label') + return cell_line.id + else: + errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: + errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + else: + try: + cell_line_entity_id = self.create_cell_line_entity(cell_line, expression_alterations, + submission_envelope_id, dataset_id, access_token) + update_dataframe(cell_lines_df, cell_line_entity_id, cell_line.biomaterial_id, + 'clonal_cell_line.label') + return cell_line_entity_id + except Exception as e: + errors.append(f"Failed to create cell line: {cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + + def create_cell_line_entity(self, + cell_line, + expression_alterations, + submission_envelope_id, + dataset_id, + access_token): + """ + Creates a new cell line entity and links it with a dataset and expression alterations. + + Parameters: + - cell_line: The cell line object to be created. + - expression_alterations: Any associated expression alterations. + - submission_envelope_id: ID of the submission envelope where the entity will be submitted. + - dataset_id: The dataset ID to link the cell line entity to. + - access_token: Access token for authentication and authorization. + + Returns: + - cell_line_entity_id: The ID of the newly created cell line entity. 
+ """ + print(f"Creating Cell Line Biomaterial: {cell_line.biomaterial_id}") + + cell_line_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + cell_line.to_dict(), + submission_envelope_id, + access_token + ) + + if expression_alterations is not None: + self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, + expression_alterations) + + print(f"Linking Cell Line Biomaterial: {cell_line.biomaterial_id} to dataset {dataset_id}") + + self.link_to_dataset('biomaterial', dataset_id, cell_line_entity_id, access_token) + + return cell_line_entity_id + + def link_cell_line_with_expression_alterations(self, + access_token, + cell_line, + cell_line_entity_id, + expression_alterations): + for expression_alteration in expression_alterations: + if cell_line.expression_alteration_id is not None: + if matching_expression_alteration_and_cell_line(cell_line, expression_alteration): + print(f"Linking cell line {cell_line.biomaterial_id} " + f"as derived by process of {expression_alteration.expression_alteration_id}") + + self.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{cell_line_entity_id}/derivedByProcesses", + expression_alteration.id, 'processes', access_token + ) + + def handle_differentiated_cell_line(self, + cell_line_entity_id, + differentiated_cell_line, + differentiated_cell_lines_df, + differentiated, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): + """ + Handles a single differentiated cell line associated with a given cell line. + + Parameters: + - cell_line: The main cell line object. + - cell_line_entity_id: Entity ID of the main cell line already submitted. + - differentiated_cell_line: The differentiated cell line object. + - differentiated_cell_lines_df: DataFrame containing information about differentiated cell lines. + - library_preparations_df: DataFrame containing information about library preparations. 
+ - sequencing_file_df: DataFrame containing information about Sequence files. + - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. + """ + if action.lower() == 'modify': + try: + success = self.patch_entity('biomaterial', differentiated_cell_line.id, + differentiated_cell_line.to_dict(), + access_token) + if success: + print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") + + if differentiated: + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, + differentiated_cell_line.biomaterial_id, + 'differentiated_product.label') + else: + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, + differentiated_cell_line.biomaterial_id, + 'undifferentiated_product.label') + + return differentiated_cell_line.id + else: + errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: + errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + + else: + try: + differentiated_cell_line_id = self.create_differentiated_cell_line_entity(access_token, + cell_line_entity_id, + dataset_id, + differentiated_cell_line, + submission_envelope_id) + + if differentiated: + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, + differentiated_cell_line.biomaterial_id, + 'differentiated_product.label') + else: + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, + differentiated_cell_line.biomaterial_id, + 'undifferentiated_product.label') + return differentiated_cell_line_id + except Exception as e: + errors.append( + f"Failed to create differentiated/undifferentiated cell 
line: {differentiated_cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + + def create_differentiated_cell_line_entity(self, + access_token, + cell_line_entity_id, + dataset_id, + differentiated_cell_line, + submission_envelope_id): + """ + Creates a Differentiated Cell Line entity and links it to the submission envelope. + + Parameters: + ----------- + access_token : str + The authentication token. + cell_line_entity_id : str + The ID of the original cell line entity. + dataset_id : str + The dataset ID to link with. + differentiated_cell_line : object + The differentiated cell line object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. + + Returns: + -------- + str + The ID of the created differentiated cell line entity. + """ + + # Create the differentiated cell line biomaterial + if cell_line_entity_id is not None: + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " + f"as a child of Cell line: {cell_line_entity_id}") + differentiated_entity_id = self.create_child_biomaterial( + cell_line_entity_id, + differentiated_cell_line.to_dict(), + access_token + ) + + print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to envelope: {submission_envelope_id}") + + # Link the differentiated cell line entity to the submission envelope + self.link_entity_to_envelope( + 'biomaterial', + differentiated_entity_id, + submission_envelope_id, + access_token + ) + else: + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id}") + differentiated_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + differentiated_cell_line.to_dict(), + submission_envelope_id, + access_token + ) + + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to dataset: 
{dataset_id}") + + # Link the differentiated cell line to the dataset + self.link_to_dataset('biomaterial', dataset_id, + differentiated_entity_id, access_token) + + return differentiated_entity_id + + def link_cell_line_and_differentiated_cell_line(self, + access_token, + cell_line, + differentiated_or_undifferentiated_cell_line, + dataset_id, + submission_envelope_id, + action, + errors): + """ + Creates and links the differentiation process between the original cell line and the differentiated cell line. + + Parameters: + ----------- + access_token : str + The authentication token. + cell_line_entity_id : str + The ID of the original cell line entity. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. + + Returns: + -------- + str + The ID of the differentiation process entity created. + """ + if action.lower() != 'modify': + cell_line_biomaterial_id = cell_line.biomaterial_id + differentiated_or_undifferentiated_cell_line_biomaterial_id = differentiated_or_undifferentiated_cell_line.biomaterial_id + + try: + + print( + f"Cell line {cell_line_biomaterial_id} has differentiated cell lines, creating differentiation process " + f"to link them") + + # Create a differentiation process entity + differentiation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('differentiation'), + submission_envelope_id + ) + + print( + f"Linking Cell Line Biomaterial: {cell_line_biomaterial_id} as input to process : {differentiation_process_entity_id}") + + # Link the cell line entity as input to the differentiation process + self.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{cell_line.id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Differentiated cell line Biomaterial: " + 
f"{differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"as derived by process : {differentiation_process_entity_id}") + + # Link the differentiated cell line entity as derived by the differentiation process + self.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{differentiated_or_undifferentiated_cell_line.id}" + f"/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + return differentiation_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Cell line {cell_line_biomaterial_id} " + f"and Differentiated cell line {differentiated_or_undifferentiated_cell_line_biomaterial_id}") + raise SubmissionError(errors, e) + + def handle_library_preparation(self, + differentiated_entity_id, + library_preparation, + library_preparations_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): + """ + Handles a single library preparation associated with a given differentiated cell line. + + Parameters: + - differentiated_cell_line: The differentiated cell line object. + - differentiated_entity_id: Entity ID of the differentiated cell line already submitted. + - library_preparation: The library preparation object. + - library_preparations_df: DataFrame containing information about library preparations. + - sequencing_file_df: DataFrame containing information about sequencing files. + - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. 
+ """ + if action.lower() == 'modify': + try: + success = self.patch_entity('biomaterial', library_preparation.id, + library_preparation.to_dict(), + access_token) + if success: + print(f"Updated library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") + + update_dataframe(library_preparations_df, library_preparation.id, + library_preparation.biomaterial_id, + 'library_preparation.label') + return library_preparation.id + else: + errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: + errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") + raise SubmissionError(errors, e) + else: + try: + library_preparation_entity_id = self.create_library_preparation_entity(access_token, dataset_id, + differentiated_entity_id, + library_preparation, + submission_envelope_id) + update_dataframe(library_preparations_df, library_preparation_entity_id, + library_preparation.biomaterial_id, + 'library_preparation.label') + + return library_preparation_entity_id + except Exception as e: + errors.append(f"Failed to create library preparation biomaterial: {library_preparation.biomaterial_id}") + raise SubmissionError(errors, e) + + def create_library_preparation_entity(self, + access_token, + dataset_id, + differentiated_entity_id, + library_preparation, + submission_envelope_id): + """ + Creates a Library Preparation entity for the Differentiated Cell Line and links it to the submission envelope and dataset. + + Parameters: + ----------- + access_token : str + The authentication token. + dataset_id : str + The dataset ID to link with. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + library_preparation : object + The library preparation object containing details for creation. 
+ submission_envelope_id : str + The ID of the submission envelope. + + Returns: + -------- + str + The ID of the created library preparation entity. + """ + if differentiated_entity_id is not None: + print( + f"Creating Library Preparation as child of Differentiated Cell Line Biomaterial: {differentiated_entity_id}") + + # Create the library preparation biomaterial + library_preparation_entity_id = self.create_child_biomaterial( + differentiated_entity_id, + library_preparation.to_dict(), + access_token + ) + + print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} to envelope: {submission_envelope_id}") + + # Link the library preparation to the submission envelope + self.link_entity_to_envelope( + 'biomaterial', + library_preparation_entity_id, + submission_envelope_id, + access_token + ) + else: + print(f"Creating Library preparation Biomaterial: {library_preparation.biomaterial_id}") + library_preparation_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + library_preparation.to_dict(), + submission_envelope_id, + access_token + ) + + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} to dataset: {dataset_id}") + + # Link the library preparation to the dataset + self.link_to_dataset('biomaterial', dataset_id, library_preparation_entity_id, access_token) + + return library_preparation_entity_id + + def link_differentiated_and_library_preparation(self, + access_token, + differentiated_or_undifferentiated_cell_line, + library_preparation, + dataset_id, + submission_envelope_id, + action, + errors): + """ + Links the Differentiated Cell Line to the Library Preparation through a library preparation process. + + Parameters: + ----------- + access_token : str + The authentication token. + differentiated_entity_id : str + The ID of the differentiated cell line entity. 
+ library_preparation_entity_id : str + The ID of the library preparation entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. + + Returns: + -------- + str + The ID of the library preparation process entity created. + """ + if action.lower() != 'modify': + differentiated_or_undifferentiated_cell_line_biomaterial_id = differentiated_or_undifferentiated_cell_line.biomaterial_id + library_preparation_biomaterial_id = library_preparation.biomaterial_id + + try: + print(f"Differentiated cell line {differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"has library preparations, creating library " + f"preparation process to link them") + + # Create a library preparation process entity + library_preparation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('library_preparation'), + submission_envelope_id + ) + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"as input to library preparation process") + + # Link the differentiated cell line entity as input to the library preparation process + self.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{differentiated_or_undifferentiated_cell_line.id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_biomaterial_id} " + f"as derived by library preparation process") + + # Link the library preparation entity as derived by the library preparation process + self.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{library_preparation.id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + return library_preparation_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Differentiated Cell line " + 
f"{differentiated_or_undifferentiated_cell_line_biomaterial_id} and Library preparation" + f" {library_preparation_biomaterial_id}") + raise SubmissionError(errors, e) + + def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, + sequencing_file_df, submission_envelope_id, dataset_id, + access_token, action, errors): + """ + Handles a single sequencing file associated with a given library preparation. + + Parameters: + - library_preparation: The library preparation object. + - library_preparation_entity_id: Entity ID of the library preparation already submitted. + - sequencing_file: The sequencing file object. + - sequencing_file_df: DataFrame containing information about sequencing files. + - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. + """ + if action.lower() == 'modify': + try: + success = self.patch_entity('file', sequencing_file.id, + sequencing_file.to_dict(), + access_token) + + if success: + print(f"Updated sequencing file: {sequencing_file.id} / " + f"{sequencing_file.file_name}") + + update_dataframe(sequencing_file_df, sequencing_file.id, + sequencing_file.file_name, + 'sequence_file.label') + return sequencing_file.id + else: + errors.append( + f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + raise SubmissionError(errors) + except Exception as e: + errors.append(f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + raise SubmissionError(errors, e) + else: + try: + sequencing_file_entity_id = self.create_sequencing_file_entity(access_token, + dataset_id, + library_preparation_entity_id, + sequencing_file, + submission_envelope_id) + update_dataframe(sequencing_file_df, sequencing_file_entity_id, + sequencing_file.file_name, + 'sequence_file.label') + + return sequencing_file_entity_id + except Exception as e: + errors.append(f"Failed to create 
Sequencing file: {sequencing_file.file_name}") + raise SubmissionError(errors, e) + + def create_sequencing_file_entity(self, access_token, dataset_id, library_preparation_entity_id, sequencing_file, + submission_envelope_id): + """ + Creates a Sequencing File entity for the Library Preparation and links it to the submission envelope and dataset. + + Parameters: + ----------- + library_preparation_entity_id : str + The ID of the library preparation entity. + sequencing_file : object + The sequencing file object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. + dataset_id : str + The dataset ID to link with. + access_token : str + The authentication token. + + Returns: + -------- + str + The ID of the created sequencing file entity. + """ + + print( + f"Creating Sequencing file: {sequencing_file.file_name} as a result of sequencing the Library preparation " + f"biomaterial: {library_preparation_entity_id}") + + sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( + 'file', + sequencing_file.to_dict(), + submission_envelope_id, + access_token + ) + + print(f"Linking sequencing file: {sequencing_file_entity_id} to dataset: {dataset_id}") + + self.link_to_dataset('file', dataset_id, sequencing_file_entity_id, access_token) + + return sequencing_file_entity_id + + def link_library_preparation_and_sequencing_file(self, + access_token, + library_preparation, + sequencing_file, + dataset_id, + submission_envelope_id, + action, + errors): + """ + Links the Library Preparation to the Sequencing File through a sequencing process. + + Parameters: + ----------- + library_preparation_entity_id : str + The ID of the library preparation entity. + sequencing_file_entity_id : str + The ID of the sequencing file entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. + access_token : str + The authentication token. 
+ + Returns: + -------- + str + The ID of the sequencing process entity created. + """ + if action.lower() != 'modify': + library_preparation_biomaterial_id = library_preparation.biomaterial_id + sequence_file_name = sequencing_file.file_name + + try: + print(f"Library preparation {library_preparation_biomaterial_id} has " + f"generated sequencing files. Creating sequencing process to link the sequencing file") + + # Create a sequencing process entity + sequencing_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('sequencing'), + submission_envelope_id) + + print( + f"Linking Library preparation Biomaterial: {library_preparation_biomaterial_id} " + f"as input to process: {sequencing_process_entity_id}") + + # Link the library preparation entity as input to the sequencing process + self.perform_hal_linkage( + f"{self.BASE_URL}/biomaterials/{library_preparation.id}/inputToProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Sequencing file: {sequence_file_name} as derived by process: " + f"{sequencing_process_entity_id}") + + # Link the sequencing file entity as derived by the sequencing process + self.perform_hal_linkage( + f"{self.BASE_URL}/files/{sequencing_file.id}/derivedByProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + return sequencing_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Library Preparation " + f"{library_preparation_biomaterial_id} and Sequencing file" + f" {sequence_file_name}") + raise SubmissionError(errors, e) + + def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): + process_entity_id = self.use_existing_envelope_and_submit_entity( + 'process', + process_data, + submission_envelope_id, + access_token + ) + + print( + f"Linking process: {process_entity_id} " + f"to dataset: {dataset_id}") + self.link_to_dataset('process', dataset_id, 
process_entity_id, access_token) + + return process_entity_id + + def establish_links(self, + cell_lines, + cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, + library_preparations, + library_preparations_df, + sequencing_files, + sequencing_files_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): + """ + Handles the submission of multiple types of biomaterials (cell lines, + differentiated cell lines, library preparations) + to a specified submission envelope. + + Parameters: + - cell_lines: List of cell line objects to be submitted. + - cell_lines_df: DataFrame for tracking cell line entity IDs. + - differentiated_cell_lines_df: DataFrame for tracking differentiated cell line entity IDs. + - library_preparations_df: DataFrame for tracking library preparation entity IDs. + - sequencing_file_df: DataFrame for tracking sequencing file entity IDs. + - submission_envelope_id: ID of the submission envelope where entities will be linked. + - access_token: Access token for authentication and authorization. + + Returns: + - Tuple containing updated DataFrames and a status message. 
+ """ + try: + for cell_line in cell_lines: + for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: + if cell_line.biomaterial_id == differentiated_or_undifferentiated_cell_line.cell_line_biomaterial_id: + self.link_cell_line_and_differentiated_cell_line(access_token, + cell_line, + differentiated_or_undifferentiated_cell_line, + dataset_id, + submission_envelope_id, + action, + errors) + for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: + for library_preparation in library_preparations: + if (differentiated_or_undifferentiated_cell_line.biomaterial_id == + library_preparation.differentiated_biomaterial_id): + self.link_differentiated_and_library_preparation( + access_token, + differentiated_or_undifferentiated_cell_line, + library_preparation, + dataset_id, + submission_envelope_id, + action, + errors) + + for library_preparation in library_preparations: + for sequencing_file in sequencing_files: + if library_preparation.biomaterial_id == sequencing_file.library_preparation_id: + self.link_library_preparation_and_sequencing_file(access_token, + library_preparation, + sequencing_file, + dataset_id, + submission_envelope_id, + action, + errors) + + message = 'SUCCESS' + except Exception as e: + message = f"An error occurred: {str(e)}" + errors.append(message) + raise SubmissionError(message, e) + # Set DataFrames to None in case of an error + # cell_lines_df = None + # differentiated_cell_lines_df = None + # library_preparations_df = None + # sequencing_files_df = None + + return ([cell_lines_df, + differentiated_or_undifferentiated_cell_lines_df, + library_preparations_df, + sequencing_files_df], message) + + def typed_submission(self, type, file, access_token): + """ + Submits a single entity based on its type. + + Parameters: + type (str): The type of entity to be submitted ('study', 'dataset', 'biomaterial', 'process'). 
+ file (str): The file containing the data to be submitted. + access_token (str): Access token for authorization. + + Returns: + tuple: A tuple containing a boolean indicating success and the ID of the created entity. + """ + if type in ['study', 'dataset', 'biomaterial', 'process', 'file']: + data = transform(file) if file is not None else {} + + entity_id = self.create_new_envelope_and_submit_entity(type, data, access_token) + + if entity_id: + if type == 'dataset': + if self.args.study is not None: + study_id = self.args.study + self.link_dataset_to_study(entity_id, study_id, access_token) + else: + link_to_study = input("Do you want to link this dataset to a study? " + "(yes/no): ").lower() + if link_to_study == 'yes': + study_id = input("Input study id: ").lower() + self.link_dataset_to_study(entity_id, study_id, access_token) + elif type == 'biomaterial': + if self.args.dataset is not None: + dataset_id = self.args.dataset + self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) + else: + link_to_dataset = input("Do you want to link this biomaterial to a " + "dataset? (yes/no): ").lower() + if link_to_dataset == 'yes': + dataset_id = input("Input dataset id: ").lower() + + self.link_biomaterial_to_dataset(entity_id, dataset_id, access_token) + + # Linking biomaterial to process + if self.args.process is not None: + process_id = self.args.process + + self.link_biomaterial_to_process(entity_id, process_id, access_token) + return True, entity_id + else: + print("Unsupported type") + return False, "Unsupported type" + + def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_token): + """ + Creates and submits a new entity (study, dataset, biomaterial, or process) and returns its ID. + + Parameters: + input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). + data (dict): The data to be submitted. + access_token (str): Access token for authorization. 
+ + Returns: + str: The ID of the created entity. + """ + entity_map = { + 'study': 'studies', + 'dataset': 'datasets', + 'biomaterial': 'biomaterials', + 'process': 'processes' + } + + hal_entity = entity_map.get(input_entity_type) + + if not hal_entity: + return None + + entity_create_url = post_to_provider_api(self.SUBMISSION_ENVELOPE_CREATE_URL, hal_entity, data, + access_token) + entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) + entity_id = get_entity_id_from_hal_link(entity_self_hal_link) + + print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") + + return entity_id + + def patch_entity(self, input_entity_type, id, data, access_token): + entity_map = { + 'study': 'studies', + 'dataset': 'datasets', + 'biomaterial': 'biomaterials', + 'process': 'processes', + 'file': 'files' + } + hal_entity = entity_map.get(input_entity_type) + + if not hal_entity: + return False + + entity_patch_url = f"{self.BASE_URL}/{hal_entity}/{id}" + return self.patch_to_provider_api(entity_patch_url, data, access_token) + + def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token): + entity_map = { + 'biomaterial': 'biomaterials', + 'process': 'processes', + 'file': 'files' + } + hal_entity = entity_map.get(input_entity_type) + + if not hal_entity: + return False + + put_url = f"{self.BASE_URL}/datasets/{dataset_id}/{hal_entity}/{entity_id}" + return self.provider_api.put(put_url, access_token) + + def patch_to_provider_api(self, entity_patch_url, data, access_token): + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.patch(entity_patch_url, headers=headers, json=data) + return response.status_code // 100 == 2 + + def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submission_envelope_id, access_token): + """ + Submits an entity using an existing submission envelope and returns its ID. 
+ + Parameters: + input_entity_type (str): The type of entity to create ('study', 'dataset', 'biomaterial', 'process'). + data (dict): The data to be submitted. + submission_envelope_id (str): ID of the submission envelope. + access_token (str): Access token for authorization. + + Returns: + str: The ID of the created entity. + """ + entity_map = { + 'study': 'studies', + 'dataset': 'datasets', + 'biomaterial': 'biomaterials', + 'process': 'processes', + 'file': 'files' + } + hal_entity = entity_map.get(input_entity_type) + + if not hal_entity: + return None + + entity_create_url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/{hal_entity}" + entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) + entity_id = get_entity_id_from_hal_link(entity_self_hal_link) + + print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") + + return entity_id + + def link_dataset_to_study(self, dataset_id, study_id, access_token): + """ + Links a dataset to a study. + + Parameters: + dataset_id (str): The ID of the dataset. + study_id (str): The ID of the study. + access_token (str): Access token for authorization. + """ + print(f"Linking dataset {dataset_id} to study {study_id}") + + url = f"{self.BASE_URL}/studies/{study_id}/datasets/{dataset_id}" + self.provider_api.put(url, access_token) + + print(f"Dataset linked successfully to study: {study_id}") + + def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): + """ + Links a biomaterial to a dataset. + + Parameters: + biomaterial_id (str): The ID of the biomaterial. + dataset_id (str): The ID of the dataset. + access_token (str): Access token for authorization. 
+ """ + print(f"Linking biomaterial {biomaterial_id} to dataset {dataset_id}") + + url = f"{self.BASE_URL}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" + self.provider_api.put(url, access_token) + + print(f"Biomaterial linked successfully to dataset: {dataset_id}") + + def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): + """ + Links a biomaterial to a process. + + Parameters: + biomaterial_id (str): The ID of the biomaterial. + process_id (str): The ID of the process. + access_token (str): Access token for authorization. + """ + print(f"Linking biomaterial {biomaterial_id} to process {process_id}") + + url = f"{self.BASE_URL}/biomaterials/{biomaterial_id}/inputToProcesses" + self.perform_hal_linkage(url, process_id, 'processes', access_token) + + def delete_submission(self, submission_envelope_id, access_token, force_delete=False): + """ + Sends a DELETE request to delete a submission envelope. + + Parameters: + submission_envelope_id (str): ID of the submission envelope to delete. + access_token (str): Access token for authorization. + force_delete (bool): Whether to force delete the submission envelope (default: False). + + Returns: + bool: True if the deletion was successful, False otherwise. + """ + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}" + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {access_token}' + } + + params = {'force': str(force_delete).lower()} + + response = requests.delete(url, headers=headers, params=params) + + return response.status_code // 100 == 2 + + def perform_hal_linkage(self, url, input_id, link_to, access_token): + """ + Performs HAL linkage by sending a POST request. + + Parameters: + url (str): The URL to send the request to. + input_id (str): The ID of the input entity. + link_to (str): The entity to link to. + access_token (str): Access token for authorization. + + Raises: + Exception: If the linkage fails. 
+ """ + headers = { + 'Content-Type': 'text/uri-list', + 'Authorization': f'Bearer {access_token}' + } + + response = requests.post(url, headers=headers, data=f"{self.BASE_URL}/{link_to}/{input_id}") + + if response.status_code != 200: + raise Exception(f"Failed to link biomaterial to process {input_id}. " + f"Status code: {response.status_code}, Response: {response.text}") + else: + print("Linkage successful") + + def create_child_biomaterial(self, cell_line_entity_id, body, access_token): + url = f"{self.BASE_URL}/biomaterials/{cell_line_entity_id}/childBiomaterials" + + entity_id = post_to_provider_api_and_get_entity_id(url, body, access_token) + return entity_id + + def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, access_token): + """ + Links an entity to a submission envelope. + + Parameters: + type (str): The type of the entity (e.g., 'biomaterial', 'file'). + entity_id (str): The ID of the entity to link. + submission_envelope_id (str): The ID of the submission envelope. + access_token (str): Access token for authorization. + """ + if type == 'biomaterial': + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/biomaterials/{entity_id}" + self.provider_api.put(url, access_token) + elif type == 'file': + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/files/{entity_id}" + self.provider_api.put(url, access_token) + + def delete_dataset(self, dataset, access_token): + """ + Deletes a dataset along with its associated biomaterials, processes, and data files. + + Parameters: + dataset (str): The ID of the dataset to delete. + access_token (str): Access token for authorization. 
+ """ + fetched_dataset = self.provider_api.get(f"{self.BASE_URL}/datasets/{dataset}", access_token) + print(f"Dataset fetched successfully: {dataset}") + print(f"Initiating delete of {dataset}") + + biomaterials = fetched_dataset.get('biomaterials', []) + processes = fetched_dataset.get('processes', []) + data_files = fetched_dataset.get('dataFiles', []) + + print("Deleting Biomaterials:") + for biomaterial in biomaterials: + print(f"Deleting {biomaterial}") + self.provider_api.delete(f"{self.BASE_URL}/biomaterials/{biomaterial}", access_token) + + print("\nDeleting Processes:") + for process in processes: + print(f"Deleting {process}") + self.provider_api.delete(f"{self.BASE_URL}/processes/{process}", access_token) + + print("\nDeleting Data Files:") + for data_file in data_files: + print(f"Deleting {data_file}") + self.provider_api.delete(f"{self.BASE_URL}/files/{data_file}", access_token) + + # print(f"\nDeleting the dataset: {dataset}") + # self.provider_api.delete(f"{self.BASE_URL}/datasets/{dataset}", access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py new file mode 100644 index 0000000..dfb8e40 --- /dev/null +++ b/ait/commons/util/command/submit_file.py @@ -0,0 +1,697 @@ +# Import necessary modules/classes from ait.commons.util package +import os +import sys +from datetime import datetime + +import numpy as np +import pandas as pd +from ait.commons.util.aws_client import Aws +from ait.commons.util.command.list import CmdList +from ait.commons.util.command.submit import CmdSubmit, get_entity_id_from_hal_link, create_new_submission_envelope +from ait.commons.util.command.upload import CmdUpload +from ait.commons.util.user_profile import get_profile +from ait.commons.util.provider_api_util import ProviderApi +from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter, ValidationError, \ + merge_library_preparation_sequencing_file, merge_cell_line_and_differentiated_cell_line, \ + 
merge_differentiated_cell_line_and_library_preparation, SubmissionError + + +# Define a class for handling submission of a command file +def validate_sequencing_files(sequencing_files, + list_of_files_in_upload_area, + dataset, + errors): + for sequencing_file in sequencing_files: + match_found = False # Flag to indicate if a match is found + + for file_key in list_of_files_in_upload_area: + if sequencing_file.file_name == file_key: + match_found = True + break # Exit the inner loop if a match is found + + if not match_found: + errors.append( + f"No matching file found for sequencing file: {sequencing_file.file_name} " + f"in the upload area for the dataset: {dataset}" + ) + + +def get_content(unique_value): + return {"content": unique_value} + + +def _create_expression_alterations(submission_instance, + submission_envelope_id, + access_token, + expression_alterations, + expression_alterations_df): + expression_alterations_entity_id_column_name = "Id" + + if expression_alterations_entity_id_column_name not in expression_alterations_df.columns: + expression_alterations_df[expression_alterations_entity_id_column_name] = np.nan + + for expression_alteration in expression_alterations: + # Submit the expression alteration and retrieve the ID + expression_alteration_id = submission_instance.use_existing_envelope_and_submit_entity( + 'process', + expression_alteration.to_dict(), # Convert the object to a dictionary for submission + submission_envelope_id, + access_token + ) + # Set the retrieved ID in the ExpressionAlterationStrategy object + expression_alteration.id = expression_alteration_id + expression_alterations_df[expression_alterations_entity_id_column_name] = ( + expression_alterations_df[expression_alterations_entity_id_column_name] + .astype(object)) + expression_alterations_df.loc[ + expression_alterations_df[ + 'expression_alteration.label'] == expression_alteration.expression_alteration_id, + expression_alterations_entity_id_column_name + ] = 
expression_alteration_id + + return expression_alterations + + +class CmdSubmitFile: + BASE_URL = 'https://api.ingest.archive.morphic.bio/' + SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" + SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" + + def __init__(self, args): + """ + Initialize CmdSubmitFile instance. + + Args: + args: Command-line arguments passed to the script. + """ + self.args = args + self.user_profile = get_profile('morphic-util') + self.access_token = self.user_profile.access_token + self.aws = Aws(self.user_profile) + self.provider_api = ProviderApi(self.BASE_URL) + self.validation_errors = [] + self.submission_errors = [] + self.submission_envelope_id = None + + # Assign and validate required arguments + self.action = self._get_required_arg('action', "Submission action (ADD, MODIFY or DELETE) is mandatory") + self.dataset = self._get_required_arg('dataset', ( + "Dataset is mandatory to be registered before submitting dataset metadata. " + "Please submit your study using the submit option, register your dataset using " + "the submit option, and link your dataset to your study before proceeding with this submission." + )) + + if self.dataset: + try: + self.provider_api.get(f"{self.BASE_URL}/datasets/{self.dataset}", + self.access_token) + except Exception as e: + print(f"Dataset does not exist {self.dataset}") + sys.exit(1) + + # Validate file argument only if action is not DELETE + if self.action != 'DELETE': + self.file = self._get_required_arg('file', "File is mandatory") + else: + print(f"Deleting dataset {self.dataset}") + + def _get_required_arg(self, attr_name, error_message): + """ + Helper function to get a required argument and print an error message if it's missing. + + Args: + attr_name (str): The name of the attribute to check in self.args. + error_message (str): The error message to print if the attribute is missing. 
+ + Returns: + The value of the attribute if it exists, otherwise None. + """ + value = getattr(self.args, attr_name, None) + if value is None: + print(error_message) + sys.exit(1) + return value + + def run(self): + """ + Execute the command file submission process. + """ + submission_instance = CmdSubmit(self) + + try: + if self._is_delete_action(): + return self._handle_delete(submission_instance) + + list_of_files_in_upload_area = self._list_files_in_upload_area() + + if self.file: + try: + self._process_submission(submission_instance, list_of_files_in_upload_area) + return True, "SUBMISSION IS SUCCESSFUL." + except Exception as e: + return self._delete_actions(self.submission_envelope_id, submission_instance, e) + except KeyboardInterrupt: + # Handle the interruption and exit gracefully + print("\nProcess interrupted by user. Exiting gracefully...") + self._delete_actions(self.submission_envelope_id, submission_instance, None) + sys.exit(0) # Exit with a zero status code indicating a clean exit + except Exception as e: + # Handle any other unexpected exceptions + print(f"An unexpected error occurred: {str(e)}") + self._delete_actions(self.submission_envelope_id, submission_instance, None) + sys.exit(1) # Exit with a non-zero status code indicating an error + + def _is_delete_action(self): + """Check if the current action is 'DELETE'.""" + return self.action.lower() == 'delete' + + def _handle_delete(self, submission_instance): + """Handle the deletion of a dataset.""" + self.file = None + submission_instance.delete_dataset(self.dataset, self.access_token) + return True, None + + def _list_files_in_upload_area(self): + """List files in the upload area.""" + list_instance = CmdList(self.aws, self.args) + return list_instance.list_bucket_contents_and_return(self.dataset, '') + + def _process_submission(self, submission_instance, list_of_files_in_upload_area): + try: + """Process the file submission.""" + parser = SpreadsheetSubmitter(self.file) + parsed_data = 
self._parse_spreadsheet(parser) + self._validate_and_upload(parsed_data, list_of_files_in_upload_area) + + # Extract parsed data + expression_alterations = parsed_data['expression_alterations'] + expression_alterations_df = parsed_data['expression_alterations_df'] + parent_cell_line_names = parsed_data['parent_cell_line_names'] + cell_lines = parsed_data['cell_lines'] + cell_lines_df = parsed_data['cell_lines_df'] + differentiated_cell_lines = parsed_data['differentiated_cell_lines'] + differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] + undifferentiated_cell_lines = parsed_data['undifferentiated_cell_lines'] + undifferentiated_cell_lines_df = parsed_data['undifferentiated_cell_lines_df'] + library_preparations = parsed_data['library_preparations'] + library_preparations_df = parsed_data['library_preparations_df'] + sequencing_files = parsed_data['sequencing_files'] + sequencing_files_df = parsed_data['sequencing_files_df'] + differentiated = parsed_data['differentiated'] + cell_line_sheet_name = parsed_data['cell_line_sheet_name'] + + if differentiated: + differentiated_or_undifferentiated_cell_line_sheet_name = parsed_data[ + 'differentiated_cell_line_sheet_name'] + else: + differentiated_or_undifferentiated_cell_line_sheet_name = parsed_data[ + 'undifferentiated_cell_line_sheet_name'] + + # Initialize lists for created entities + created_expression_alterations = [] + created_cell_lines = [] + created_differentiated_or_undifferentiated_cell_lines = [] + created_library_preparations = [] + created_sequencing_files = [] + + if self._is_add_action(): + self._create_submission_envelope() + + if cell_lines and cell_lines_df is not None: + if self._is_add_action(): + created_expression_alterations = self._handle_expression_alterations( + submission_instance, + expression_alterations, + expression_alterations_df, + parent_cell_line_names, + cell_lines + ) + + created_cell_lines = self._create_cell_lines( + submission_instance, cell_lines, 
cell_lines_df, created_expression_alterations) + + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df, differentiated) + + if (undifferentiated_cell_lines and undifferentiated_cell_lines_df is not None + and not differentiated): + created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( + submission_instance, undifferentiated_cell_lines, undifferentiated_cell_lines_df, differentiated) + + if library_preparations and library_preparations_df is not None: + created_library_preparations = self._create_library_preparations( + submission_instance, library_preparations, library_preparations_df) + + if sequencing_files and sequencing_files_df is not None: + created_sequencing_files = self._create_sequencing_files( + submission_instance, sequencing_files, sequencing_files_df) + + updated_dfs, message = self._establish_links(submission_instance, + created_cell_lines, + cell_lines_df, + created_differentiated_or_undifferentiated_cell_lines, + differentiated_cell_lines_df if differentiated_cell_lines_df is not None else undifferentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df) + + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, + expression_alterations_df, + cell_line_sheet_name, + differentiated_or_undifferentiated_cell_line_sheet_name) + else: + return self._delete_actions(self.submission_envelope_id, + submission_instance, + None) + except ValidationError as e: + print(f"Validation Error: {e.errors}") + # self._delete_actions(self.submission_envelope_id, submission_instance, e) + sys.exit(1) + except SubmissionError as e: + print(f"Submission Error: {e.errors}") + self._delete_actions(self.submission_envelope_id, submission_instance, 
e) + sys.exit(1) + except Exception as e: + print(f"An unexpected error occurred during submission processing: {e}") + self._delete_actions(self.submission_envelope_id, submission_instance, e) + raise e # Re-raise the exception to propagate it upwards + + def _handle_parent_cell_line(self, submission_instance, parent_cell_line_name): + """Handles the creation of a parent cell line.""" + parent_cell_line_id = None + + if parent_cell_line_name: + print(f"Creating parental cell line with name {parent_cell_line_name}") + parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + + return parent_cell_line_id + + def _handle_expression_alterations(self, + submission_instance, + expression_alterations, + expression_alterations_df, + parent_cell_line_names, + cell_lines): + """Handles the creation of expression alterations and links them to the parent cell line if needed.""" + created_expression_alterations = [] + + if expression_alterations and expression_alterations_df is not None: + created_expression_alterations = self._submit_expression_alterations( + submission_instance, expression_alterations, expression_alterations_df + ) + + if created_expression_alterations: + for parent_cell_line_name in parent_cell_line_names: + self._link_parent_cell_line_expression_alteration( + submission_instance, + self.access_token, + parent_cell_line_name, + cell_lines, + created_expression_alterations + ) + + return created_expression_alterations + + def _parse_spreadsheet(self, parser): + try: + # Determine the necessary sheet names + tab_names = parser.list_sheets() + + cell_line_sheet_name = next( + (name for name in ["Cell line", "Clonal cell line"] if name in tab_names), None + ) + + differentiated_cell_line_sheet_name = next( + (name for name in ["Differentiated cell line", "Differentiated product"] if name in tab_names), None + ) + + 
undifferentiated_cell_line_sheet_name = ( + "Undifferentiated product" if "Undifferentiated product" in tab_names else None + ) + + undifferentiated_cell_lines = [] + undifferentiated_cell_lines_df = None + + differentiated_cell_lines = [] + differentiated_cell_lines_df = None + + differentiated = False + + # Validate the presence of required sheets + if not cell_line_sheet_name: + self.validation_errors.append("Spreadsheet must contain a " + "'Cell line' or 'Clonal cell line' sheet.") + + if not (differentiated_cell_line_sheet_name or undifferentiated_cell_line_sheet_name): + self.validation_errors.append( + "Spreadsheet must contain a " + "'Differentiated cell line', 'Undifferentiated product', " + "or 'Differentiated product' sheet." + ) + + # Parse different sections of the spreadsheet + expression_alterations, expression_alterations_df = parser.get_expression_alterations( + 'Expression alteration', self.action, self.validation_errors + ) + + cell_lines, cell_lines_df, parent_cell_line_names = parser.get_cell_lines( + cell_line_sheet_name, self.action, self.validation_errors + ) + + if differentiated_cell_line_sheet_name: + differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( + differentiated_cell_line_sheet_name, self.action, self.validation_errors + ) + + if undifferentiated_cell_line_sheet_name: + undifferentiated_cell_lines, undifferentiated_cell_lines_df = parser.get_undifferentiated_cell_lines( + undifferentiated_cell_line_sheet_name, self.action, self.validation_errors + ) + + # Check for errors and merge data + if differentiated_cell_lines and undifferentiated_cell_lines: + self.validation_errors.append( + "A spreadsheet cannot contain rows in both differentiated and undifferentiated cell lines/ products" + ) + + if differentiated_cell_lines: + differentiated = True + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, + self.validation_errors) + + if undifferentiated_cell_lines 
and not differentiated: + merge_cell_line_and_differentiated_cell_line(cell_lines, undifferentiated_cell_lines, + self.validation_errors) + + library_preparations, library_preparations_df = parser.get_library_preparations( + 'Library preparation', differentiated, self.action, self.validation_errors + ) + + if differentiated_cell_lines: + merge_differentiated_cell_line_and_library_preparation( + differentiated_cell_lines, library_preparations, self.validation_errors + ) + + if undifferentiated_cell_lines and not differentiated: + merge_differentiated_cell_line_and_library_preparation( + undifferentiated_cell_lines, library_preparations, self.validation_errors + ) + + sequencing_files, sequencing_files_df = parser.get_sequencing_files( + 'Sequence file', self.action, self.validation_errors + ) + + merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) + + # Return the parsed data as a dictionary + return { + "expression_alterations": expression_alterations, + "expression_alterations_df": expression_alterations_df, + "cell_lines": cell_lines, + "cell_lines_df": cell_lines_df, + "parent_cell_line_names": parent_cell_line_names, + "differentiated_cell_lines": differentiated_cell_lines, + "differentiated_cell_lines_df": differentiated_cell_lines_df, + "undifferentiated_cell_lines": undifferentiated_cell_lines, + "undifferentiated_cell_lines_df": undifferentiated_cell_lines_df, + "library_preparations": library_preparations, + "library_preparations_df": library_preparations_df, + "sequencing_files": sequencing_files, + "sequencing_files_df": sequencing_files_df, + "differentiated": differentiated, + "cell_line_sheet_name": cell_line_sheet_name, + "differentiated_cell_line_sheet_name": differentiated_cell_line_sheet_name, + "undifferentiated_cell_line_sheet_name": undifferentiated_cell_line_sheet_name + } + except Exception as e: + print(f"Exception occurred:", e) + + self.validation_errors.append(f"Spreadsheet is invalid 
{self.file}") + return None + + def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): + # Validate the parsed data and upload the file. + """ + validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, + self.validation_errors) + """ + """ + Handle validation errors, including interacting with the user in case of a missing sheet. + """ + try: + # Exit now if there are validation errors in the spreadsheet + if self.validation_errors: + raise ValidationError(self.validation_errors) + except ValidationError: + # Check if the error is related to a missing sheet + missing_sheet_errors = [msg for msg in self.validation_errors if "Missing sheet" in msg] + + if missing_sheet_errors: + # Extract the sheet name(s) from the errors + missing_sheets = ', '.join([msg.split("'")[1] for msg in missing_sheet_errors]) + # Ask the user whether to proceed + """ + user_response = input( + f"A required sheet '{missing_sheets}' is missing. Do you want to proceed anyway? (yes/no): ").strip().lower() + if user_response == 'yes': + print("Proceeding with execution...") + else: + """ + print("Execution terminated due to missing required sheet.") + sys.exit(1) + else: + # Print the error message + # print(f"Validation Error: {e.errors}") + # Exit the program with a non-zero status code to indicate an error + # sys.exit(1) + raise ValidationError(self.validation_errors) + + print(f"File {self.file} is validated successfully. 
Initiating submission") + print(f"File {self.file} being uploaded to storage") + + upload_instance = CmdUpload(self.aws, self.args) + upload_instance.upload_file(self.dataset, self.file, os.path.basename(self.file)) + + def _is_add_action(self): + """Check if the current action is 'ADD'.""" + return self.action.lower() == 'add' + + def _is_modify_action(self): + """Check if the current action is 'MODIFY'.""" + return self.action.lower() == 'modify' + + def _create_submission_envelope(self): + """Create a new submission envelope.""" + submission_envelope_response, status_code = create_new_submission_envelope( + self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token + ) + if status_code in (200, 201): + self.submission_envelope_id = get_entity_id_from_hal_link( + submission_envelope_response['_links']['self']['href']) + print(f"Submission envelope for this submission is: {self.submission_envelope_id}") + else: + raise SubmissionError(f"Failed to create submission envelope. Status code: {status_code}") + + def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): + """Submit the parent cell line.""" + return submission_instance.use_existing_envelope_and_submit_entity( + 'biomaterial', get_content(parent_cell_line_name), + self.submission_envelope_id, self.access_token + ) + + def _submit_expression_alterations(self, + submission_instance, + expression_alterations, + expression_alterations_df): + """Submit expression alterations.""" + return _create_expression_alterations( + submission_instance, self.submission_envelope_id, self.access_token, + expression_alterations, expression_alterations_df + ) + + def _create_cell_lines(self, + submission_instance, + cell_lines, + cell_lines_df, + expression_alterations): + for cell_line in cell_lines: + cell_line_entity_id = submission_instance.handle_cell_line(cell_line, expression_alterations, cell_lines_df, + self.submission_envelope_id, self.dataset, + self.access_token, self.action, + 
self.submission_errors) + cell_line.id = cell_line_entity_id + + return cell_lines + + def _create_differentiated_cell_lines(self, + submission_instance, + differentiated_cell_lines, + differentiated_cell_lines_df, + differentiated): + for differentiated_cell_line in differentiated_cell_lines: + differentiated_cell_line_entity_id = submission_instance.handle_differentiated_cell_line(None, + differentiated_cell_line, + differentiated_cell_lines_df, + differentiated, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + differentiated_cell_line.id = differentiated_cell_line_entity_id + + return differentiated_cell_lines + + def _create_library_preparations(self, + submission_instance, + library_preparations, + library_preparations_df): + for library_preparation in library_preparations: + library_preparation_entity_id = submission_instance.handle_library_preparation(None, + library_preparation, + library_preparations_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + library_preparation.id = library_preparation_entity_id + + return library_preparations + + def _create_sequencing_files(self, + submission_instance, + sequencing_files, + sequencing_files_df): + for sequencing_file in sequencing_files: + sequencing_file_entity_id = submission_instance.handle_sequencing_file(None, + sequencing_file, + sequencing_files_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + sequencing_file.id = sequencing_file_entity_id + + return sequencing_files + + def _establish_links(self, + submission_instance, + created_cell_lines, + cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df): + """Perform the main submission.""" + # Unpack the 
returned values into a list and the message separately + updated_dfs, message = submission_instance.establish_links( + created_cell_lines, + cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors + ) + + return updated_dfs, message + + def _save_and_upload_results(self, + updated_dfs, + expression_alteration_df, + cell_line_sheet_name, + differentiated_or_undifferentiated_cell_line_sheet_name): + """Save the updated dataframes and upload the results.""" + current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + output_file = f"submission_result_{current_time}.xlsx" + try: + # List of updated DataFrames and corresponding sheet names + dataframes = [ + (updated_dfs[0], cell_line_sheet_name), + (updated_dfs[1], differentiated_or_undifferentiated_cell_line_sheet_name), + (updated_dfs[2], 'Library preparation'), + (updated_dfs[3], 'Sequence file'), + (expression_alteration_df, 'Expression alteration strategy') + ] + + # Create the Excel file and write only non-null DataFrames + with pd.ExcelWriter(output_file, engine='openpyxl') as writer: + for df, sheet_name in dataframes: + if df is not None: # Check if the DataFrame is not None + df.to_excel(writer, sheet_name=sheet_name, index=False) + if os.path.exists(output_file): + CmdUpload(self.aws, self.args).upload_file(self.dataset, output_file, os.path.basename(output_file)) + print(f"File {output_file} uploaded successfully.") + else: + raise FileNotFoundError(f"The output file {output_file} was not created or cannot be found.") + except Exception as e: + print(f"Failed to upload file {output_file}. 
Error: {e}, Refer dataset {self.dataset} for tracing metadata") + + def _delete_actions(self, submission_envelope_id, submission_instance, error=None): + """Handle actions needed when a submission fails.""" + try: + if self._is_add_action(): + self._handle_add_action_failure(submission_envelope_id, submission_instance, error) + elif self._is_modify_action(): + self._handle_modify_action_failure(error) + except Exception as e: + print(f"Failed to rollback submission {submission_envelope_id}: {str(e)}") + + def _handle_add_action_failure(self, submission_envelope_id, submission_instance, error): + """Handle failure during 'ADD' action.""" + print("SUBMISSION has failed, rolling back") + print("SUBMISSION ERRORS are listed below. Any metadata created will be deleted now, please wait until " + "the clean-up finishes") + print("\n".join(self.submission_errors)) + + submission_instance.delete_submission(submission_envelope_id, self.access_token, True) + submission_instance.delete_dataset(self.dataset, self.access_token) + + if error: + return False, f"An error occurred: {str(error)}" + else: + return False, "Submission has failed, rolled back" + + def _handle_modify_action_failure(self, error): + """Handle failure during 'MODIFY' action.""" + print("SUBMISSION has failed, contact the support team for next actions") + print("SUBMISSION ERRORS are listed below.") + print("\n".join(self.submission_errors)) + + if error: + return False, f"An error occurred: {str(error)}" + else: + return False, "Submission has failed, rolled back" + + def _link_parent_cell_line_expression_alteration(self, + submission_instance, + access_token, + parent_cell_line_name, + cell_lines, + created_expression_alterations): + parent_cell_line_id = self._handle_parent_cell_line(submission_instance, parent_cell_line_name) + + for cell_line in cell_lines: + if cell_line.parental_cell_line_name == parent_cell_line_name: + for expression_alteration in created_expression_alterations: + if 
def compute_md5(file_path, chunk_size=1024 * 1024):
    """Compute the MD5 hash of the file.

    Parameters:
        file_path (str): Path of the file to hash.
        chunk_size (int, optional): Number of bytes read per iteration.
            Defaults to 1 MiB so large data files need few read calls.

    Returns:
        str: The hexadecimal MD5 digest of the file content.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # read(0) returns b"" at once, which would silently yield the digest
        # of an empty file.
        raise ValueError("chunk_size must be a positive number of bytes")

    # The digest is an integrity checksum, not a security control; saying so
    # keeps MD5 usable on builds that restrict it.
    hash_md5 = hashlib.md5(usedforsecurity=False)

    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
Use -o to overwrite.") elif file_size == 0: print(f"{data_file} is an empty file") @@ -41,26 +55,29 @@ def upload_file(self, data_file, key): content_type = file_type.mime content_type += '; dcp-type=data' - s3.Bucket(self.aws.bucket_name).upload_file(Filename=data_file, - Key=key, - Callback=ProgressBar(target=data_file, total=file_size), - ExtraArgs={'ContentType': content_type} - ) + s3.Bucket(selected_area).upload_file(Filename=data_file, + Key=destination_file, + Callback=ProgressBar(target=data_file, total=file_size), + ExtraArgs={'ContentType': content_type, + 'Metadata': {'md5': hash_md5} + } + ) def upload_files(self, data_files, prefix): + selected_area = prefix with ThreadPoolExecutor() as executor: futures = { - executor.submit(self.upload_file, data_file, - f"{prefix}{os.path.basename(data_file)}"): data_file + executor.submit(self.upload_file, selected_area, data_file, + os.path.basename(data_file)): data_file for data_file in data_files } # collect each finished job success = True for future in concurrent.futures.as_completed(futures): + data_file = futures[future] # Get the associated data_file try: - data_file = futures[future] future.result() # read the result of the future object except Exception as ex: print(f"Exception raised for {data_file}: ", ex) @@ -75,15 +92,6 @@ def run(self): if not selected_area: return False, 'No area selected' - if self.aws.is_user: - dir_prefix = 'morphic-' + self.aws.center_name + '/' - - if dir_prefix not in selected_area: - selected_area = dir_prefix + selected_area - - if selected_area.rstrip(selected_area[-1]) not in self.aws.user_dir_list: - return False, "Upload area does not exist or you don't have access to this area" - try: ps = [] diff --git a/ait/commons/util/command/view.py b/ait/commons/util/command/view.py new file mode 100644 index 0000000..3ba3533 --- /dev/null +++ b/ait/commons/util/command/view.py @@ -0,0 +1,54 @@ +from ait.commons.util.aws_client import Aws +from ait.commons.util.provider_api_util 
class CmdView:
    """Print a dataset's linked biomaterials, processes and data files.

    The dataset is fetched from the provider API, then every identifier
    listed under its 'biomaterials', 'processes' and 'files' keys is fetched
    and printed.
    """
    base_url = 'https://api.ingest.dev.archive.morphic.bio'

    def __init__(self, args):
        self.args = args
        # Load the stored profile once and take the token from it.
        self.user_profile = get_profile('morphic-util')
        self.access_token = self.user_profile.access_token
        self.provider_api = ProviderApi(self.base_url)

        # Always define the attribute so run() can report a missing dataset
        # instead of failing with AttributeError.
        self.dataset = getattr(self.args, 'dataset', None)
        if self.dataset is None:
            print("Dataset is mandatory for view")

    def _print_linked_entities(self, fetched_dataset, heading, key, resource):
        """Print each identifier under ``key`` followed by the fetched entity."""
        print(heading)

        for entity_id in fetched_dataset.get(key, []):
            print(entity_id)

            fetched_entity = self.provider_api.get(f"{self.base_url}/{resource}/{entity_id}",
                                                   self.access_token)
            print(fetched_entity)

    def run(self):
        """Fetch and print the dataset's linked entities.

        Returns:
            tuple: ``(True, "FETCHED SUCCESSFULLY")`` on success, or
            ``(False, "Dataset is mandatory for view")`` when no dataset was
            supplied.
        """
        if getattr(self, 'dataset', None) is None:
            return False, "Dataset is mandatory for view"

        fetched_dataset = self.provider_api.get(f"{self.base_url}/datasets/{self.dataset}",
                                                self.access_token)
        print(f"Dataset fetched successfully: {self.dataset}")

        self._print_linked_entities(fetched_dataset, "Getting Biomaterials", 'biomaterials', 'biomaterials')
        self._print_linked_entities(fetched_dataset, "Getting Processes", 'processes', 'processes')
        # Each file identifier is printed on its own line; the earlier code
        # printed the whole list once per file.
        self._print_linked_entities(fetched_dataset, "Getting Data Files", 'files', 'files')

        return True, "FETCHED SUCCESSFULLY"
class ProviderApi:
    """Small helper for bearer-token JSON requests made through ``requests``."""

    # Status codes that request() treats as success.
    SUCCESS_STATUS_CODES = (200, 201, 202, 204)

    def __init__(self, base_url):
        self.base_url = base_url

    def request(self, method, url, access_token, params=None, data=None, data_type_in_hal_link=None):
        """
        Sends an HTTP request to the specified URL with the given method.

        Parameters:
            method (str): The HTTP method (GET, POST, PUT, DELETE).
            url (str): The URL to send the request to.
            access_token (str): Access token sent as a Bearer Authorization header.
            params (dict, optional): The URL parameters to be sent with the request.
            data (dict, optional): The data to be sent as the JSON request body.
            data_type_in_hal_link (str, optional): Key under ``_links`` whose ``href``
                is returned for a successful POST.

        Returns:
            - DELETE: the status code on success, ``None`` on an unsuccessful status.
            - POST with ``data_type_in_hal_link``: ``_links[<key>]['href']`` of the response.
            - Any other successful request: the JSON-parsed body, or ``None`` when the
              response has no body (e.g. 204).

        Raises:
            requests.exceptions.HTTPError: For a non-DELETE request whose status is
                not one of 200, 201, 202 or 204.
        """
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {access_token}'
        }

        response = requests.request(method, url, headers=headers, params=params, json=data)
        status_code = response.status_code
        print(f"Received {status_code} while executing {method} on {url}")

        if status_code not in self.SUCCESS_STATUS_CODES:
            if method == 'DELETE':
                # Unsuccessful deletes are reported to the caller as None.
                return None

            # raise_for_status() only raises for 4xx/5xx and returns None
            # otherwise, so an explicit error is still needed for other
            # unexpected statuses (e.g. 203 or 3xx); "raise None" would be a
            # TypeError.
            response.raise_for_status()
            raise requests.exceptions.HTTPError(
                f"Unexpected status {status_code} while executing {method} on {url}",
                response=response
            )

        if method == 'DELETE':
            return status_code

        if method == 'POST' and data_type_in_hal_link:
            return response.json()['_links'][data_type_in_hal_link]['href']

        # A successful response without a body cannot be parsed as JSON.
        if status_code == 204 or not response.content:
            return None

        return response.json()

    def put(self, url, access_token):
        """Send a PUT request without a body."""
        return self.request('PUT', url, access_token)

    def get(self, url, access_token):
        """Send a GET request and return the parsed body."""
        return self.request('GET', url, access_token)

    def delete_with_relations(self, url, access_token, delete_linked_entities=False):
        """Send a DELETE request with the ``deleteLinkedEntities`` query flag ('true'/'false')."""
        params = {'deleteLinkedEntities': str(delete_linked_entities).lower()}
        return self.request('DELETE', url, access_token, params=params)

    def delete(self, url, access_token):
        """Send a DELETE request."""
        return self.request('DELETE', url, access_token)

    def post(self, url, data_type_in_hal_link, data, access_token):
        """Send a POST request and return the HAL link named by ``data_type_in_hal_link``."""
        return self.request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link)
class SubmissionError(Exception):
    """
    Exception raised for errors during submission.
    Includes a list of errors and an optional underlying exception.

    ``errors`` may be a list of messages or a single message string; a single
    string is stored as a one-element list so it is not joined character by
    character.
    """

    def __init__(self, errors, original_exception=None):
        if errors is None:
            errors = []
        elif isinstance(errors, str):
            # Callers in this code base also raise with one plain message.
            errors = [errors]

        self.errors = errors
        self.original_exception = original_exception  # Store the original exception
        super().__init__(self._format_message())

    def _format_message(self):
        """
        Format the error message to include both the list of submission errors and details of the original exception.
        """
        # str() keeps formatting from failing on non-string entries.
        message = "Submission errors occurred:\n" + "\n".join(str(error) for error in self.errors)
        if self.original_exception:
            message += "\n\nOriginal Exception Details:\n"
            message += f"Type: {type(self.original_exception).__name__}\n"
            message += f"Message: {str(self.original_exception)}\n"
            message += "Stack Trace:\n" + "".join(traceback.format_tb(self.original_exception.__traceback__))
        return message
"zygosity": self.zygosity, # matches 'zygosity' in schema + "type": self.cell_type, # matches 'type' in schema + "parental_cell_line_name": self.parental_cell_line_name # matches 'parental_cell_line_name' in schema + } + + # Optional fields - add them only if they are provided + if self.clone_id: + content["clone_id"] = self.clone_id # matches 'clone_id' in schema + + if self.protocol_id: + content[ + "cell_line_generation_protocol"] = self.protocol_id # matches 'cell_line_generation_protocol' in schema + + if self.treatment_condition: + content[ + "treatment_condition"] = self.treatment_condition # matches 'cell_line_generation_protocol' in schema + + if self.wt_control_status: + content[ + "wt_control_status"] = self.wt_control_status # matches 'cell_line_generation_protocol' in schema + + return { + "content": content + } + + +class ExpressionAlterationStrategy: + def __init__(self, + expression_alteration_id, + parent_protocol_id, + allele_specific, + altered_gene_symbol, + target_gene_hgnc_id, + targeted_genomic_region, + expected_alteration_type, + editing_strategy, + altered_locus, + guide_sequence, + method, + id): + self.expression_alteration_id = expression_alteration_id + self.parent_protocol_id = parent_protocol_id + self.allele_specific = allele_specific + self.altered_gene_symbol = altered_gene_symbol + self.target_gene_hgnc_id = target_gene_hgnc_id + self.targeted_genomic_region = targeted_genomic_region + self.expected_alteration_type = expected_alteration_type + self.editing_strategy = editing_strategy + self.altered_locus = altered_locus + self.guide_sequence = guide_sequence + self.method = method + self.id = id + + def __repr__(self): + return json.dumps(self.to_dict(), indent=2) + + def to_dict(self): + return { + "content": { + "expression_alteration_id": self.expression_alteration_id, + "parent_protocol_id": self.parent_protocol_id, + "genes": [ + { + "allele_specific": self.allele_specific, + "altered_gene_symbol": self.altered_gene_symbol, 
+ "target_gene_hgnc_id": self.target_gene_hgnc_id, + "targeted_genomic_region": self.targeted_genomic_region, + "expected_alteration_type": self.expected_alteration_type, + "editing_strategy": self.editing_strategy, + "altered_locus": self.altered_locus, + "guide_sequence": self.guide_sequence + } + ], + "method": self.method, + } + } + + +class DifferentiatedCellLine: + def __init__(self, + biomaterial_id, # Maps to 'label' + description, + cell_line_biomaterial_id, # Maps to 'clonal_cell_line_label' + differentiated_product_protocol_id, + undifferentiated_product_protocol_id, + terminally_differentiated, + model_system, + timepoint_value, + timepoint_unit, + treatment_condition=None, # New field as per schema + wt_control_status=None, # New field as per schema + id=None): # Optional, custom field + self.biomaterial_id = biomaterial_id # This maps to 'label' in the schema + self.description = description + self.cell_line_biomaterial_id = cell_line_biomaterial_id # Maps to 'clonal_cell_line_label' + self.differentiated_product_protocol_id = differentiated_product_protocol_id + self.undifferentiated_product_protocol_id = undifferentiated_product_protocol_id + self.terminally_differentiated = terminally_differentiated + self.model_system = model_system + self.timepoint_value = timepoint_value + self.timepoint_unit = timepoint_unit + self.treatment_condition = treatment_condition # Added to match schema + self.wt_control_status = wt_control_status # Added to match schema + self.library_preparations = [] + self.id = id # Custom field not in the schema + + def add_library_preparation(self, library_preparation): + self.library_preparations.append(library_preparation) + + def __repr__(self): + return json.dumps(self.to_dict(), indent=2) + + def to_dict(self): + content = { + "label": self.biomaterial_id, + "description": self.description, + "clonal_cell_line_id": self.cell_line_biomaterial_id, + "differentiated_product_protocol_id": 
class LibraryPreparation:
    """A library preparation row together with its attached sequencing files."""

    def __init__(self,
                 biomaterial_id,
                 protocol_id,
                 differentiated_biomaterial_id,
                 average_fragment_size,
                 input_amount_value,
                 input_amount_unit,
                 final_yield_value,
                 final_yield_unit,
                 concentration_value,
                 concentration_unit,
                 pcr_cycles,
                 pcr_cycles_for_sample_index,
                 id):
        self.biomaterial_id = biomaterial_id
        self.protocol_id = protocol_id
        self.differentiated_biomaterial_id = differentiated_biomaterial_id
        self.average_fragment_size = average_fragment_size
        self.input_amount_value = input_amount_value
        self.input_amount_unit = input_amount_unit
        self.final_yield_value = final_yield_value
        self.final_yield_unit = final_yield_unit
        self.concentration_value = concentration_value
        self.concentration_unit = concentration_unit
        self.pcr_cycles = pcr_cycles
        self.pcr_cycles_for_sample_index = pcr_cycles_for_sample_index
        self.sequencing_files = []
        self.id = id

    def add_sequencing_file(self, sequencing_file):
        """Attach a sequencing file to this library preparation."""
        self.sequencing_files.append(sequencing_file)

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2)

    @staticmethod
    def _json_safe(value):
        """Return None for NaN/infinite floats, which JSON cannot represent; pass other values through."""
        if isinstance(value, (float, np.floating)) and not np.isfinite(value):
            return None
        return value

    def to_dict(self):
        """Return the payload as ``{"content": {...}}`` with non-finite floats replaced by None.

        Every field is sanitised, including ``pcr_cycles`` and the unit/label
        fields. 'differentiated_biomaterial_id' is included only when it is
        truthy after sanitising.
        """
        content = {
            "label": self.biomaterial_id,
            "library_preparation_protocol_id": self.protocol_id,
            "average_fragment_size": self.average_fragment_size,
            "input_amount_value": self.input_amount_value,
            "input_amount_unit": self.input_amount_unit,
            "total_yield_value": self.final_yield_value,
            "total_yield_unit": self.final_yield_unit,
            "concentration_value": self.concentration_value,
            "concentration_unit": self.concentration_unit,
            "pcr_cycles": self.pcr_cycles,
            "pcr_cycles_for_sample_index": self.pcr_cycles_for_sample_index
        }
        content = {key: self._json_safe(value) for key, value in content.items()}

        # NaN is truthy, so sanitise before deciding whether the optional
        # parent reference is present.
        parent_id = self._json_safe(self.differentiated_biomaterial_id)
        if parent_id:
            content["differentiated_biomaterial_id"] = parent_id

        return {
            "content": content
        }
def find_orphans(source_entities,
                 target_entities,
                 source_attr,
                 target_attr,
                 source_type,
                 target_type,
                 errors):
    """
    Records an error for every source entity that no target entity refers to.

    Parameters:
        source_entities (list): The list of source entities.
        target_entities (list): The list of target entities.
        source_attr (str): The attribute name in the source entity to compare.
        target_attr (str): The attribute name in the target entity to compare.
        source_type (str): The type name of the source entity (used in the error text).
        target_type (str): The type name of the target entity (not used in the error text).
        errors (list): Receives one "Orphaned entity ..." message per unmatched source entity.

    Returns:
        None. Problems are appended to ``errors``; nothing is raised for an orphan.
    """
    for candidate in source_entities:
        is_referenced = any(
            getattr(other, target_attr) == getattr(candidate, source_attr)
            for other in target_entities
        )
        if is_referenced:
            continue

        errors.append(f"Orphaned entity {source_type} and ID is {getattr(candidate, source_attr)}")
def merge_library_preparation_sequencing_file(library_preparations,
                                              sequencing_files,
                                              errors):
    """
    Attaches each sequencing file to the library preparation it references.

    Problems are reported through ``errors`` rather than raised: library
    preparations without any sequencing file are reported by ``find_orphans``,
    and sequencing files whose ``library_preparation_id`` matches no library
    preparation are reported through ``MissingParentEntityError.add_error``.

    Parameters:
    -----------
    library_preparations : list
        A list of LibraryPreparation objects to be merged.
    sequencing_files : list
        A list of SequencingFile objects to be merged.
    errors : list
        Accumulator for validation messages.

    Returns:
    --------
    None
    """
    find_orphans(
        source_entities=library_preparations,
        target_entities=sequencing_files,
        source_attr="biomaterial_id",  # Assuming this is the correct attribute
        target_attr="library_preparation_id",
        source_type="Library Preparation",
        target_type="Sequencing File",
        errors=errors
    )

    missing_parent_entity_error = MissingParentEntityError()
    library_ids = {lib_prep.biomaterial_id for lib_prep in library_preparations}

    # Single pass: report files without a parent and group the rest by parent ID
    # so the merge below does not rescan every file for every library preparation.
    files_by_library_id = {}

    for sequencing_file in sequencing_files:
        parent_id = sequencing_file.library_preparation_id

        if parent_id not in library_ids:
            missing_parent_entity_error.add_error("Library Preparation",
                                                  "Sequencing File",
                                                  sequencing_file.file_name,
                                                  errors)
        else:
            files_by_library_id.setdefault(parent_id, []).append(sequencing_file)

    for library_preparation in library_preparations:
        for sequencing_file in files_by_library_id.get(library_preparation.biomaterial_id, ()):
            library_preparation.add_sequencing_file(sequencing_file)
def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines,
                                                           library_preparations,
                                                           errors):
    """
    Attaches each library preparation to the differentiated cell line it references.

    Problems are reported through ``errors`` rather than raised: differentiated
    cell lines without any library preparation are reported by ``find_orphans``,
    and library preparations whose ``differentiated_biomaterial_id`` matches no
    differentiated cell line are reported through
    ``MissingParentEntityError.add_error``.

    Parameters:
    -----------
    differentiated_cell_lines : list
        A list of DifferentiatedCellLine objects to be merged.
    library_preparations : list
        A list of LibraryPreparation objects to be merged.
    errors : list
        Accumulator for validation messages.

    Returns:
    --------
    None
    """
    find_orphans(
        source_entities=differentiated_cell_lines,
        target_entities=library_preparations,
        source_attr="biomaterial_id",
        target_attr="differentiated_biomaterial_id",
        source_type="Differentiated Cell line",
        target_type="Library Preparation",
        errors=errors
    )

    missing_parent_entity_error = MissingParentEntityError()
    differentiated_ids = {diff_cell.biomaterial_id for diff_cell in differentiated_cell_lines}

    # Single pass: report preparations without a parent and group the rest by
    # parent ID so the merge below is linear instead of a nested scan.
    preparations_by_parent_id = {}

    for library_preparation in library_preparations:
        parent_id = library_preparation.differentiated_biomaterial_id

        if parent_id not in differentiated_ids:
            missing_parent_entity_error.add_error("Differentiated Cell Line",
                                                  "Library Preparation",
                                                  library_preparation.biomaterial_id,
                                                  errors)
        else:
            preparations_by_parent_id.setdefault(parent_id, []).append(library_preparation)

    for differentiated_cell_line in differentiated_cell_lines:
        for library_preparation in preparations_by_parent_id.get(differentiated_cell_line.biomaterial_id, ()):
            differentiated_cell_line.add_library_preparation(library_preparation)
def merge_cell_line_and_differentiated_cell_line(cell_lines,
                                                 differentiated_cell_lines,
                                                 errors):
    """
    Attaches each differentiated cell line to the cell line it references.

    Problems are reported through ``errors`` rather than raised: cell lines
    without any differentiated cell line are reported by ``find_orphans``, and
    differentiated cell lines whose ``cell_line_biomaterial_id`` matches no cell
    line are reported through ``MissingParentEntityError.add_error``. Any
    unexpected exception during the merge is printed and also appended to
    ``errors`` so the caller can see that the merge did not complete.

    Parameters:
    -----------
    cell_lines : list
        A list of CellLine objects to be merged.
    differentiated_cell_lines : list
        A list of DifferentiatedCellLine objects to be merged.
    errors : list
        Accumulator for validation messages.

    Returns:
    --------
    None
    """
    try:
        find_orphans(
            source_entities=cell_lines,
            target_entities=differentiated_cell_lines,
            source_attr="biomaterial_id",
            target_attr="cell_line_biomaterial_id",
            source_type="Cell line",
            target_type="Differentiated Cell line",
            errors=errors
        )

        missing_parent_entity_error = MissingParentEntityError()
        cell_line_ids = {cell_line.biomaterial_id for cell_line in cell_lines}

        # Single pass: report children without a parent and group the rest by
        # parent ID so the merge below is linear instead of a nested scan.
        children_by_parent_id = {}

        for differentiated_cell_line in differentiated_cell_lines:
            parent_id = differentiated_cell_line.cell_line_biomaterial_id

            if parent_id not in cell_line_ids:
                missing_parent_entity_error.add_error("Cell Line",
                                                      "Differentiated Cell line",
                                                      differentiated_cell_line.biomaterial_id,
                                                      errors)
            else:
                children_by_parent_id.setdefault(parent_id, []).append(differentiated_cell_line)

        for cell_line in cell_lines:
            for differentiated_cell_line in children_by_parent_id.get(cell_line.biomaterial_id, ()):
                cell_line.add_differentiated_cell_line(differentiated_cell_line)
    except Exception as e:
        # Best-effort merge: keep the console message, but also surface the failure
        # to the caller instead of letting it disappear.
        print("Exception occurred here:", e)
        errors.append(f"Failed to merge cell lines with differentiated cell lines: {e}")
+ + get_cell_lines(sheet_name, column_mapping) + Retrieves parsed cell lines data from a specified sheet in the Excel file. + + get_differentiated_cell_lines(sheet_name, column_mapping) + Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. + + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines) + Merges cell lines and differentiated cell lines based on their biomaterial IDs. + + merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations) + Merges differentiated cell lines and library preparations based on their biomaterial IDs. + + merge_library_preparation_sequencing_file(library_preparations, sequencing_files) + Merges library preparations and sequencing files based on their IDs. + + get_library_preparations(sheet_name, column_mapping) + Retrieves parsed library preparations data from a specified sheet in the Excel file. + + get_sequencing_files(sheet_name, column_mapping) + Retrieves parsed sequencing files data from a specified sheet in the Excel file. + """ + + def __init__(self, file_path): + """ + Initializes a SpreadsheetSubmitter instance with the given file path. + + Parameters: + ----------- + file_path : str + The file path to the Excel spreadsheet. + """ + self.file_path = file_path + + def list_sheets(self): + """ + Retrieves the names of all sheets present in the Excel file, + trimming any leading or trailing spaces. + + Returns: + -------- + list + A list of trimmed sheet names present in the Excel file. 
+ """ + xls = pd.ExcelFile(self.file_path, engine='openpyxl') + return [sheet_name.strip() for sheet_name in xls.sheet_names] + + def input_file_to_data_frames(self, sheet_name, action): + if action.upper() == 'MODIFY': + skip_rows = 0 + else: + skip_rows = 3 + + # Load the Excel file to retrieve all sheet names + with pd.ExcelFile(self.file_path, engine='openpyxl') as xls: + # Trim spaces from sheet names + sheet_names = {sheet.strip(): sheet for sheet in xls.sheet_names} + + # Attempt to find the trimmed sheet name in the list + trimmed_sheet_name = sheet_name.strip() + + if trimmed_sheet_name in sheet_names: + # Read the sheet using the original sheet name (with spaces if they existed) + df = pd.read_excel(self.file_path, sheet_name=sheet_names[trimmed_sheet_name], engine='openpyxl', + skiprows=skip_rows) + else: + raise ValidationError(f"Sheet '{sheet_name}' not found in the spreadsheet.") + + return df + + def parse_cell_lines(self, + sheet_name, + action, + errors): + """ + Parses data related to cell lines from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing cell line data. + + Returns: + -------- + tuple + A tuple containing: + - list of CellLine objects parsed from the specified sheet. + - pd.DataFrame with the parsed data. + """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + parent_cell_line_names = [] + + # Check if the required column exists + if 'clonal_cell_line.label' not in df.columns: + errors.append( + f"The column 'clonal_cell_line.label' does not exist in the {sheet_name} sheet. 
" + f"The rest of the file will not be processed") + return [], df + + # Filter rows where biomaterial_id is not null + df = df[df['clonal_cell_line.label'].notna()] + # Replace invalid float values with None + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for invalid starting values + cols_to_check = ['clonal_cell_line.label'] + invalid_start_values = ( + 'FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'cell_line.biomaterial_core.biomaterial_id' + ) + # Filter out rows with invalid starting values + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith(invalid_start_values)).all(axis=1) + df_filtered = df[mask] + # Check for a unique value in 'cell_line.derived_cell_line_accession' + derived_col = 'clonal_cell_line.parental_cell_line_name' + + if derived_col in df_filtered.columns: + parent_cell_line_names = df_filtered[derived_col].dropna().unique() + + """ + if len(parent_cell_line_names) != 1: + errors.append( + f"The column '{derived_col}' must have the same value across all rows. 
Found values: {parent_cell_line_names}") + + return [], df + """ + + # Process rows to create CellLine objects + cell_lines = [] + + for _, row in df_filtered.iterrows(): + label = row['clonal_cell_line.label'] + parental_cell_line_name = row.get('clonal_cell_line.parental_cell_line_name') + cell_type = row.get('clonal_cell_line.type') + expression_alteration_id = row.get('expression_alteration.label') + + # Error handling for missing mandatory fields + if pd.isnull(label): + errors.append("Biomaterial ID cannot be null in any row of the Cell line/ Clonal cell line sheet.") + + if any(pd.isnull(field) for field in [parental_cell_line_name, cell_type]): + errors.append( + f"Mandatory fields (parental_cell_line_name, clonal_cell_line.type, expression_alteration.label) are required for Cell " + f"line/ Clonal cell line entity: {label}") + + cell_lines.append( + CellLine( + biomaterial_id=label, + description=row.get('clonal_cell_line.description'), + parental_cell_line_name=parental_cell_line_name, + clone_id=row.get('clonal_cell_line.clone_id'), + protocol_id=row.get('clonal_cell_line.cell_line_generation_protocol'), + zygosity=row.get('clonal_cell_line.zygosity'), + cell_type=cell_type, + expression_alteration_id=expression_alteration_id, + wt_control_status=row.get('clonal_cell_line.wt_control_status'), + treatment_condition=row.get('clonal_cell_line.treatment_condition'), + id=row.get('Id') + ) + ) + + return cell_lines, df_filtered, parent_cell_line_names + + def parse_differentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Parses data related to differentiated cell lines from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. 
+ """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + # df = df.rename(columns=column_mapping) + # Remove unnamed columns (columns without headers) + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + + # Check if the required column exists + if 'differentiated_product.label' not in df.columns: + errors.append(f"The column 'differentiated_product.label' does not " + f"exist in {sheet_name} name. The rest of the file will not be processed") + return [], df + + # Filter rows where biomaterial_id is not null + df = df[df['differentiated_product.label'].notna()] + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['differentiated_product.label'] + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'differentiated_cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) + # Apply the mask to filter out rows + df_filtered = df[mask] + # Check for mandatory fields and create Differentiated CellLine objects + differentiated_cell_lines = [] + + for _, row in df_filtered.iterrows(): + label = row['differentiated_product.label'] + parent_biomaterial_id = row.get('clonal_cell_line.label') + + # Check if biomaterial_id is null + if pd.isnull(label): + errors.append("Differentiated Cell line ID cannot be null in any row of the Differentiated Cell line " + "sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") + + # Check if derived_accession and cell_type are present + if pd.isnull(parent_biomaterial_id): + errors.append(f"Input Cell line ID cannot be null for Differentiated Cell line: " + f"{label}") + """ + raise 
MissingMandatoryFieldError( + "Input Cell line ID cannot be null. " + differentiated_biomaterial_id) + """ + + # Create DifferentiatedCellLine objects from filtered DataFrame rows + differentiated_cell_lines.append( + DifferentiatedCellLine( + biomaterial_id=label, + description=row.get('differentiated_product.description'), + cell_line_biomaterial_id=parent_biomaterial_id, + differentiated_product_protocol_id=row.get( + 'differentiated_product.differentiated_product_protocol_id'), + undifferentiated_product_protocol_id=None, + treatment_condition=row.get('differentiated_product.treatment_condition'), + wt_control_status=row.get('differentiated_product.wt_control_status'), + timepoint_value=row.get('differentiated_product.timepoint_value'), + timepoint_unit=row.get('differentiated_product.timepoint_unit'), + terminally_differentiated=row.get('differentiated_product.terminally_differentiated'), + model_system=row.get('differentiated_product.model_system'), + id=row.get('Id') + ) + ) + + return differentiated_cell_lines, df_filtered + + # TODO: review + def parse_undifferentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Parses data related to differentiated cell lines from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. 
+ """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + # df = df.rename(columns=column_mapping) + # Remove unnamed columns (columns without headers) + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + + # Check if the required column exists + if 'undifferentiated_product.label' not in df.columns: + errors.append(f"The column 'undifferentiated_product.label' does not " + f"exist in {sheet_name} name. The rest of the file will not be processed") + return [], df + + # Filter rows where biomaterial_id is not null + df = df[df['undifferentiated_product.label'].notna()] + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['undifferentiated_product.label'] + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'differentiated_cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) + # Apply the mask to filter out rows + df_filtered = df[mask] + # Check for mandatory fields and create Differentiated CellLine objects + undifferentiated_cell_lines = [] + + for _, row in df_filtered.iterrows(): + label = row['undifferentiated_product.label'] + parent_biomaterial_id = row.get('clonal_cell_line.label') + + # Check if biomaterial_id is null + if pd.isnull(label): + errors.append( + "Undifferentiated Cell line ID cannot be null in any row of the Undifferentiated Cell line " + "sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") + + # Check if derived_accession and cell_type are present + if pd.isnull(parent_biomaterial_id): + errors.append(f"Input Cell line ID cannot be null for Undifferentiated Cell line: " + f"{label}") + """ + 
raise MissingMandatoryFieldError( + "Input Cell line ID cannot be null. " + differentiated_biomaterial_id) + """ + + # Create DifferentiatedCellLine objects from filtered DataFrame rows + undifferentiated_cell_lines.append( + DifferentiatedCellLine( + biomaterial_id=label, + description=row.get('undifferentiated_product.description'), + cell_line_biomaterial_id=parent_biomaterial_id, + differentiated_product_protocol_id=None, + undifferentiated_product_protocol_id=row.get( + 'undifferentiated_product.undifferentiated_product_protocol_id'), + treatment_condition=row.get('undifferentiated_product.treatment_condition'), + wt_control_status=row.get('undifferentiated_product.wt_control_status'), + timepoint_value=row.get('undifferentiated_product.timepoint_value'), + timepoint_unit=row.get('undifferentiated_product.timepoint_unit'), + terminally_differentiated=row.get('undifferentiated_product.terminally_differentiated'), + model_system=row.get('undifferentiated_product.model_system'), + id=row.get('Id') + ) + ) + + return undifferentiated_cell_lines, df_filtered + + def parse_library_preparations(self, + sheet_name, + differentiated, + action, + errors): + """ + Parses data related to library preparations from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing library preparation data. + + Returns: + -------- + list + A list of LibraryPreparation objects parsed from the specified sheet. 
+ """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + # df = df.rename(columns=column_mapping) + # Remove unnamed columns (columns without headers) + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + # Check if the required column exists + required_columns = [ + 'library_preparation.label', + 'differentiated_product.label', + 'undifferentiated_product.label', + 'library_preparation.library_preparation_protocol_id' + ] + + for col in required_columns: + if col not in df.columns: + if col == 'differentiated_product.label' and differentiated: + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") + + return [], df + elif col == 'undifferentiated_product.label' and not differentiated: + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") + + return [], df + else: + if col not in ('differentiated_product.label', 'undifferentiated_product.label'): + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. 
" + f"The rest of the file will not be processed") + + return [], df + + # Filter rows where biomaterial_id is not null + df = df[df['library_preparation.label'].notna()] + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['library_preparation.label'] + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'library_preparation.biomaterial_core.biomaterial_id'))).all(axis=1) + # Apply the mask to filter out rows + df_filtered = df[mask] + # Check for mandatory fields and create Library Preparation objects + library_preparations = [] + + for _, row in df_filtered.iterrows(): + label = row['library_preparation.label'] + if differentiated: + differentiated_biomaterial_label = row.get('differentiated_product.label') + else: + differentiated_biomaterial_label = row.get('undifferentiated_product.label') + library_preparation_protocol_id = row.get('library_preparation.library_preparation_protocol_id') + + # Check if required fields are null + if pd.isnull(label): + errors.append("Library Preparation ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Library Preparation ID cannot be null in any row.") + if pd.isnull(differentiated_biomaterial_label): + if differentiated: + errors.append( + "Differentiated Cell Line ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null in any row.") + else: + errors.append( + "Undifferentiated Cell Line ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell Line ID cannot be null in any row.") + if 
pd.isnull(library_preparation_protocol_id): + errors.append( + "Library Preparation Protocol ID cannot be null in any row of the Library Preparation sheet.") + # raise MissingMandatoryFieldError("Library Preparation Protocol ID cannot be null in any row.") + + # Create LibraryPreparation objects from filtered DataFrame rows + library_preparations.append( + LibraryPreparation( + biomaterial_id=label, + protocol_id=library_preparation_protocol_id, + differentiated_biomaterial_id=differentiated_biomaterial_label, + average_fragment_size=row.get('library_preparation.average_fragment_size'), + input_amount_value=row.get('library_preparation.input_amount_value'), + input_amount_unit=row.get('library_preparation.input_amount_unit'), + final_yield_value=row.get('library_preparation.final_yield_value'), + final_yield_unit=row.get('library_preparation.final_yield_unit'), + concentration_value=row.get('library_preparation.concentration_value'), + concentration_unit=row.get('library_preparation.concentration_unit'), + pcr_cycles=row.get('library_preparation.pcr_cycles'), + pcr_cycles_for_sample_index=row.get('library_preparation.pcr_cycles_for_sample_index'), + id=row.get('Id') + ) + ) + + return library_preparations, df_filtered + + def parse_sequencing_files(self, + sheet_name, + action, + errors): + """ + Parses data related to sequencing files from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing sequencing file data. + + Returns: + -------- + list + A list of SequencingFile objects parsed from the specified sheet. 
+ """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + # df = df.rename(columns=column_mapping) + + # Remove unnamed columns (columns without headers) + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + + # Check if the required column exists + required_columns = [ + 'sequence_file.label', + 'library_preparation.label', + 'sequence_file.extension', + 'sequence_file.read_index' + ] + + for col in required_columns: + if col not in df.columns: + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") + + return [], df + + # Filter rows where file_name is not null + df = df[df['sequence_file.label'].notna()] + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['sequence_file.label'] + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'The name of the file.', + 'Include the file extension in the file name. 
For example: R1.fastq.gz; codebook.json', + 'sequence_file.label'))).all(axis=1) + # Apply the mask to filter out rows + df_filtered = df[mask] + + # Check for mandatory fields and create Sequencing file objects + sequencing_files = [] + + for _, row in df_filtered.iterrows(): + file_name = row['sequence_file.label'] + library_preparation_id = row.get('library_preparation.label') + read_index = row.get('sequence_file.read_index') + + # Check if required fields are null + if pd.isnull(file_name): + errors.append("Sequence file name cannot be null in any row of the Sequencing File sheet.") + # raise MissingMandatoryFieldError("Sequence file name cannot be null in any row.") + if pd.isnull(library_preparation_id): + errors.append("Library Preparation ID cannot be null in any row of the Sequencing File sheet..") + # raise MissingMandatoryFieldError("Library Preparation ID cannot be null in any row.") + if pd.isnull(read_index): + errors.append("Read Index cannot be null in any row of the Sequencing File sheet..") + # raise MissingMandatoryFieldError("Read Index cannot be null in any row.") + + # Create SequencingFile objects from filtered DataFrame rows + sequencing_files.append( + SequencingFile( + file_name=file_name, + extension=None, + read_index=read_index, + lane_index=None, + read_length=None, + checksum=None, + library_preparation_id=library_preparation_id, + run_id=row.get('sequence_file.run_id'), + id=row.get('Id') + ) + ) + + return sequencing_files, df_filtered + + def parse_expression_alteration(self, + sheet_name, + action, + errors): + """ + Parses data related to expression alterations from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing expression alterations data. + action : str + The action to be performed on the data. + errors : list + A list to accumulate error messages. 
+ + Returns: + -------- + tuple + A tuple containing: + - A list of ExpressionAlterationStrategy objects parsed from the specified sheet (if valid) + - The filtered DataFrame of the parsed data + - A boolean indicating whether the expression alteration strategy sheet exists and is valid + """ + # Attempt to parse the input file into a DataFrame + try: + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + except Exception as e: + errors.append(f"Missing sheet '{sheet_name}': {e}") + return [], None, False + + # Strip whitespace from column names + df.columns = df.columns.str.strip() + + # Check if the required column exists + required_columns = ['expression_alteration.label'] + missing_columns = [col for col in required_columns if col not in df.columns] + + if missing_columns: + errors.append( + f"The following required columns are missing in the Expression Alteration Strategy sheet: {', '.join(missing_columns)}") + return [], df, False # Return if required columns are missing + + # Filter rows where 'expression_alteration.label' is not null + df = df[df['expression_alteration.label'].notna()] + # Replace invalid float values (e.g., NaN, infinite) with None + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + + # Define unwanted patterns to filter out unwanted rows + unwanted_patterns = ( + 'FILL OUT INFORMATION BELOW THIS ROW', + 'A unique ID for the gene expression alteration instance..', + 'ID should have no spaces. 
For example: JAXPE0001_MEIS1, MSKKI119_MEF2C, NWU_AID' + ) + + # Create a mask to filter out rows with unwanted starting values + mask = df['expression_alteration.label'].astype(str).str.startswith(unwanted_patterns) + df_filtered = df[~mask] + + # Initialize the list of ExpressionAlterationStrategy objects + expression_alterations = [] + + for _, row in df_filtered.iterrows(): + expression_alterations.append( + ExpressionAlterationStrategy( + expression_alteration_id=row.get('expression_alteration.label'), + parent_protocol_id=row.get('expression_alteration.parent_protocol_id'), + allele_specific=row.get('expression_alteration.genes.allele_specific'), + altered_gene_symbol=row.get('expression_alteration.genes.altered_gene_symbol'), + target_gene_hgnc_id=row.get('expression_alteration.genes.target_gene_hgnc_id'), + targeted_genomic_region=row.get('expression_alteration.genes.targeted_genomic_region'), + expected_alteration_type=row.get('expression_alteration.genes.expected_alteration_type'), + editing_strategy=row.get('expression_alteration.genes.editing_strategy'), + altered_locus=row.get('expression_alteration.genes.altered_locus'), # No longer a placeholder + guide_sequence=row.get('expression_alteration.genes.guide_sequence'), # No longer a placeholder + method=row.get('expression_alteration.method'), + id=row.get('Id') + ) + ) + + # Return the list of objects, the filtered DataFrame, and a flag indicating success + return expression_alterations, df_filtered + + def get_cell_lines(self, + sheet_name, + action, + errors): + """ + Retrieves parsed cell lines data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of CellLine objects parsed from the specified sheet. 
+ """ + cell_lines, cell_lines_df, parent_cell_line_names = self.parse_cell_lines(sheet_name, action, errors) + return cell_lines, cell_lines_df, parent_cell_line_names + + def get_differentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. + """ + differentiated_cell_lines, differentiated_cell_lines_df = self.parse_differentiated_cell_lines(sheet_name, + action, errors) + return differentiated_cell_lines, differentiated_cell_lines_df + + def get_undifferentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. + """ + undifferentiated_cell_lines, undifferentiated_cell_lines_df = self.parse_undifferentiated_cell_lines(sheet_name, + action, + errors) + return undifferentiated_cell_lines, undifferentiated_cell_lines_df + + def get_library_preparations(self, + sheet_name, + differentiated, + action, + errors): + """ + Retrieves parsed library preparations data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing library preparation data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. 
+ + Returns: + -------- + list + A list of LibraryPreparation objects parsed from the specified sheet. + """ + library_preparations, df_filtered = self.parse_library_preparations(sheet_name, differentiated, + action, errors) + return library_preparations, df_filtered + + def get_sequencing_files(self, + sheet_name, + action, + errors): + """ + Retrieves parsed sequencing files data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing sequencing file data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of SequencingFile objects parsed from the specified sheet. + """ + sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action, errors) + return sequencing_files, df_filtered + + def get_expression_alterations(self, + sheet_name, + action, + errors): + expression_alterations, df_filtered = self.parse_expression_alteration(sheet_name, action, errors) + return expression_alterations, df_filtered diff --git a/ait/commons/util/tests/command/test_create.py b/ait/commons/util/tests/command/test_create.py index b389528..3a47a2a 100644 --- a/ait/commons/util/tests/command/test_create.py +++ b/ait/commons/util/tests/command/test_create.py @@ -3,7 +3,7 @@ from ait.commons.util.__main__ import parse_args from ait.commons.util.cmd import Cmd -from ait.commons.util.command.create import CmdCreate +from ait.commons.util.command.create import run class TestCreate(unittest.TestCase): @@ -32,7 +32,7 @@ def test_create_upload_area_no_config_display_error(self): args = ['create', 'testUploadArea'] # when - success, msg = CmdCreate(None, parse_args(args)).run() + success, msg = run() # then self.assertFalse(success) @@ -56,7 +56,7 @@ def test_user_create_upload_area_has_valid_config(self): args = ['create', 'testUploadArea'] # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + 
success, msg = run() # then self.assertFalse(success) @@ -70,7 +70,7 @@ def test_admin_create_upload_area(self, uuid): args = ['create', 'testUploadArea'] # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + success, msg = run() # then self.assertTrue(success) @@ -88,7 +88,7 @@ def test_admin_create_upload_area_with_permissions(self, uuid): args = ['create', upload_area_name, '-p', permission] # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + success, msg = run() # then self.assertTrue(success) @@ -103,7 +103,7 @@ def test_admin_create_upload_area_no_name(self): # when with self.assertRaises(SystemExit) as error: parsed_args = parse_args(args) - success, msg = CmdCreate(self.aws_mock, parsed_args).run() + success, msg = run() self.assertFalse(parsed_args) self.assertFalse(success) self.assertFalse(msg) @@ -133,7 +133,7 @@ def test_admin_create_upload_area_has_exception(self, uuid): self.client.put_object.side_effect = Mock(side_effect=Exception('Test')) # when - success, msg = CmdCreate(self.aws_mock, parse_args(args)).run() + success, msg = run() # then self.assertFalse(success) diff --git a/ait/commons/util/user_profile.py b/ait/commons/util/user_profile.py index acc1c11..3f44eb6 100755 --- a/ait/commons/util/user_profile.py +++ b/ait/commons/util/user_profile.py @@ -9,8 +9,10 @@ def __init__(self): self.access_key = None self.secret_key = None self.session_token = None + self.access_token = None self.username = None self.password = None + self.idToken = None self.region = None def __repr__(self): @@ -42,6 +44,7 @@ def get_profile(profile): user_profile.access_key = credentials[profile].get('aws_access_key_id') user_profile.secret_key = credentials[profile].get('aws_secret_access_key') user_profile.session_token = credentials[profile].get('aws_session_token') + user_profile.access_token = credentials[profile].get('aws_cognito_access_token') user_profile.username = credentials[profile].get('aws_cognito_username') 
user_profile.password = credentials[profile].get('aws_cognito_password') @@ -57,7 +60,7 @@ def get_profile(profile): return user_profile -def set_profile(profile, region, access_key, secret_key, session_token, username, password): +def set_profile(profile, region, access_key, secret_key, session_token, access_token, username, password): """.aws/config [profile {profile}] region = {region} @@ -91,6 +94,7 @@ def set_profile(profile, region, access_key, secret_key, session_token, username credentials.add_section(f'{profile}') credentials.set(f'{profile}', 'aws_cognito_username', username) credentials.set(f'{profile}', 'aws_cognito_password', password) + credentials.set(f'{profile}', 'aws_cognito_access_token', access_token) with open(AWS_CREDENTIALS_FILE, 'w') as out: credentials.write(out) diff --git a/requirements.txt b/requirements.txt index bddbdc3..82b6911 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,9 @@ -boto3>=1.26.153 -botocore>=1.29.153 +boto3>=1.23.10 +botocore>=1.26.10 filetype==1.0.7 requests>=2.20.0, <3 urllib3<1.27, >=1.25.4 -tqdm \ No newline at end of file +tqdm~=4.57.0 +pandas~=2.2.2 +setuptools~=59.6.0 +openpyxl==3.1.3 \ No newline at end of file diff --git a/setup.py b/setup.py index 49ee058..2b9c530 100644 --- a/setup.py +++ b/setup.py @@ -3,15 +3,31 @@ from setuptools import setup from ait.commons.util.settings import NAME, VERSION, DESC, AUTHOR, AUTHOR_EMAIL -# directory containing this file +# Directory containing this file HERE = pathlib.Path(__file__).parent -# text of the README file +# Text of the README file README = (HERE / 'README.md').read_text() -# install requirements +# Install requirements from requirements.txt INSTALL_REQS = [line.rstrip() for line in open(os.path.join(os.path.dirname(__file__), 'requirements.txt'))] +# Additional install requirements +ADDITIONAL_REQS = [ + 'boto3>=1.23.10', + 'botocore>=1.26.10', + 'filetype==1.0.7', + 'requests>=2.20.0, <3', + 'urllib3<1.27, >=1.25.4', + 'tqdm~=4.57.0', + 
'pandas~=2.2.2', + 'setuptools~=59.6.0', + 'openpyxl==3.1.3' +] + +# Combine the install requirements +ALL_REQS = INSTALL_REQS + ADDITIONAL_REQS + # This call to setup() does all the work setup( # dashes are ok in repo and PyPI dist names but not in package (i.e. directory) and @@ -25,18 +41,18 @@ author=AUTHOR, author_email=AUTHOR_EMAIL, license='Apache License', - python_requires='>=3.6', + python_requires='>=3.10', classifiers=[ 'License :: OSI Approved :: Apache Software License', 'Operating System :: MacOS :: MacOS X', 'Operating System :: POSIX', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], platforms=['MacOS X', 'Posix'], - packages=['ait.commons.util','ait.commons.util.settings', 'ait.commons.util.command'], + packages=['ait.commons.util', 'ait.commons.util.settings', 'ait.commons.util.command'], include_package_data=True, - install_requires=INSTALL_REQS, + install_requires=ALL_REQS, entry_points={ 'console_scripts': [ f'{NAME}=ait.commons.util.__main__:main',