From 8001ad1117020b476d9c3333ca7ecb9db6c231a0 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Sat, 31 Aug 2024 12:32:28 +0100 Subject: [PATCH 1/8] code clean-up --- ait/commons/util/command/submit.py | 209 +++++++++++++++--------- ait/commons/util/command/submit_file.py | 176 +++++++++++++++----- ait/commons/util/spreadsheet_util.py | 36 +++- 3 files changed, 298 insertions(+), 123 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 34d1794..397290e 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -11,7 +11,7 @@ from ait.commons.util.provider_api_util import APIProvider -def equality(cell_line, expression_alteration): +def matching_expression_alteration_and_cell_line(cell_line, expression_alteration): return expression_alteration.expression_alteration_id.replace(" ", "").strip() == cell_line.expression_alteration_id.replace( " ", "").strip() @@ -273,8 +273,9 @@ def create_cell_line_entity(self, cell_line, expression_alterations, submission_ access_token ) - self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, - expression_alterations) + if expression_alterations is not None: + self.link_cell_line_with_expression_alterations(access_token, cell_line, cell_line_entity_id, + expression_alterations) print(f"Linking Cell Line Biomaterial: {cell_line.biomaterial_id} to dataset {dataset_id}") @@ -286,7 +287,7 @@ def link_cell_line_with_expression_alterations(self, access_token, cell_line, ce expression_alterations): for expression_alteration in expression_alterations: if cell_line.expression_alteration_id is not None: - if equality(cell_line, expression_alteration): + if matching_expression_alteration_and_cell_line(cell_line, expression_alteration): print(f"Linking cell line {cell_line_entity_id} " f"as derived by process of {expression_alteration.expression_alteration_id}") @@ -337,57 +338,121 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce return differentiated_cell_line_id def create_differentiated_cell_line_entity(self, access_token, cell_line_entity_id, dataset_id, - differentiated_cell_line, - submission_envelope_id): - print("Cell line has differentiated cell lines, creating differentiation process to link them") - - differentiation_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('differentiation'), - submission_envelope_id) - print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " - f"as a child of Cell line: {cell_line_entity_id}") - - differentiated_entity_id = self.create_child_biomaterial( - cell_line_entity_id, - differentiated_cell_line.to_dict(), - access_token - ) + differentiated_cell_line, submission_envelope_id): + """ + Creates a Differentiated Cell Line entity and links it to the submission envelope. - print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") - print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"to envelope: {submission_envelope_id}") + Parameters: + ----------- + access_token : str + The authentication token. + cell_line_entity_id : str + The ID of the original cell line entity. + dataset_id : str + The dataset ID to link with. + differentiated_cell_line : object + The differentiated cell line object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. - self.link_entity_to_envelope( - 'biomaterial', - differentiated_entity_id, - submission_envelope_id, - access_token - ) + Returns: + -------- + str + The ID of the created differentiated cell line entity. + """ + + # Create the differentiated cell line biomaterial + if cell_line_entity_id is not None: + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id} " + f"as a child of Cell line: {cell_line_entity_id}") + differentiated_entity_id = self.create_child_biomaterial( + cell_line_entity_id, + differentiated_cell_line.to_dict(), + access_token + ) + + print(f"Created Differentiated Cell Line Biomaterial: {differentiated_entity_id}") + print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " + f"to envelope: {submission_envelope_id}") + + # Link the differentiated cell line entity to the submission envelope + self.link_entity_to_envelope( + 'biomaterial', + differentiated_entity_id, + submission_envelope_id, + access_token + ) + else: + print(f"Creating Differentiated Cell Line Biomaterial: {differentiated_cell_line.biomaterial_id}") + differentiated_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + differentiated_cell_line.to_dict(), + submission_envelope_id, + access_token + ) print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " f"to dataset: {dataset_id}") + # Link the differentiated cell line to the dataset self.link_to_dataset('biomaterial', dataset_id, differentiated_entity_id, access_token) - print(f"Linking Cell Line Biomaterial: {cell_line_entity_id} as " - f"input to process : {differentiation_process_entity_id}") + return differentiated_entity_id - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_entity_id, differentiated_entity_id, + dataset_id, submission_envelope_id, action): + """ + Creates and links the differentiation process between the original cell line and the differentiated cell line. - print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") + Parameters: + ----------- + access_token : str + The authentication token. + cell_line_entity_id : str + The ID of the original cell line entity. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", - differentiation_process_entity_id, 'processes', access_token - ) + Returns: + -------- + str + The ID of the differentiation process entity created. + """ + if action.lower() != 'modify': + print("Cell line has differentiated cell lines, creating differentiation process to link them") + + # Create a differentiation process entity + differentiation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('differentiation'), + submission_envelope_id + ) - return differentiated_entity_id + print( + f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") + + # Link the cell line entity as input to the differentiation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") + + # Link the differentiated cell line entity as derived by the differentiation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + return differentiation_process_entity_id def handle_library_preparation(self, differentiated_entity_id, library_preparation, library_preparations_df, submission_envelope_id, @@ -577,6 +642,7 @@ def multi_type_submission(self, cell_lines, expression_alterations, cell_lines_df, + differentiated_cell_lines, differentiated_cell_lines_df, library_preparations_df, sequencing_file_df, @@ -605,45 +671,38 @@ def multi_type_submission(self, try: for cell_line in cell_lines: - cell_line_entity_id = self.handle_cell_line(cell_line, - expression_alterations, - cell_lines_df, + if cell_line.id is not None: + cell_line_entity_id = cell_line.id + + for differentiated_cell_line in cell_line.differentiated_cell_lines: + differentiated_cell_line_entity_id = differentiated_cell_line.id + + self.link_cell_line_and_differentiated_cell_line(access_token, cell_line_entity_id, + differentiated_cell_line_entity_id, + dataset_id, submission_envelope_id + , action) + + for library_preparation in differentiated_cell_line.library_preparations: + library_preparation_entity_id = self.handle_library_preparation( + differentiated_cell_line_entity_id, + library_preparation, + library_preparations_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors) + + for sequencing_file in library_preparation.sequencing_files: + self.handle_sequencing_file(library_preparation_entity_id, + sequencing_file, + sequencing_file_df, submission_envelope_id, dataset_id, access_token, action, errors) - for differentiated_cell_line in cell_line.differentiated_cell_lines: - differentiated_entity_id = self.handle_differentiated_cell_line(cell_line_entity_id, - differentiated_cell_line, - differentiated_cell_lines_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - - for library_preparation in differentiated_cell_line.library_preparations: - library_preparation_entity_id = self.handle_library_preparation(differentiated_entity_id, - library_preparation, - library_preparations_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - - for sequencing_file in library_preparation.sequencing_files: - self.handle_sequencing_file(library_preparation_entity_id, - sequencing_file, - sequencing_file_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - message = 'SUCCESS' except Exception as e: message = f"An error occurred: {str(e)}" diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 9c6df64..f3d7f6a 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -37,10 +37,9 @@ def get_content(unique_value): return {"content": unique_value} -def create_expression_alterations(submission_instance, submission_envelope_id, access_token, parsed_data): - expression_alterations = parsed_data['expression_alterations'] - expression_alterations_df = parsed_data['expression_alterations_df'] - +def _create_expression_alterations(submission_instance, submission_envelope_id, access_token, + expression_alterations, + expression_alterations_df): expression_alterations_entity_id_column_name = "Id" if expression_alterations_entity_id_column_name not in expression_alterations_df.columns: @@ -168,32 +167,79 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) """Process the file submission.""" parser = SpreadsheetSubmitter(self.file) parsed_data = self._parse_spreadsheet(parser) - self._validate_and_upload(parsed_data, submission_instance, list_of_files_in_upload_area) - # original expression alteration data frame - expression_alteration_df = parsed_data['expression_alterations_df'] + self._validate_and_upload(parsed_data, list_of_files_in_upload_area) + expression_alterations = parsed_data['expression_alterations'] + expression_alterations_df = parsed_data['expression_alterations_df'] parent_cell_line_name = parsed_data['parent_cell_line_name'] + cell_lines = parsed_data['cell_lines'] + cell_lines_df = parsed_data['cell_lines_df'] + differentiated_cell_lines = parsed_data['differentiated_cell_lines'] + differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] # TODO: Handle expression alterations in MODIFY if self._is_add_action(): self._create_submission_envelope(submission_instance) - print(f"Creating parental cell line with name {parent_cell_line_name}") - parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + parent_cell_line_id = None + created_expression_alterations = [] - print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + if parent_cell_line_name is not None: + print(f"Creating parental cell line with name {parent_cell_line_name}") + parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + # TODO: link parental cell line with dataset - created_expression_alterations, expression_alteration_df = self._submit_expression_alterations( - submission_instance, parsed_data) - self.link_parent_cell_line_expression_alteration( - submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations - ) + print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") - updated_dfs, message = self._perform_main_submission(submission_instance, parsed_data) + if expression_alterations and expression_alterations_df is not None: + created_expression_alterations, expression_alteration_df = self._submit_expression_alterations( + submission_instance, expression_alterations, expression_alterations_df) - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alteration_df) - else: - return self.delete_actions(self.submission_envelope_id, submission_instance, None) + if created_expression_alterations and parent_cell_line_id is not None: + self.link_parent_cell_line_expression_alteration( + submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + ) + + created_cell_lines = [] + + if cell_lines and cell_lines_df is not None: + created_cell_lines, cell_lines_df = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df) + + created_differentiated_cell_lines = [] + + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + + updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, + cell_lines_df, created_differentiated_cell_lines, + differentiated_cell_lines_df, parsed_data) + + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, expression_alterations_df) + else: + return self.delete_actions(self.submission_envelope_id, submission_instance, None) + elif self._is_modify_action(): + created_cell_lines = [] + + if cell_lines and cell_lines_df is not None: + created_cell_lines, cell_lines_df = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df) + + created_differentiated_cell_lines = [] + + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + + updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, + cell_lines_df, created_differentiated_cell_lines, + differentiated_cell_lines_df, parsed_data) + + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, expression_alterations_df) + else: + return self.delete_actions(self.submission_envelope_id, submission_instance, None) def _parse_spreadsheet(self, parser): try: @@ -236,20 +282,39 @@ def _parse_spreadsheet(self, parser): self.validation_errors.append(f"Spreadsheet is invalid {self.file}") return None - def _validate_and_upload(self, parsed_data, submission_instance, list_of_files_in_upload_area): + def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): + """ # Validate the parsed data and upload the file. validate_sequencing_files(parsed_data['sequencing_files'], list_of_files_in_upload_area, self.dataset, self.validation_errors) - + """ + """ + Handle validation errors, including interacting with the user in case of a missing sheet. + """ try: - # exit now if there are validation errors in the spreadsheet + # Exit now if there are validation errors in the spreadsheet if self.validation_errors: raise ValidationError(self.validation_errors) except ValidationError as e: - # Print the error message - print(e) - # Exit the program with a non-zero status code to indicate an error - sys.exit(1) + # Check if the error is related to a missing sheet + missing_sheet_errors = [msg for msg in self.validation_errors if "Missing sheet" in msg] + + if missing_sheet_errors: + # Extract the sheet name(s) from the errors + missing_sheets = ', '.join([msg.split("'")[1] for msg in missing_sheet_errors]) + # Ask the user whether to proceed + user_response = input( + f"A required sheet '{missing_sheets}' is missing. Do you want to proceed anyway? (yes/no): ").strip().lower() + if user_response == 'yes': + print("Proceeding with execution...") + else: + print("Execution terminated due to missing required sheet.") + sys.exit(1) + else: + # Print the error message + print(e) + # Exit the program with a non-zero status code to indicate an error + sys.exit(1) print(f"File {self.file} is validated successfully. Initiating submission") print(f"File {self.file} being uploaded to storage") @@ -283,19 +348,45 @@ def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): self.submission_envelope_id, self.access_token ) - def _submit_expression_alterations(self, submission_instance, parsed_data): + def _submit_expression_alterations(self, submission_instance, expression_alterations, expression_alterations_df): """Submit expression alterations.""" - return create_expression_alterations( + return _create_expression_alterations( submission_instance, self.submission_envelope_id, self.access_token, - parsed_data + expression_alterations, expression_alterations_df ) - def _perform_main_submission(self, submission_instance, parsed_data): + def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df): + for cell_line in cell_lines: + cell_line_entity_id = submission_instance.handle_cell_line(cell_line, None, cell_lines_df, + self.submission_envelope_id, self.dataset, + self.access_token, self.action, + self.submission_errors) + cell_line.id = cell_line_entity_id + + return cell_lines, cell_lines_df + + def _create_differentiated_cell_lines(self, submission_instance, differentiated_cell_lines, + differentiated_cell_lines_df): + for differentiated_cell_line in differentiated_cell_lines: + differentiated_cell_line_entity_id = submission_instance.handle_differentiated_cell_line(None, + differentiated_cell_line, + differentiated_cell_lines_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + differentiated_cell_line.id = differentiated_cell_line_entity_id + + return differentiated_cell_lines, differentiated_cell_lines_df + + def _perform_main_submission(self, submission_instance, created_cell_lines, cell_lines_df, + created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data): """Perform the main submission.""" # Unpack the returned values into a list and the message separately updated_dfs, message = submission_instance.multi_type_submission( - parsed_data['cell_lines'], parsed_data['expression_alterations'], parsed_data['cell_lines_df'], - parsed_data['differentiated_cell_lines_df'], parsed_data['library_preparations_df'], + created_cell_lines, parsed_data['expression_alterations'], cell_lines_df, + created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data['library_preparations_df'], parsed_data['sequencing_files_df'], self.submission_envelope_id, self.dataset, self.access_token, self.action, self.submission_errors ) @@ -306,13 +397,20 @@ def _save_and_upload_results(self, updated_dfs, expression_alteration_df): current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') output_file = f"submission_result_{current_time}.xlsx" try: + # List of updated DataFrames and corresponding sheet names + dataframes = [ + (updated_dfs[0], 'Cell line'), + (updated_dfs[1], 'Differentiated cell line'), + (updated_dfs[2], 'Library preparation'), + (updated_dfs[3], 'Sequence file'), + (expression_alteration_df, 'Expression alteration strategy') + ] + + # Create the Excel file and write only non-null DataFrames with pd.ExcelWriter(output_file, engine='openpyxl') as writer: - updated_dfs[0].to_excel(writer, sheet_name='Cell line', index=False) - updated_dfs[1].to_excel(writer, sheet_name='Differentiated cell line', index=False) - updated_dfs[2].to_excel(writer, sheet_name='Library preparation', index=False) - updated_dfs[3].to_excel(writer, sheet_name='Sequence file', index=False) - expression_alteration_df.to_excel(writer, sheet_name='Expression alteration strategy', index=False) - + for df, sheet_name in dataframes: + if df is not None: # Check if the DataFrame is not None + df.to_excel(writer, sheet_name=sheet_name, index=False) if os.path.exists(output_file): CmdUpload(self.aws, self.args).upload_file(self.dataset, output_file, os.path.basename(output_file)) print(f"File {output_file} uploaded successfully.") diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index a7863d7..28dad58 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -892,14 +892,29 @@ def parse_expression_alteration(self, sheet_name, action, errors): ----------- sheet_name : str The name of the sheet containing expression alterations data. + action : str + The action to be performed on the data. + errors : list + A list to accumulate error messages. Returns: -------- - list - A list of ExpressionAlterationStrategy objects parsed from the specified sheet. + tuple + A tuple containing: + - A list of ExpressionAlterationStrategy objects parsed from the specified sheet (if valid) + - The filtered DataFrame of the parsed data + - A boolean indicating whether the expression alteration strategy sheet exists and is valid """ - df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + # Attempt to parse the input file into a DataFrame + try: + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + except Exception as e: + errors.append(f"Missing sheet '{sheet_name}': {e}") + return [], None + + # Strip whitespace from column names df.columns = df.columns.str.strip() + # Check if the required column exists required_columns = ['expression_alteration_id'] missing_columns = [col for col in required_columns if col not in df.columns] @@ -907,23 +922,26 @@ def parse_expression_alteration(self, sheet_name, action, errors): if missing_columns: errors.append( f"The following required columns are missing in the Expression Alteration Strategy sheet: {', '.join(missing_columns)}") - return [], df # Return early if required columns are missing + return None, df, False # Return if required columns are missing # Filter rows where 'expression_alteration_id' is not null df = df[df['expression_alteration_id'].notna()] + # Replace invalid float values (e.g., NaN, infinite) with None df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) - # Define unwanted patterns + + # Define unwanted patterns to filter out unwanted rows unwanted_patterns = ( 'FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the gene expression alteration instance..', 'ID should have no spaces. For example: JAXPE0001_MEIS1, MSKKI119_MEF2C, NWU_AID' ) + # Create a mask to filter out rows with unwanted starting values mask = df['expression_alteration_id'].astype(str).str.startswith(unwanted_patterns) df_filtered = df[~mask] - # Create ExpressionAlterationStrategy objects + # Initialize the list of ExpressionAlterationStrategy objects expression_alterations = [] for _, row in df_filtered.iterrows(): @@ -944,6 +962,7 @@ def parse_expression_alteration(self, sheet_name, action, errors): ) ) + # Return the list of objects, the filtered DataFrame, and a flag indicating success return expression_alterations, df_filtered def get_cell_lines(self, sheet_name, action, errors): @@ -981,9 +1000,8 @@ def get_differentiated_cell_lines(self, sheet_name, action, errors): list A list of DifferentiatedCellLine objects parsed from the specified sheet. """ - differentiated_cell_lines, differentiated_cell_lines_df = (self. - parse_differentiated_cell_lines - (sheet_name, action, errors)) + differentiated_cell_lines, differentiated_cell_lines_df = self.parse_differentiated_cell_lines(sheet_name, + action, errors) return differentiated_cell_lines, differentiated_cell_lines_df def get_library_preparations(self, sheet_name, action, errors): From 1940cdd040dec66b56d0cdef6a05faa93ad925d4 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 2 Sep 2024 14:42:44 +0100 Subject: [PATCH 2/8] code improvements --- ait/commons/util/command/submit.py | 415 ++++++++++++++++-------- ait/commons/util/command/submit_file.py | 138 +++++--- ait/commons/util/command/view.py | 16 +- ait/commons/util/provider_api_util.py | 22 +- 4 files changed, 389 insertions(+), 202 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 397290e..0905775 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -216,7 +216,8 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, submission_envelope_id, dataset_id, + def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, + submission_envelope_id, dataset_id, access_token, action, errors): """ Submits a cell line as a biomaterial entity to a specified submission envelope. @@ -233,7 +234,7 @@ def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, sub - cell_line_entity_id: Entity ID of the submitted or modified cell line biomaterial. """ if action.lower() == 'modify': - success = self.patchEntity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) + success = self.patch_entity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) if success: print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, @@ -313,9 +314,9 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce - access_token: Access token for authentication and authorization. """ if action.lower() == 'modify': - success = self.patchEntity('biomaterial', differentiated_cell_line.id, - differentiated_cell_line.to_dict(), - access_token) + success = self.patch_entity('biomaterial', differentiated_cell_line.id, + differentiated_cell_line.to_dict(), + access_token) if success: print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") @@ -424,7 +425,8 @@ def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_en The ID of the differentiation process entity created. """ if action.lower() != 'modify': - print("Cell line has differentiated cell lines, creating differentiation process to link them") + print(f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " + f"to link them") # Create a differentiation process entity differentiation_process_entity_id = self.create_process( @@ -470,9 +472,9 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati - access_token: Access token for authentication and authorization. """ if action.lower() == 'modify': - success = self.patchEntity('biomaterial', library_preparation.id, - library_preparation.to_dict(), - access_token) + success = self.patch_entity('biomaterial', library_preparation.id, + library_preparation.to_dict(), + access_token) if success: print(f"Updated library preparation biomaterial: {library_preparation.id} / " f"{library_preparation.biomaterial_id}") @@ -498,54 +500,128 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati def create_library_preparation_entity(self, access_token, dataset_id, differentiated_entity_id, library_preparation, submission_envelope_id): - print(f"Creating Library Preparation for Differentiated Cell Line Biomaterial: " - f"{differentiated_entity_id}") + """ + Creates a Library Preparation entity for the Differentiated Cell Line and links it to the submission envelope and dataset. - library_preparation_entity_id = self.create_child_biomaterial( - differentiated_entity_id, - library_preparation.to_dict(), - access_token - ) + Parameters: + ----------- + access_token : str + The authentication token. + dataset_id : str + The dataset ID to link with. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + library_preparation : object + The library preparation object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. - print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to envelope: {submission_envelope_id}") + Returns: + -------- + str + The ID of the created library preparation entity. + """ + if differentiated_entity_id is not None: + print( + f"Creating Library Preparation as child of Differentiated Cell Line Biomaterial: {differentiated_entity_id}") - self.link_entity_to_envelope( - 'biomaterial', - library_preparation_entity_id, - submission_envelope_id, - access_token - ) + # Create the library preparation biomaterial + library_preparation_entity_id = self.create_child_biomaterial( + differentiated_entity_id, + library_preparation.to_dict(), + access_token + ) - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"to dataset: {dataset_id}") + print(f"Created Library Preparation Biomaterial: {library_preparation_entity_id}") - self.link_to_dataset('biomaterial', dataset_id, - library_preparation_entity_id, access_token) + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} to envelope: {submission_envelope_id}") - print(f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} " - f"as input to library preparation process") - - library_preparation_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('library_preparation'), - submission_envelope_id) - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) + # Link the library preparation to the submission envelope + self.link_entity_to_envelope( + 'biomaterial', + library_preparation_entity_id, + submission_envelope_id, + access_token + ) + else: + print(f"Creating Library preparation Biomaterial: {library_preparation.biomaterial_id}") + library_preparation_entity_id = self.use_existing_envelope_and_submit_entity( + 'biomaterial', + library_preparation.to_dict(), + submission_envelope_id, + access_token + ) - print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} " - f"as derived by library preparation process") + print(f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} to dataset: {dataset_id}") - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) + # Link the library preparation to the dataset + self.link_to_dataset('biomaterial', dataset_id, library_preparation_entity_id, access_token) return library_preparation_entity_id + def link_differentiated_and_library_preparation(self, + access_token, + differentiated_entity_id, + library_preparation_entity_id, + dataset_id, + submission_envelope_id, + action): + """ + Links the Differentiated Cell Line to the Library Preparation through a library preparation process. + + Parameters: + ----------- + access_token : str + The authentication token. + differentiated_entity_id : str + The ID of the differentiated cell line entity. + library_preparation_entity_id : str + The ID of the library preparation entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. + + Returns: + -------- + str + The ID of the library preparation process entity created. + """ + if action.lower() != 'modify': + print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " + f"preparation process to link them") + + # Create a library preparation process entity + library_preparation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('library_preparation'), + submission_envelope_id + ) + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " + f"preparation process") + + # Link the differentiated cell line entity as input to the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " + f"preparation process") + + # Link the library preparation entity as derived by the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + return library_preparation_process_entity_id + def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, sequencing_file_df, submission_envelope_id, dataset_id, access_token, action, errors): @@ -561,67 +637,131 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, - access_token: Access token for authentication and authorization. """ if action.lower() == 'modify': - success = self.patchEntity('file', sequencing_file.id, - sequencing_file.to_dict(), - access_token) + success = self.patch_entity('file', sequencing_file.id, + sequencing_file.to_dict(), + access_token) if success: - print(f"Updated sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + print(f"Updated sequencing file: {sequencing_file.id} / " + f"{sequencing_file.file_name}") + + update_dataframe(sequencing_file_df, sequencing_file.id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') else: errors.append(f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + + return sequencing_file.id else: - print("Creating sequencing process to link the sequencing file") + sequencing_file_entity_id = self.create_sequencing_file_entity(access_token, + dataset_id, + library_preparation_entity_id, + sequencing_file, + submission_envelope_id) + update_dataframe(sequencing_file_df, sequencing_file_entity_id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') + + return sequencing_file_entity_id + + def create_sequencing_file_entity(self, access_token, dataset_id, library_preparation_entity_id, sequencing_file, + submission_envelope_id): + """ + Creates a Sequencing File entity for the Library Preparation and links it to the submission envelope and dataset. - sequencing_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('sequencing'), - submission_envelope_id) + Parameters: + ----------- + library_preparation_entity_id : str + The ID of the library preparation entity. + sequencing_file : object + The sequencing file object containing details for creation. + submission_envelope_id : str + The ID of the submission envelope. + dataset_id : str + The dataset ID to link with. + access_token : str + The authentication token. - sequencing_file_entity_id_column_name = "Id" + Returns: + -------- + str + The ID of the created sequencing file entity. + """ - if sequencing_file_entity_id_column_name not in sequencing_file_df.columns: - sequencing_file_df[sequencing_file_entity_id_column_name] = np.nan + print( + f"Creating Sequencing file: {sequencing_file.file_name} as a result of sequencing the Library preparation " + f"biomaterial: {library_preparation_entity_id}") - print(f"Creating Sequencing file: {sequencing_file.file_name} " - f"as a result of sequencing the Library preparation biomaterial: {library_preparation_entity_id}") + sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( + 'file', + sequencing_file.to_dict(), + submission_envelope_id, + access_token + ) - sequencing_file_entity_id = self.use_existing_envelope_and_submit_entity( - 'file', - sequencing_file.to_dict(), - submission_envelope_id, - access_token - ) + print(f"Linking sequencing file: {sequencing_file_entity_id} to dataset: {dataset_id}") + + self.link_to_dataset('file', dataset_id, sequencing_file_entity_id, access_token) - print(f"Created Sequencing file: {sequencing_file_entity_id}") + return sequencing_file_entity_id + + def link_library_preparation_and_sequencing_file(self, + access_token, + library_preparation_entity_id, + sequencing_file_entity_id, + dataset_id, + submission_envelope_id, + action): + """ + Links the Library Preparation to the Sequencing File through a sequencing process. + + Parameters: + ----------- + library_preparation_entity_id : str + The ID of the library preparation entity. + sequencing_file_entity_id : str + The ID of the sequencing file entity. + dataset_id : str + The dataset ID to link with. + submission_envelope_id : str + The ID of the submission envelope. + access_token : str + The authentication token. - print(f"Linking sequencing file: {sequencing_file_entity_id} to dataset: {dataset_id}") + Returns: + -------- + str + The ID of the sequencing process entity created. + """ + if action.lower() != 'modify': + print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." + f"Creating sequencing process to link the sequencing file") - self.link_to_dataset('file', dataset_id, - sequencing_file_entity_id, access_token) + # Create a sequencing process entity + sequencing_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('sequencing'), + submission_envelope_id) - print(f"Linking Library preparation Biomaterial: {library_preparation_entity_id} " - f"as input to process: {sequencing_process_entity_id}") + print( + f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") + # Link the library preparation entity as input to the sequencing process self.perform_hal_linkage( f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", sequencing_process_entity_id, 'processes', access_token ) - print(f"Linking Sequencing file: {sequencing_file_entity_id} " - f"as derived by process: {sequencing_process_entity_id}") + print( + f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") + # Link the sequencing file entity as derived by the sequencing process self.perform_hal_linkage( f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", sequencing_process_entity_id, 'processes', access_token ) - sequencing_file_df[sequencing_file_entity_id_column_name] = sequencing_file_df[ - sequencing_file_entity_id_column_name].astype(object) - - sequencing_file_df.loc[ - sequencing_file_df['sequence_file.file_core.file_name'] == sequencing_file.file_name, - sequencing_file_entity_id_column_name - ] = sequencing_file_entity_id + return sequencing_process_entity_id def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): process_entity_id = self.use_existing_envelope_and_submit_entity( @@ -638,19 +778,20 @@ def create_process(self, access_token, dataset_id, process_data, submission_enve return process_entity_id - def multi_type_submission(self, - cell_lines, - expression_alterations, - cell_lines_df, - differentiated_cell_lines, - differentiated_cell_lines_df, - library_preparations_df, - sequencing_file_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors): + def establish_links(self, + cell_lines, + cell_lines_df, + differentiated_cell_lines, + differentiated_cell_lines_df, + library_preparations, + library_preparations_df, + sequencing_files, + sequencing_files_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Handles the submission of multiple types of biomaterials (cell lines, differentiated cell lines, library preparations) @@ -670,38 +811,34 @@ def multi_type_submission(self, """ try: for cell_line in cell_lines: - - if cell_line.id is not None: - cell_line_entity_id = cell_line.id - - for differentiated_cell_line in cell_line.differentiated_cell_lines: - differentiated_cell_line_entity_id = differentiated_cell_line.id - - self.link_cell_line_and_differentiated_cell_line(access_token, cell_line_entity_id, - differentiated_cell_line_entity_id, - dataset_id, submission_envelope_id - , action) - - for library_preparation in differentiated_cell_line.library_preparations: - library_preparation_entity_id = self.handle_library_preparation( - differentiated_cell_line_entity_id, - library_preparation, - library_preparations_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) - - for sequencing_file in library_preparation.sequencing_files: - self.handle_sequencing_file(library_preparation_entity_id, - sequencing_file, - sequencing_file_df, - submission_envelope_id, - dataset_id, - access_token, - action, - errors) + for differentiated_cell_line in differentiated_cell_lines: + if cell_line.biomaterial_id == differentiated_cell_line.input_biomaterial_id: + self.link_cell_line_and_differentiated_cell_line(access_token, + cell_line.id, + differentiated_cell_line.id, + dataset_id, + submission_envelope_id, + action) + for differentiated_cell_line in differentiated_cell_lines: + for library_preparation in library_preparations: + if differentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: + self.link_differentiated_and_library_preparation( + access_token, + differentiated_cell_line.id, + library_preparation.id, + dataset_id, + submission_envelope_id, + action) + + for library_preparation in library_preparations: + for sequencing_file in sequencing_files: + if library_preparation.biomaterial_id == sequencing_file.library_preparation_id: + self.link_library_preparation_and_sequencing_file(access_token, + library_preparation.id, + sequencing_file.id, + dataset_id, + submission_envelope_id, + action) message = 'SUCCESS' except Exception as e: @@ -712,12 +849,12 @@ def multi_type_submission(self, cell_lines_df = None differentiated_cell_lines_df = None library_preparations_df = None - sequencing_file_df = None + sequencing_files_df = None return ([cell_lines_df, differentiated_cell_lines_df, library_preparations_df, - sequencing_file_df], message) + sequencing_files_df], message) def typed_submission(self, type, file, access_token): """ @@ -801,7 +938,7 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ return entity_id - def patchEntity(self, input_entity_type, id, data, access_token): + def patch_entity(self, input_entity_type, id, data, access_token): entity_map = { 'study': 'studies', 'dataset': 'datasets', @@ -829,7 +966,7 @@ def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token return False put_url = f"{self.base_url}/datasets/{dataset_id}/{hal_entity}/{entity_id}" - return self.provider_api.put_to_provider_api(put_url, access_token) + return self.provider_api.put(put_url, access_token) def patch_to_provider_api(self, entity_patch_url, data, access_token): headers = { @@ -885,7 +1022,7 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): print(f"Linking dataset {dataset_id} to study {study_id}") url = f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) print(f"Dataset linked successfully to study: {study_id}") @@ -901,7 +1038,7 @@ def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): print(f"Linking biomaterial {biomaterial_id} to dataset {dataset_id}") url = f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) print(f"Biomaterial linked successfully to dataset: {dataset_id}") @@ -987,10 +1124,10 @@ def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, acces """ if type == 'biomaterial': url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/biomaterials/{entity_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) elif type == 'file': url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/files/{entity_id}" - self.provider_api.put_to_provider_api(url, access_token) + self.provider_api.put(url, access_token) def delete_dataset(self, dataset, access_token): """ @@ -1000,7 +1137,7 @@ def delete_dataset(self, dataset, access_token): dataset (str): The ID of the dataset to delete. access_token (str): Access token for authorization. """ - fetched_dataset = self.provider_api.get_to_provider_api(f"{self.base_url}/datasets/{dataset}", access_token) + fetched_dataset = self.provider_api.get(f"{self.base_url}/datasets/{dataset}", access_token) print(f"Dataset fetched successfully: {dataset}") print(f"Initiating delete of {dataset}") @@ -1011,17 +1148,17 @@ def delete_dataset(self, dataset, access_token): print("Deleting Biomaterials:") for biomaterial in biomaterials: print(f"Deleting {biomaterial}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/biomaterials/{biomaterial}", access_token) + self.provider_api.delete(f"{self.base_url}/biomaterials/{biomaterial}", access_token) print("\nDeleting Processes:") for process in processes: print(f"Deleting {process}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/processes/{process}", access_token) + self.provider_api.delete(f"{self.base_url}/processes/{process}", access_token) print("\nDeleting Data Files:") for data_file in data_files: print(f"Deleting {data_file}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/files/{data_file}", access_token) + self.provider_api.delete(f"{self.base_url}/files/{data_file}", access_token) print(f"\nDeleting the dataset: {dataset}") - self.provider_api.delete_to_provider_api(f"{self.base_url}/datasets/{dataset}", access_token) + self.provider_api.delete(f"{self.base_url}/datasets/{dataset}", access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index f3d7f6a..62e986a 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -175,13 +175,18 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) cell_lines_df = parsed_data['cell_lines_df'] differentiated_cell_lines = parsed_data['differentiated_cell_lines'] differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] + library_preparations = parsed_data['library_preparations'] + library_preparations_df = parsed_data['library_preparations_df'] + sequencing_files = parsed_data['sequencing_files'] + sequencing_files_df = parsed_data['sequencing_files_df'] # TODO: Handle expression alterations in MODIFY + created_expression_alterations = [] + if self._is_add_action(): - self._create_submission_envelope(submission_instance) + self._create_submission_envelope() parent_cell_line_id = None - created_expression_alterations = [] if parent_cell_line_name is not None: print(f"Creating parental cell line with name {parent_cell_line_name}") @@ -199,47 +204,44 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations ) - created_cell_lines = [] + created_cell_lines = [] - if cell_lines and cell_lines_df is not None: - created_cell_lines, cell_lines_df = self._create_cell_lines( - submission_instance, cell_lines, cell_lines_df) + if cell_lines and cell_lines_df is not None: + created_cell_lines, cell_lines_df = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df, created_expression_alterations) - created_differentiated_cell_lines = [] + created_differentiated_cell_lines = [] - if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( - submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) - updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, - cell_lines_df, created_differentiated_cell_lines, - differentiated_cell_lines_df, parsed_data) + created_library_preparations = [] - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) - else: - return self.delete_actions(self.submission_envelope_id, submission_instance, None) - elif self._is_modify_action(): - created_cell_lines = [] - - if cell_lines and cell_lines_df is not None: - created_cell_lines, cell_lines_df = self._create_cell_lines( - submission_instance, cell_lines, cell_lines_df) + if library_preparations and library_preparations_df is not None: + created_library_preparations, library_preparations_df = self._create_library_preparations( + submission_instance, library_preparations, library_preparations_df) - created_differentiated_cell_lines = [] + created_sequencing_files = [] - if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( - submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + if sequencing_files and sequencing_files_df is not None: + created_sequencing_files, sequencing_files_df = self._create_sequencing_files( + submission_instance, sequencing_files, sequencing_files_df) - updated_dfs, message = self._perform_main_submission(submission_instance, created_cell_lines, - cell_lines_df, created_differentiated_cell_lines, - differentiated_cell_lines_df, parsed_data) + updated_dfs, message = self._establish_links(submission_instance, + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df) - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) - else: - return self.delete_actions(self.submission_envelope_id, submission_instance, None) + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, expression_alterations_df) + else: + return self.delete_actions(self.submission_envelope_id, submission_instance, None) def _parse_spreadsheet(self, parser): try: @@ -330,7 +332,7 @@ def _is_modify_action(self): """Check if the current action is 'MODIFY'.""" return self.action.lower() == 'modify' - def _create_submission_envelope(self, submission_instance): + def _create_submission_envelope(self): """Create a new submission envelope.""" submission_envelope_response, status_code = create_new_submission_envelope( self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token @@ -355,9 +357,9 @@ def _submit_expression_alterations(self, submission_instance, expression_alterat expression_alterations, expression_alterations_df ) - def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df): + def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df, expression_alterations): for cell_line in cell_lines: - cell_line_entity_id = submission_instance.handle_cell_line(cell_line, None, cell_lines_df, + cell_line_entity_id = submission_instance.handle_cell_line(cell_line, expression_alterations, cell_lines_df, self.submission_envelope_id, self.dataset, self.access_token, self.action, self.submission_errors) @@ -380,16 +382,64 @@ def _create_differentiated_cell_lines(self, submission_instance, differentiated_ return differentiated_cell_lines, differentiated_cell_lines_df - def _perform_main_submission(self, submission_instance, created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data): + def _create_library_preparations(self, submission_instance, library_preparations, + library_preparations_df): + for library_preparation in library_preparations: + library_preparation_entity_id = submission_instance.handle_library_preparation(None, + library_preparation, + library_preparations_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + library_preparation.id = library_preparation_entity_id + + return library_preparations, library_preparations_df + + def _create_sequencing_files(self, submission_instance, sequencing_files, + sequencing_files_df): + for sequencing_file in sequencing_files: + sequencing_file_entity_id = submission_instance.handle_sequencing_file(None, + sequencing_file, + sequencing_files_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors) + sequencing_file.id = sequencing_file_entity_id + + return sequencing_files, sequencing_files_df + + def _establish_links(self, + submission_instance, + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df): """Perform the main submission.""" # Unpack the returned values into a list and the message separately - updated_dfs, message = submission_instance.multi_type_submission( - created_cell_lines, parsed_data['expression_alterations'], cell_lines_df, - created_differentiated_cell_lines, differentiated_cell_lines_df, parsed_data['library_preparations_df'], - parsed_data['sequencing_files_df'], self.submission_envelope_id, - self.dataset, self.access_token, self.action, self.submission_errors + updated_dfs, message = submission_instance.establish_links( + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df, + self.submission_envelope_id, + self.dataset, + self.access_token, + self.action, + self.submission_errors ) + return updated_dfs, message def _save_and_upload_results(self, updated_dfs, expression_alteration_df): diff --git a/ait/commons/util/command/view.py b/ait/commons/util/command/view.py index adf1a89..aa8fc74 100644 --- a/ait/commons/util/command/view.py +++ b/ait/commons/util/command/view.py @@ -18,8 +18,8 @@ def __init__(self, args): print("Dataset is mandatory for view") def run(self): - fetched_dataset = self.provider_api.get_to_provider_api(f"{self.base_url}/datasets/{self.dataset}", - self.access_token) + fetched_dataset = self.provider_api.get(f"{self.base_url}/datasets/{self.dataset}", + self.access_token) print(f"Dataset fetched successfully: {self.dataset}") print("Getting Biomaterials") biomaterials = fetched_dataset.get('biomaterials', []) @@ -27,8 +27,8 @@ def run(self): for biomaterial in biomaterials: print(biomaterial) - fetched_biomaterial = self.provider_api.get_to_provider_api(f"{self.base_url}/biomaterials/{biomaterial}", - self.access_token) + fetched_biomaterial = self.provider_api.get(f"{self.base_url}/biomaterials/{biomaterial}", + self.access_token) print(fetched_biomaterial) print("Getting Processes") @@ -37,8 +37,8 @@ def run(self): for process in processes: print(process) - fetched_process = self.provider_api.get_to_provider_api(f"{self.base_url}/processes/{process}", - self.access_token) + fetched_process = self.provider_api.get(f"{self.base_url}/processes/{process}", + self.access_token) print(fetched_process) print("Getting Data Files") @@ -47,8 +47,8 @@ def run(self): for file in files: print(files) - fetched_file = self.provider_api.get_to_provider_api(f"{self.base_url}/files/{file}", - self.access_token) + fetched_file = self.provider_api.get(f"{self.base_url}/files/{file}", + self.access_token) print(fetched_file) return True, "FETCHED SUCCESSFULLY" diff --git a/ait/commons/util/provider_api_util.py b/ait/commons/util/provider_api_util.py index 774dad6..851b052 100644 --- a/ait/commons/util/provider_api_util.py +++ b/ait/commons/util/provider_api_util.py @@ -5,7 +5,7 @@ class APIProvider: def __init__(self, base_url): self.base_url = base_url - def send_request(self, method, url, access_token, params=None, data=None, data_type_in_hal_link=None): + def request(self, method, url, access_token, params=None, data=None, data_type_in_hal_link=None): """ Sends an HTTP request to the specified URL with the given method. @@ -74,18 +74,18 @@ def send_request(self, method, url, access_token, params=None, data=None, data_t # Return the JSON-parsed response data for other successful requests return response.json() - def put_to_provider_api(self, url, access_token): - return self.send_request('PUT', url, access_token) + def put(self, url, access_token): + return self.request('PUT', url, access_token) - def get_to_provider_api(self, url, access_token): - return self.send_request('GET', url, access_token) + def get(self, url, access_token): + return self.request('GET', url, access_token) - def delete_to_provider_api_including_linked_entities(self, url, access_token, delete_linked_entities=False): + def delete_with_relations(self, url, access_token, delete_linked_entities=False): params = {'deleteLinkedEntities': str(delete_linked_entities).lower()} - return self.send_request('DELETE', url, access_token, params=params) + return self.request('DELETE', url, access_token, params=params) - def delete_to_provider_api(self, url, access_token): - return self.send_request('DELETE', url, access_token) + def delete(self, url, access_token): + return self.request('DELETE', url, access_token) - def post_to_provider_api(self, url, data_type_in_hal_link, data, access_token): - return self.send_request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link) + def post(self, url, data_type_in_hal_link, data, access_token): + return self.request('POST', url, access_token, data=data, data_type_in_hal_link=data_type_in_hal_link) From 03976d6d4a43025167f7d57c998ec16173f8f2ab Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Tue, 3 Sep 2024 11:35:39 +0100 Subject: [PATCH 3/8] handling md5 checksums and new type of sheet having clonal and undifferentiated --- ait/commons/util/command/list.py | 27 ++++++++++++++------ ait/commons/util/command/submit_file.py | 34 ++++++++++++++++++++++--- ait/commons/util/command/upload.py | 17 ++++++++++++- 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/ait/commons/util/command/list.py b/ait/commons/util/command/list.py index c28c7a7..ef5261e 100644 --- a/ait/commons/util/command/list.py +++ b/ait/commons/util/command/list.py @@ -10,6 +10,10 @@ def print_area(k, area): p = area.get('perms') or '' print(p.ljust(3), end=' ') + if 'md5' in area: + p = area.get('md5') or '' + print(p.ljust(3), end=' ') + if 'name' in area: n = area.get('name') print(f'{n}' if n else '', end=' ') @@ -44,13 +48,12 @@ def run(self): def list_bucket_contents(self, selected_area, prefix=''): result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/', Prefix=prefix) - # Folders dirs = result.get('CommonPrefixes', []) for d in dirs: k = d.get('Prefix') - print_area(k, {'key': k, 'perms': 'dir'}) + print_area(k, {'key': k, 'md5': None, 'perms': 'dir'}) self.list_bucket_contents(selected_area, prefix=k) # Files @@ -58,7 +61,10 @@ def list_bucket_contents(self, selected_area, prefix=''): for f in files: k = f.get('Key') - print_area(k, {'key': k, 'perms': 'file'}) + head_object_response = self.s3_cli.head_object(Bucket=selected_area, Key=k) + metadata = head_object_response.get('Metadata', {}) + hash_md5 = metadata.get('md5', 'MD5 checksum not found') + print_area(k, {'key': k, 'md5': hash_md5, 'perms': 'file'}) def list_bucket_contents_and_return(self, selected_area, prefix=''): """ @@ -71,26 +77,31 @@ def list_bucket_contents_and_return(self, selected_area, prefix=''): Returns: - A list of file keys in the bucket. """ - file_keys = [] + file_keys = [] # Initialize an empty list to store file keys. + # Define the recursive function to list bucket contents. def _list_bucket_contents(bucket, prefix): + # Call AWS S3 API to list objects with a specific prefix. result = self.s3_cli.list_objects_v2(Bucket=bucket, Delimiter='/', Prefix=prefix) - # Folders + # Handle directories (folders) first. dirs = result.get('CommonPrefixes', []) for d in dirs: k = d.get('Prefix') - # print_area(k, {'key': k, 'perms': 'dir'}) + # Recursively call the function to list contents of the subdirectory. _list_bucket_contents(bucket, prefix=k) - # Files + # Handle files at the current prefix level. files = result.get('Contents', []) for f in files: k = f.get('Key') - # print_area(k, {'key': k, 'perms': 'file'}) + # Add each file key to the list. file_keys.append(k) + # Start the recursive process to list all contents from the given prefix. _list_bucket_contents(selected_area, prefix) + + # Return the final list of all file keys found in the bucket. return file_keys diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 62e986a..aaf8f7e 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -182,7 +182,7 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) # TODO: Handle expression alterations in MODIFY created_expression_alterations = [] - + if self._is_add_action(): self._create_submission_envelope() @@ -245,26 +245,54 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) def _parse_spreadsheet(self, parser): try: + tab_names = parser.list_sheets() + cell_line_sheet_name = None + differentiated_cell_line_sheet_name = None + + if "Cell line" in tab_names: + cell_line_sheet_name = "Cell line" + elif "Clonal cell line" in tab_names: + cell_line_sheet_name = "Clonal cell line" + else: + self.validation_errors.append("Spreadsheet must contain a " + "'Cell line' or 'Clonal cell line' sheet.") + + if "Differentiated cell line" in tab_names: + differentiated_cell_line_sheet_name = "Differentiated cell line" + # elif "Undifferentiated product" in tab_names: + # differentiated_cell_line_sheet_name = "Undifferentiated product" + elif "Differentiated product" in tab_names: + differentiated_cell_line_sheet_name = "Differentiated product" + else: + self.validation_errors.append("Spreadsheet must contain a " + "'Differentiated cell line' or 'Undifferentiated product' " + "or 'Differentiated product' sheet.") + """Parse the spreadsheet into different sections.""" expression_alterations, expression_alterations_df = parser.get_expression_alterations( 'Expression alteration strategy', self.action, self.validation_errors ) + cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( - 'Cell line', self.action, self.validation_errors + cell_line_sheet_name, self.action, self.validation_errors ) + differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - 'Differentiated cell line', self.action, self.validation_errors + differentiated_cell_line_sheet_name, self.action, self.validation_errors ) + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( 'Library preparation', self.action, self.validation_errors ) + merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, self.validation_errors) sequencing_files, sequencing_files_df = parser.get_sequencing_files( 'Sequence file', self.action, self.validation_errors ) + merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) return { diff --git a/ait/commons/util/command/upload.py b/ait/commons/util/command/upload.py index 50eb4ea..5285522 100755 --- a/ait/commons/util/command/upload.py +++ b/ait/commons/util/command/upload.py @@ -1,3 +1,4 @@ +import hashlib import os import filetype @@ -9,6 +10,16 @@ from ait.commons.util.progress_bar import ProgressBar +def compute_md5(file_path): + """Compute the MD5 hash of the file.""" + hash_md5 = hashlib.md5() + + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + class CmdUpload: """ admin and user @@ -21,6 +32,8 @@ def __init__(self, aws, args): self.files = [] def upload_file(self, selected_area, data_file, destination_file): + hash_md5 = compute_md5(data_file) + print(f"MD5 hash of {data_file} is {hash_md5}") overwrite = getattr(self.args, 'o', False) file_size = os.path.getsize(data_file) @@ -45,7 +58,9 @@ def upload_file(self, selected_area, data_file, destination_file): s3.Bucket(selected_area).upload_file(Filename=data_file, Key=destination_file, Callback=ProgressBar(target=data_file, total=file_size), - ExtraArgs={'ContentType': content_type} + ExtraArgs={'ContentType': content_type, + 'Metadata': {'md5': hash_md5} + } ) def upload_files(self, data_files, prefix): From 15b5e900ec488f4eabf5b4c3b506bd7031b207df Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 4 Sep 2024 16:01:39 +0100 Subject: [PATCH 4/8] better error handling --- ait/commons/util/command/submit.py | 493 ++++++++++++++---------- ait/commons/util/command/submit_file.py | 309 +++++++++------ ait/commons/util/spreadsheet_util.py | 285 ++++++++++++-- 3 files changed, 734 insertions(+), 353 deletions(-) diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 0905775..6d0b5c5 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -7,6 +7,7 @@ import numpy as np from urllib.parse import urlparse +from ait.commons.util.spreadsheet_util import SubmissionError from ait.commons.util.user_profile import get_profile from ait.commons.util.provider_api_util import APIProvider @@ -216,9 +217,15 @@ def run(self): """ return self.typed_submission(self.type, self.file, self.access_token) - def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, - submission_envelope_id, dataset_id, - access_token, action, errors): + def handle_cell_line(self, + cell_line, + expression_alterations, + cell_lines_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Submits a cell line as a biomaterial entity to a specified submission envelope. @@ -234,22 +241,34 @@ def handle_cell_line(self, cell_line, expression_alterations, cell_lines_df, - cell_line_entity_id: Entity ID of the submitted or modified cell line biomaterial. """ if action.lower() == 'modify': - success = self.patch_entity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) - if success: - print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") - update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, - 'cell_line.biomaterial_core.biomaterial_id') - else: + try: + success = self.patch_entity('biomaterial', cell_line.id, cell_line.to_dict(), access_token) + if success: + print(f"Updated cell line: {cell_line.id} / {cell_line.biomaterial_id}") + update_dataframe(cell_lines_df, cell_line.id, cell_line.biomaterial_id, + 'cell_line.biomaterial_core.biomaterial_id') + return cell_line.id + else: + errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update cell line: {cell_line.id} / {cell_line.biomaterial_id}") - return cell_line.id + raise SubmissionError(errors, e) else: - cell_line_entity_id = self.create_cell_line_entity(cell_line, expression_alterations, - submission_envelope_id, dataset_id, access_token) - update_dataframe(cell_lines_df, cell_line_entity_id, cell_line.biomaterial_id, - 'cell_line.biomaterial_core.biomaterial_id') - return cell_line_entity_id - - def create_cell_line_entity(self, cell_line, expression_alterations, submission_envelope_id, + try: + cell_line_entity_id = self.create_cell_line_entity(cell_line, expression_alterations, + submission_envelope_id, dataset_id, access_token) + update_dataframe(cell_lines_df, cell_line_entity_id, cell_line.biomaterial_id, + 'cell_line.biomaterial_core.biomaterial_id') + return cell_line_entity_id + except Exception as e: + errors.append(f"Failed to create cell line: {cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + + def create_cell_line_entity(self, + cell_line, + expression_alterations, + submission_envelope_id, dataset_id, access_token): """ @@ -284,7 +303,10 @@ def create_cell_line_entity(self, cell_line, expression_alterations, submission_ return cell_line_entity_id - def link_cell_line_with_expression_alterations(self, access_token, cell_line, cell_line_entity_id, + def link_cell_line_with_expression_alterations(self, + access_token, + cell_line, + cell_line_entity_id, expression_alterations): for expression_alteration in expression_alterations: if cell_line.expression_alteration_id is not None: @@ -297,9 +319,15 @@ def link_cell_line_with_expression_alterations(self, access_token, cell_line, ce expression_alteration.id, 'processes', access_token ) - def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_cell_line, - differentiated_cell_lines_df, submission_envelope_id, dataset_id, - access_token, action, errors): + def handle_differentiated_cell_line(self, + cell_line_entity_id, + differentiated_cell_line, + differentiated_cell_lines_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Handles a single differentiated cell line associated with a given cell line. @@ -314,32 +342,48 @@ def handle_differentiated_cell_line(self, cell_line_entity_id, differentiated_ce - access_token: Access token for authentication and authorization. """ if action.lower() == 'modify': - success = self.patch_entity('biomaterial', differentiated_cell_line.id, - differentiated_cell_line.to_dict(), - access_token) - if success: - print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " - f"{differentiated_cell_line.biomaterial_id}") - - update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, - differentiated_cell_line.biomaterial_id, - 'differentiated_cell_line.biomaterial_core.biomaterial_id') - else: + try: + success = self.patch_entity('biomaterial', differentiated_cell_line.id, + differentiated_cell_line.to_dict(), + access_token) + if success: + print(f"Updated differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") + + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line.id, + differentiated_cell_line.biomaterial_id, + 'differentiated_cell_line.biomaterial_core.biomaterial_id') + return differentiated_cell_line.id + else: + errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " + f"{differentiated_cell_line.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update differentiated cell line: {differentiated_cell_line.id} / " f"{differentiated_cell_line.biomaterial_id}") - return differentiated_cell_line.id + raise SubmissionError(errors, e) + else: - differentiated_cell_line_id = self.create_differentiated_cell_line_entity(access_token, cell_line_entity_id, - dataset_id, - differentiated_cell_line, - submission_envelope_id) - update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, - differentiated_cell_line.biomaterial_id, - 'differentiated_cell_line.biomaterial_core.biomaterial_id') - return differentiated_cell_line_id - - def create_differentiated_cell_line_entity(self, access_token, cell_line_entity_id, dataset_id, - differentiated_cell_line, submission_envelope_id): + try: + differentiated_cell_line_id = self.create_differentiated_cell_line_entity(access_token, + cell_line_entity_id, + dataset_id, + differentiated_cell_line, + submission_envelope_id) + update_dataframe(differentiated_cell_lines_df, differentiated_cell_line_id, + differentiated_cell_line.biomaterial_id, + 'differentiated_cell_line.biomaterial_core.biomaterial_id') + return differentiated_cell_line_id + except Exception as e: + errors.append(f"Failed to create differentiated cell line: {differentiated_cell_line.biomaterial_id}") + raise SubmissionError(errors, e) + + def create_differentiated_cell_line_entity(self, + access_token, + cell_line_entity_id, + dataset_id, + differentiated_cell_line, + submission_envelope_id): """ Creates a Differentiated Cell Line entity and links it to the submission envelope. @@ -401,8 +445,14 @@ def create_differentiated_cell_line_entity(self, access_token, cell_line_entity_ return differentiated_entity_id - def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_entity_id, differentiated_entity_id, - dataset_id, submission_envelope_id, action): + def link_cell_line_and_differentiated_cell_line(self, + access_token, + cell_line_entity_id, + differentiated_entity_id, + dataset_id, + submission_envelope_id, + action, + errors): """ Creates and links the differentiation process between the original cell line and the differentiated cell line. @@ -425,40 +475,52 @@ def link_cell_line_and_differentiated_cell_line(self, access_token, cell_line_en The ID of the differentiation process entity created. """ if action.lower() != 'modify': - print(f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " - f"to link them") - - # Create a differentiation process entity - differentiation_process_entity_id = self.create_process( - access_token, - dataset_id, - get_process_content('differentiation'), - submission_envelope_id - ) - - print( - f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") - - # Link the cell line entity as input to the differentiation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", - differentiation_process_entity_id, 'processes', access_token - ) - - print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") - - # Link the differentiated cell line entity as derived by the differentiation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", - differentiation_process_entity_id, 'processes', access_token - ) - - return differentiation_process_entity_id - - def handle_library_preparation(self, differentiated_entity_id, library_preparation, - library_preparations_df, submission_envelope_id, - dataset_id, access_token, action, errors): + try: + print( + f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " + f"to link them") + + # Create a differentiation process entity + differentiation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('differentiation'), + submission_envelope_id + ) + + print( + f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") + + # Link the cell line entity as input to the differentiation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " + f"as derived by process : {differentiation_process_entity_id}") + + # Link the differentiated cell line entity as derived by the differentiation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + differentiation_process_entity_id, 'processes', access_token + ) + + return differentiation_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Cell line {cell_line_entity_id} and Differentiated cell line {differentiated_entity_id}") + raise SubmissionError(errors, e) + + def handle_library_preparation(self, + differentiated_entity_id, + library_preparation, + library_preparations_df, + submission_envelope_id, + dataset_id, + access_token, + action, + errors): """ Handles a single library preparation associated with a given differentiated cell line. @@ -472,33 +534,46 @@ def handle_library_preparation(self, differentiated_entity_id, library_preparati - access_token: Access token for authentication and authorization. """ if action.lower() == 'modify': - success = self.patch_entity('biomaterial', library_preparation.id, - library_preparation.to_dict(), - access_token) - if success: - print(f"Updated library preparation biomaterial: {library_preparation.id} / " - f"{library_preparation.biomaterial_id}") - - update_dataframe(library_preparations_df, library_preparation.id, - library_preparation.biomaterial_id, - 'library_preparation.biomaterial_core.biomaterial_id') - else: + try: + success = self.patch_entity('biomaterial', library_preparation.id, + library_preparation.to_dict(), + access_token) + if success: + print(f"Updated library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") + + update_dataframe(library_preparations_df, library_preparation.id, + library_preparation.biomaterial_id, + 'library_preparation.biomaterial_core.biomaterial_id') + return library_preparation.id + else: + errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " + f"{library_preparation.biomaterial_id}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update library preparation biomaterial: {library_preparation.id} / " f"{library_preparation.biomaterial_id}") - - return library_preparation.id + raise SubmissionError(errors, e) else: - library_preparation_entity_id = self.create_library_preparation_entity(access_token, dataset_id, - differentiated_entity_id, - library_preparation, - submission_envelope_id) - update_dataframe(library_preparations_df, library_preparation_entity_id, - library_preparation.biomaterial_id, - 'library_preparation.biomaterial_core.biomaterial_id') + try: + library_preparation_entity_id = self.create_library_preparation_entity(access_token, dataset_id, + differentiated_entity_id, + library_preparation, + submission_envelope_id) + update_dataframe(library_preparations_df, library_preparation_entity_id, + library_preparation.biomaterial_id, + 'library_preparation.biomaterial_core.biomaterial_id') - return library_preparation_entity_id + return library_preparation_entity_id + except Exception as e: + errors.append(f"Failed to create library preparation biomaterial: {library_preparation.biomaterial_id}") + raise SubmissionError(errors, e) - def create_library_preparation_entity(self, access_token, dataset_id, differentiated_entity_id, library_preparation, + def create_library_preparation_entity(self, + access_token, + dataset_id, + differentiated_entity_id, + library_preparation, submission_envelope_id): """ Creates a Library Preparation entity for the Differentiated Cell Line and links it to the submission envelope and dataset. @@ -566,7 +641,8 @@ def link_differentiated_and_library_preparation(self, library_preparation_entity_id, dataset_id, submission_envelope_id, - action): + action, + errors): """ Links the Differentiated Cell Line to the Library Preparation through a library preparation process. @@ -589,38 +665,45 @@ def link_differentiated_and_library_preparation(self, The ID of the library preparation process entity created. """ if action.lower() != 'modify': - print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " - f"preparation process to link them") - - # Create a library preparation process entity - library_preparation_process_entity_id = self.create_process( - access_token, - dataset_id, - get_process_content('library_preparation'), - submission_envelope_id - ) - - print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " - f"preparation process") - - # Link the differentiated cell line entity as input to the library preparation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " - f"preparation process") - - # Link the library preparation entity as derived by the library preparation process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", - library_preparation_process_entity_id, 'processes', access_token - ) - - return library_preparation_process_entity_id + try: + print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " + f"preparation process to link them") + + # Create a library preparation process entity + library_preparation_process_entity_id = self.create_process( + access_token, + dataset_id, + get_process_content('library_preparation'), + submission_envelope_id + ) + + print( + f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " + f"preparation process") + + # Link the differentiated cell line entity as input to the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " + f"preparation process") + + # Link the library preparation entity as derived by the library preparation process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + library_preparation_process_entity_id, 'processes', access_token + ) + + return library_preparation_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Differentiated Cell line " + f"{differentiated_entity_id} and Library preparation" + f" {library_preparation_entity_id}") + raise SubmissionError(errors, e) def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, sequencing_file_df, submission_envelope_id, dataset_id, @@ -637,32 +720,41 @@ def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, - access_token: Access token for authentication and authorization. """ if action.lower() == 'modify': - success = self.patch_entity('file', sequencing_file.id, - sequencing_file.to_dict(), - access_token) - - if success: - print(f"Updated sequencing file: {sequencing_file.id} / " - f"{sequencing_file.file_name}") - - update_dataframe(sequencing_file_df, sequencing_file.id, - sequencing_file.file_name, - 'sequence_file.file_core.file_name') - else: + try: + success = self.patch_entity('file', sequencing_file.id, + sequencing_file.to_dict(), + access_token) + + if success: + print(f"Updated sequencing file: {sequencing_file.id} / " + f"{sequencing_file.file_name}") + + update_dataframe(sequencing_file_df, sequencing_file.id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') + return sequencing_file.id + else: + errors.append( + f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") + raise SubmissionError(errors) + except Exception as e: errors.append(f"Failed to update sequencing file: {sequencing_file.id} / {sequencing_file.file_name}") - - return sequencing_file.id + raise SubmissionError(errors, e) else: - sequencing_file_entity_id = self.create_sequencing_file_entity(access_token, - dataset_id, - library_preparation_entity_id, - sequencing_file, - submission_envelope_id) - update_dataframe(sequencing_file_df, sequencing_file_entity_id, - sequencing_file.file_name, - 'sequence_file.file_core.file_name') + try: + sequencing_file_entity_id = self.create_sequencing_file_entity(access_token, + dataset_id, + library_preparation_entity_id, + sequencing_file, + submission_envelope_id) + update_dataframe(sequencing_file_df, sequencing_file_entity_id, + sequencing_file.file_name, + 'sequence_file.file_core.file_name') - return sequencing_file_entity_id + return sequencing_file_entity_id + except Exception as e: + errors.append(f"Failed to create Sequencing file: {sequencing_file.file_name}") + raise SubmissionError(errors, e) def create_sequencing_file_entity(self, access_token, dataset_id, library_preparation_entity_id, sequencing_file, submission_envelope_id): @@ -711,7 +803,8 @@ def link_library_preparation_and_sequencing_file(self, sequencing_file_entity_id, dataset_id, submission_envelope_id, - action): + action, + errors): """ Links the Library Preparation to the Sequencing File through a sequencing process. @@ -734,34 +827,41 @@ def link_library_preparation_and_sequencing_file(self, The ID of the sequencing process entity created. """ if action.lower() != 'modify': - print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." - f"Creating sequencing process to link the sequencing file") - - # Create a sequencing process entity - sequencing_process_entity_id = self.create_process(access_token, - dataset_id, - get_process_content('sequencing'), - submission_envelope_id) - - print( - f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") - - # Link the library preparation entity as input to the sequencing process - self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", - sequencing_process_entity_id, 'processes', access_token - ) - - print( - f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") - - # Link the sequencing file entity as derived by the sequencing process - self.perform_hal_linkage( - f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", - sequencing_process_entity_id, 'processes', access_token - ) - - return sequencing_process_entity_id + try: + print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." + f"Creating sequencing process to link the sequencing file") + + # Create a sequencing process entity + sequencing_process_entity_id = self.create_process(access_token, + dataset_id, + get_process_content('sequencing'), + submission_envelope_id) + + print( + f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") + + # Link the library preparation entity as input to the sequencing process + self.perform_hal_linkage( + f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + print( + f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") + + # Link the sequencing file entity as derived by the sequencing process + self.perform_hal_linkage( + f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", + sequencing_process_entity_id, 'processes', access_token + ) + + return sequencing_process_entity_id + except Exception as e: + errors.append( + f"Failed to update relations between Library Preparation " + f"{library_preparation_entity_id} and Sequencing file" + f" {sequencing_file_entity_id}") + raise SubmissionError(errors, e) def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): process_entity_id = self.use_existing_envelope_and_submit_entity( @@ -818,7 +918,8 @@ def establish_links(self, differentiated_cell_line.id, dataset_id, submission_envelope_id, - action) + action, + errors) for differentiated_cell_line in differentiated_cell_lines: for library_preparation in library_preparations: if differentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: @@ -828,7 +929,8 @@ def establish_links(self, library_preparation.id, dataset_id, submission_envelope_id, - action) + action, + errors) for library_preparation in library_preparations: for sequencing_file in sequencing_files: @@ -838,18 +940,19 @@ def establish_links(self, sequencing_file.id, dataset_id, submission_envelope_id, - action) + action, + errors) message = 'SUCCESS' except Exception as e: message = f"An error occurred: {str(e)}" errors.append(message) - traceback.print_exc() + raise SubmissionError(message, e) # Set DataFrames to None in case of an error - cell_lines_df = None - differentiated_cell_lines_df = None - library_preparations_df = None - sequencing_files_df = None + # cell_lines_df = None + # differentiated_cell_lines_df = None + # library_preparations_df = None + # sequencing_files_df = None return ([cell_lines_df, differentiated_cell_lines_df, diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index aaf8f7e..9079a4e 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -13,11 +13,14 @@ from ait.commons.util.provider_api_util import APIProvider from ait.commons.util.spreadsheet_util import SpreadsheetSubmitter, ValidationError, \ merge_library_preparation_sequencing_file, merge_cell_line_and_differentiated_cell_line, \ - merge_differentiated_cell_line_and_library_preparation + merge_differentiated_cell_line_and_library_preparation, SubmissionError # Define a class for handling submission of a command file -def validate_sequencing_files(sequencing_files, list_of_files_in_upload_area, dataset, errors): +def validate_sequencing_files(sequencing_files, + list_of_files_in_upload_area, + dataset, + errors): for sequencing_file in sequencing_files: match_found = False # Flag to indicate if a match is found @@ -37,7 +40,9 @@ def get_content(unique_value): return {"content": unique_value} -def _create_expression_alterations(submission_instance, submission_envelope_id, access_token, +def _create_expression_alterations(submission_instance, + submission_envelope_id, + access_token, expression_alterations, expression_alterations_df): expression_alterations_entity_id_column_name = "Id" @@ -64,7 +69,7 @@ def _create_expression_alterations(submission_instance, submission_envelope_id, expression_alterations_entity_id_column_name ] = expression_alteration_id - return expression_alterations, expression_alterations_df + return expression_alterations class CmdSubmitFile: @@ -136,16 +141,16 @@ def run(self): self._process_submission(submission_instance, list_of_files_in_upload_area) return True, "SUBMISSION IS SUCCESSFUL." except Exception as e: - return self.delete_actions(self.submission_envelope_id, submission_instance, e) + return self._delete_actions(self.submission_envelope_id, submission_instance, e) except KeyboardInterrupt: # Handle the interruption and exit gracefully print("\nProcess interrupted by user. Exiting gracefully...") - self.delete_actions(self.submission_envelope_id, submission_instance, None) + self._delete_actions(self.submission_envelope_id, submission_instance, None) sys.exit(0) # Exit with a zero status code indicating a clean exit except Exception as e: # Handle any other unexpected exceptions print(f"An unexpected error occurred: {str(e)}") - self.delete_actions(self.submission_envelope_id, submission_instance, None) + self._delete_actions(self.submission_envelope_id, submission_instance, None) sys.exit(1) # Exit with a non-zero status code indicating an error def _is_delete_action(self): @@ -164,137 +169,177 @@ def _list_files_in_upload_area(self): return list_instance.list_bucket_contents_and_return(self.dataset, '') def _process_submission(self, submission_instance, list_of_files_in_upload_area): - """Process the file submission.""" - parser = SpreadsheetSubmitter(self.file) - parsed_data = self._parse_spreadsheet(parser) - self._validate_and_upload(parsed_data, list_of_files_in_upload_area) - expression_alterations = parsed_data['expression_alterations'] - expression_alterations_df = parsed_data['expression_alterations_df'] - parent_cell_line_name = parsed_data['parent_cell_line_name'] - cell_lines = parsed_data['cell_lines'] - cell_lines_df = parsed_data['cell_lines_df'] - differentiated_cell_lines = parsed_data['differentiated_cell_lines'] - differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] - library_preparations = parsed_data['library_preparations'] - library_preparations_df = parsed_data['library_preparations_df'] - sequencing_files = parsed_data['sequencing_files'] - sequencing_files_df = parsed_data['sequencing_files_df'] - - # TODO: Handle expression alterations in MODIFY - created_expression_alterations = [] - - if self._is_add_action(): - self._create_submission_envelope() - - parent_cell_line_id = None - - if parent_cell_line_name is not None: - print(f"Creating parental cell line with name {parent_cell_line_name}") - parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) - # TODO: link parental cell line with dataset - - print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") - - if expression_alterations and expression_alterations_df is not None: - created_expression_alterations, expression_alteration_df = self._submit_expression_alterations( - submission_instance, expression_alterations, expression_alterations_df) + try: + """Process the file submission.""" + parser = SpreadsheetSubmitter(self.file) + parsed_data = self._parse_spreadsheet(parser) + self._validate_and_upload(parsed_data, list_of_files_in_upload_area) + + # Extract parsed data + expression_alterations = parsed_data['expression_alterations'] + expression_alterations_df = parsed_data['expression_alterations_df'] + parent_cell_line_name = parsed_data['parent_cell_line_name'] + cell_lines = parsed_data['cell_lines'] + cell_lines_df = parsed_data['cell_lines_df'] + differentiated_cell_lines = parsed_data['differentiated_cell_lines'] + differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] + library_preparations = parsed_data['library_preparations'] + library_preparations_df = parsed_data['library_preparations_df'] + sequencing_files = parsed_data['sequencing_files'] + sequencing_files_df = parsed_data['sequencing_files_df'] + + # Initialize lists for created entities + created_expression_alterations = [] + created_cell_lines = [] + created_differentiated_cell_lines = [] + created_library_preparations = [] + created_sequencing_files = [] - if created_expression_alterations and parent_cell_line_id is not None: - self.link_parent_cell_line_expression_alteration( - submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + if self._is_add_action(): + self._create_submission_envelope() + parent_cell_line_id = self._handle_parent_cell_line(submission_instance, parent_cell_line_name) + created_expression_alterations = self._handle_expression_alterations( + submission_instance, expression_alterations, expression_alterations_df, parent_cell_line_id ) - created_cell_lines = [] - - if cell_lines and cell_lines_df is not None: - created_cell_lines, cell_lines_df = self._create_cell_lines( - submission_instance, cell_lines, cell_lines_df, created_expression_alterations) - - created_differentiated_cell_lines = [] - - if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines, differentiated_cell_lines_df = self._create_differentiated_cell_lines( - submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) - - created_library_preparations = [] - - if library_preparations and library_preparations_df is not None: - created_library_preparations, library_preparations_df = self._create_library_preparations( - submission_instance, library_preparations, library_preparations_df) - - created_sequencing_files = [] - - if sequencing_files and sequencing_files_df is not None: - created_sequencing_files, sequencing_files_df = self._create_sequencing_files( - submission_instance, sequencing_files, sequencing_files_df) - - updated_dfs, message = self._establish_links(submission_instance, - created_cell_lines, - cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, - created_library_preparations, - library_preparations_df, - created_sequencing_files, - sequencing_files_df) - - if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) - else: - return self.delete_actions(self.submission_envelope_id, submission_instance, None) + if cell_lines and cell_lines_df is not None: + created_cell_lines = self._create_cell_lines( + submission_instance, cell_lines, cell_lines_df, created_expression_alterations) + + if differentiated_cell_lines and differentiated_cell_lines_df is not None: + created_differentiated_cell_lines = self._create_differentiated_cell_lines( + submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + + if library_preparations and library_preparations_df is not None: + created_library_preparations = self._create_library_preparations( + submission_instance, library_preparations, library_preparations_df) + + if sequencing_files and sequencing_files_df is not None: + created_sequencing_files = self._create_sequencing_files( + submission_instance, sequencing_files, sequencing_files_df) + + updated_dfs, message = self._establish_links(submission_instance, + created_cell_lines, + cell_lines_df, + created_differentiated_cell_lines, + differentiated_cell_lines_df, + created_library_preparations, + library_preparations_df, + created_sequencing_files, + sequencing_files_df) + + if message == 'SUCCESS': + self._save_and_upload_results(updated_dfs, expression_alterations_df) + else: + return self._delete_actions(self.submission_envelope_id, submission_instance, None) + except ValidationError as e: + print(f"Validation Error: {e.errors}") + self._delete_actions(self.submission_envelope_id, submission_instance, e) + sys.exit(1) + except SubmissionError as e: + print(f"Submission Error: {e.errors}") + self._delete_actions(self.submission_envelope_id, submission_instance, e) + sys.exit(1) + except Exception as e: + print(f"An unexpected error occurred during submission processing: {e}") + self._delete_actions(self.submission_envelope_id, submission_instance, e) + raise e # Re-raise the exception to propagate it upwards + + def _handle_parent_cell_line(self, submission_instance, parent_cell_line_name): + """Handles the creation of a parent cell line.""" + parent_cell_line_id = None + if parent_cell_line_name: + print(f"Creating parental cell line with name {parent_cell_line_name}") + parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) + print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + return parent_cell_line_id + + def _handle_expression_alterations(self, + submission_instance, + expression_alterations, + expression_alterations_df, + parent_cell_line_id): + """Handles the creation of expression alterations and links them to the parent cell line if needed.""" + created_expression_alterations = [] + if expression_alterations and expression_alterations_df is not None: + created_expression_alterations = self._submit_expression_alterations( + submission_instance, expression_alterations, expression_alterations_df + ) + if created_expression_alterations and parent_cell_line_id: + self._link_parent_cell_line_expression_alteration( + submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + ) + return created_expression_alterations def _parse_spreadsheet(self, parser): try: + # Determine the necessary sheet names tab_names = parser.list_sheets() - cell_line_sheet_name = None - differentiated_cell_line_sheet_name = None + cell_line_sheet_name = next( + (name for name in ["Cell line", "Clonal cell line"] if name in tab_names), None + ) + differentiated_cell_line_sheet_name = next( + (name for name in ["Differentiated cell line", "Differentiated product"] if name in tab_names), None + ) + undifferentiated_cell_line_sheet_name = ( + "Undifferentiated product" if "Undifferentiated product" in tab_names else None + ) - if "Cell line" in tab_names: - cell_line_sheet_name = "Cell line" - elif "Clonal cell line" in tab_names: - cell_line_sheet_name = "Clonal cell line" - else: - self.validation_errors.append("Spreadsheet must contain a " - "'Cell line' or 'Clonal cell line' sheet.") + undifferentiated_cell_lines = [] + undifferentiated_cell_lines_df = None - if "Differentiated cell line" in tab_names: - differentiated_cell_line_sheet_name = "Differentiated cell line" - # elif "Undifferentiated product" in tab_names: - # differentiated_cell_line_sheet_name = "Undifferentiated product" - elif "Differentiated product" in tab_names: - differentiated_cell_line_sheet_name = "Differentiated product" - else: + # Validate the presence of required sheets + if not cell_line_sheet_name: self.validation_errors.append("Spreadsheet must contain a " - "'Differentiated cell line' or 'Undifferentiated product' " - "or 'Differentiated product' sheet.") + "'Cell line' or 'Clonal cell line' sheet.") + if not (differentiated_cell_line_sheet_name or undifferentiated_cell_line_sheet_name): + self.validation_errors.append( + "Spreadsheet must contain a " + "'Differentiated cell line', 'Undifferentiated product', " + "or 'Differentiated product' sheet." + ) - """Parse the spreadsheet into different sections.""" + # Parse different sections of the spreadsheet expression_alterations, expression_alterations_df = parser.get_expression_alterations( 'Expression alteration strategy', self.action, self.validation_errors ) - cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( cell_line_sheet_name, self.action, self.validation_errors ) - differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( differentiated_cell_line_sheet_name, self.action, self.validation_errors ) - merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) + if undifferentiated_cell_line_sheet_name: + undifferentiated_cell_lines, undifferentiated_cell_lines_df = parser.get_undifferentiated_cell_lines( + undifferentiated_cell_line_sheet_name, self.action, self.validation_errors + ) + + # Check for errors and merge data + if differentiated_cell_lines and undifferentiated_cell_lines: + self.validation_errors.append( + "A spreadsheet cannot contain rows in both differentiated and undifferentiated cell lines/products" + ) + + if differentiated_cell_lines: + merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, + self.validation_errors) + if undifferentiated_cell_lines: + merge_cell_line_and_differentiated_cell_line(cell_lines, undifferentiated_cell_lines, + self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( 'Library preparation', self.action, self.validation_errors ) - - merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, - self.validation_errors) + merge_differentiated_cell_line_and_library_preparation( + differentiated_cell_lines, library_preparations, self.validation_errors + ) sequencing_files, sequencing_files_df = parser.get_sequencing_files( 'Sequence file', self.action, self.validation_errors ) - merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) + # Return the parsed data as a dictionary return { "expression_alterations": expression_alterations, "expression_alterations_df": expression_alterations_df, @@ -303,6 +348,8 @@ def _parse_spreadsheet(self, parser): "parent_cell_line_name": parent_cell_line_name, "differentiated_cell_lines": differentiated_cell_lines, "differentiated_cell_lines_df": differentiated_cell_lines_df, + "undifferentiated_cell_lines": undifferentiated_cell_lines, + "undifferentiated_cell_lines_df": undifferentiated_cell_lines_df, "library_preparations": library_preparations, "library_preparations_df": library_preparations_df, "sequencing_files": sequencing_files, @@ -342,7 +389,7 @@ def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): sys.exit(1) else: # Print the error message - print(e) + print(f"Validation Error: {e.errors}") # Exit the program with a non-zero status code to indicate an error sys.exit(1) @@ -369,7 +416,7 @@ def _create_submission_envelope(self): self.submission_envelope_id = get_id_from_url(submission_envelope_response['_links']['self']['href']) print(f"Submission envelope for this submission is: {self.submission_envelope_id}") else: - raise Exception(f"Failed to create submission envelope. Status code: {status_code}") + raise SubmissionError(f"Failed to create submission envelope. Status code: {status_code}") def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): """Submit the parent cell line.""" @@ -378,14 +425,21 @@ def _submit_parent_cell_line(self, submission_instance, parent_cell_line_name): self.submission_envelope_id, self.access_token ) - def _submit_expression_alterations(self, submission_instance, expression_alterations, expression_alterations_df): + def _submit_expression_alterations(self, + submission_instance, + expression_alterations, + expression_alterations_df): """Submit expression alterations.""" return _create_expression_alterations( submission_instance, self.submission_envelope_id, self.access_token, expression_alterations, expression_alterations_df ) - def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df, expression_alterations): + def _create_cell_lines(self, + submission_instance, + cell_lines, + cell_lines_df, + expression_alterations): for cell_line in cell_lines: cell_line_entity_id = submission_instance.handle_cell_line(cell_line, expression_alterations, cell_lines_df, self.submission_envelope_id, self.dataset, @@ -393,9 +447,11 @@ def _create_cell_lines(self, submission_instance, cell_lines, cell_lines_df, exp self.submission_errors) cell_line.id = cell_line_entity_id - return cell_lines, cell_lines_df + return cell_lines - def _create_differentiated_cell_lines(self, submission_instance, differentiated_cell_lines, + def _create_differentiated_cell_lines(self, + submission_instance, + differentiated_cell_lines, differentiated_cell_lines_df): for differentiated_cell_line in differentiated_cell_lines: differentiated_cell_line_entity_id = submission_instance.handle_differentiated_cell_line(None, @@ -408,9 +464,11 @@ def _create_differentiated_cell_lines(self, submission_instance, differentiated_ self.submission_errors) differentiated_cell_line.id = differentiated_cell_line_entity_id - return differentiated_cell_lines, differentiated_cell_lines_df + return differentiated_cell_lines - def _create_library_preparations(self, submission_instance, library_preparations, + def _create_library_preparations(self, + submission_instance, + library_preparations, library_preparations_df): for library_preparation in library_preparations: library_preparation_entity_id = submission_instance.handle_library_preparation(None, @@ -423,9 +481,11 @@ def _create_library_preparations(self, submission_instance, library_preparations self.submission_errors) library_preparation.id = library_preparation_entity_id - return library_preparations, library_preparations_df + return library_preparations - def _create_sequencing_files(self, submission_instance, sequencing_files, + def _create_sequencing_files(self, + submission_instance, + sequencing_files, sequencing_files_df): for sequencing_file in sequencing_files: sequencing_file_entity_id = submission_instance.handle_sequencing_file(None, @@ -438,7 +498,7 @@ def _create_sequencing_files(self, submission_instance, sequencing_files, self.submission_errors) sequencing_file.id = sequencing_file_entity_id - return sequencing_files, sequencing_files_df + return sequencing_files def _establish_links(self, submission_instance, @@ -497,7 +557,7 @@ def _save_and_upload_results(self, updated_dfs, expression_alteration_df): except Exception as e: print(f"Failed to upload file {output_file}. Error: {e}, Refer dataset {self.dataset} for tracing metadata") - def delete_actions(self, submission_envelope_id, submission_instance, error=None): + def _delete_actions(self, submission_envelope_id, submission_instance, error=None): """Handle actions needed when a submission fails.""" try: if self._is_add_action(): @@ -533,10 +593,11 @@ def _handle_modify_action_failure(self, error): else: return False, "Submission has failed, rolled back" - def link_parent_cell_line_expression_alteration(self, submission_instance, - access_token, - parent_cell_line_id, - created_expression_alterations): + def _link_parent_cell_line_expression_alteration(self, + submission_instance, + access_token, + parent_cell_line_id, + created_expression_alterations): for expression_alteration in created_expression_alterations: print(f"Linking parent cell line {parent_cell_line_id} " f"as input to process of {expression_alteration.expression_alteration_id}") diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 28dad58..23e4595 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -1,3 +1,5 @@ +import traceback + import pandas as pd import json import numpy as np @@ -20,10 +22,35 @@ def add_error(self, missing_type, entity_type, missing_id, errors): class ValidationError(Exception): def __init__(self, errors): self.errors = errors - super().__init__("Validation errors occurred") + super().__init__(self._format_message()) + + def _format_message(self): + # This method formats the error message that will be displayed when the exception is raised. + return "Validation errors occurred:\n" + "\n".join(self.errors) - def __str__(self): - return "\n".join(self.errors) + +class SubmissionError(Exception): + """ + Exception raised for errors during submission. + Includes a list of errors and an optional underlying exception. + """ + + def __init__(self, errors, original_exception=None): + self.errors = errors + self.original_exception = original_exception # Store the original exception + super().__init__(self._format_message()) + + def _format_message(self): + """ + Format the error message to include both the list of submission errors and details of the original exception. + """ + message = "Submission errors occurred:\n" + "\n".join(self.errors) + if self.original_exception: + message += "\n\nOriginal Exception Details:\n" + message += f"Type: {type(self.original_exception).__name__}\n" + message += f"Message: {str(self.original_exception)}\n" + message += "Stack Trace:\n" + "".join(traceback.format_tb(self.original_exception.__traceback__)) + return message """ @@ -37,8 +64,16 @@ def __init__(self, type, id): class CellLine: - def __init__(self, biomaterial_id, description, derived_from_accession, - clone_id, protocol_id, zygosity, cell_type, expression_alteration_id, id): + def __init__(self, + biomaterial_id, + description, + derived_from_accession, + clone_id, + protocol_id, + zygosity, + cell_type, + expression_alteration_id, + id): self.biomaterial_id = biomaterial_id self.description = description self.derived_from_accession = derived_from_accession @@ -81,9 +116,19 @@ def to_dict(self): class ExpressionAlterationStrategy: - def __init__(self, expression_alteration_id, protocol_id, allele_specific, altered_gene_symbols, altered_gene_ids, - targeted_genomic_region, expected_alteration_type, sgrna_target, - protocol_method_text, altered_locus, guide_sequence, id): + def __init__(self, + expression_alteration_id, + protocol_id, + allele_specific, + altered_gene_symbols, + altered_gene_ids, + targeted_genomic_region, + expected_alteration_type, + sgrna_target, + protocol_method_text, + altered_locus, + guide_sequence, + id): self.expression_alteration_id = expression_alteration_id self.protocol_id = protocol_id self.allele_specific = allele_specific @@ -120,8 +165,16 @@ def to_dict(self): class DifferentiatedCellLine: - def __init__(self, biomaterial_id, description, input_biomaterial_id, protocol_id, timepoint_value, timepoint_unit, - terminally_differentiated, model_system, id): + def __init__(self, + biomaterial_id, + description, + input_biomaterial_id, + protocol_id, + timepoint_value, + timepoint_unit, + terminally_differentiated, + model_system, + id): self.biomaterial_id = biomaterial_id self.description = description self.input_biomaterial_id = input_biomaterial_id @@ -162,10 +215,21 @@ def to_dict(self): class LibraryPreparation: - def __init__(self, biomaterial_id, protocol_id, dissociation_protocol_id, differentiated_biomaterial_id, - average_fragment_size, input_amount_value, input_amount_unit, - final_yield_value, final_yield_unit, concentration_value, concentration_unit, - pcr_cycles, pcr_cycles_for_sample_index, id): + def __init__(self, + biomaterial_id, + protocol_id, + dissociation_protocol_id, + differentiated_biomaterial_id, + average_fragment_size, + input_amount_value, + input_amount_unit, + final_yield_value, + final_yield_unit, + concentration_value, + concentration_unit, + pcr_cycles, + pcr_cycles_for_sample_index, + id): self.biomaterial_id = biomaterial_id self.protocol_id = protocol_id self.dissociation_protocol_id = dissociation_protocol_id @@ -226,8 +290,17 @@ class EntityType: class SequencingFile: - def __init__(self, file_name, extension, read_index, lane_index=None, read_length=None, checksum=None, - library_preparation_id=None, sequencing_protocol_id=None, run_id=None, id=None): + def __init__(self, + file_name, + extension, + read_index, + lane_index=None, + read_length=None, + checksum=None, + library_preparation_id=None, + sequencing_protocol_id=None, + run_id=None, + id=None): self.file_name = file_name self.extension = extension self.read_index = read_index @@ -272,8 +345,13 @@ def convert_to_valid_json_value(value): } -def find_orphans(source_entities, target_entities, - source_attr, target_attr, source_type, target_type, errors): +def find_orphans(source_entities, + target_entities, + source_attr, + target_attr, + source_type, + target_type, + errors): """ Validates that each source entity has a corresponding target entity. @@ -303,7 +381,9 @@ def find_orphans(source_entities, target_entities, # print(f"VALIDATED: All {source_type.lower()}s have corresponding {target_type.lower()}s.") -def merge_library_preparation_sequencing_file(library_preparations, sequencing_files, errors): +def merge_library_preparation_sequencing_file(library_preparations, + sequencing_files, + errors): """ Merges library preparations and sequencing files based on their IDs. @@ -349,7 +429,8 @@ def merge_library_preparation_sequencing_file(library_preparations, sequencing_f library_preparation.add_sequencing_file(sequencing_file) -def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, library_preparations, +def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_lines, + library_preparations, errors): """ Merges differentiated cell lines and library preparations based on their biomaterial IDs. @@ -399,7 +480,8 @@ def merge_differentiated_cell_line_and_library_preparation(differentiated_cell_l def merge_cell_line_and_differentiated_cell_line(cell_lines, - differentiated_cell_lines, errors): + differentiated_cell_lines, + errors): """ Merges cell lines and differentiated cell lines based on their biomaterial IDs. @@ -537,11 +619,14 @@ def input_file_to_data_frames(self, sheet_name, action): df = pd.read_excel(self.file_path, sheet_name=sheet_names[trimmed_sheet_name], engine='openpyxl', skiprows=skip_rows) else: - raise ValueError(f"Sheet '{sheet_name}' not found in the spreadsheet.") + raise ValidationError(f"Sheet '{sheet_name}' not found in the spreadsheet.") return df - def parse_cell_lines(self, sheet_name, action, errors): + def parse_cell_lines(self, + sheet_name, + action, + errors): """ Parses data related to cell lines from a specified sheet in the Excel file. @@ -627,7 +712,10 @@ def parse_cell_lines(self, sheet_name, action, errors): return cell_lines, df_filtered, parent_cell_line_names[0] - def parse_differentiated_cell_lines(self, sheet_name, action, errors): + def parse_differentiated_cell_lines(self, + sheet_name, + action, + errors): """ Parses data related to differentiated cell lines from a specified sheet in the Excel file. @@ -651,8 +739,8 @@ def parse_differentiated_cell_lines(self, sheet_name, action, errors): # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: - errors.append("The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " - "exist.") + errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " + f"exist in {sheet_name}.") return [], df # Filter rows where biomaterial_id is not null @@ -705,7 +793,91 @@ def parse_differentiated_cell_lines(self, sheet_name, action, errors): return differentiated_cell_lines, df_filtered - def parse_library_preparations(self, sheet_name, action, errors): + def parse_undifferentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Parses data related to differentiated cell lines from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. + """ + df = self.input_file_to_data_frames(sheet_name=sheet_name, action=action) + df.columns = df.columns.str.strip() + # df = df.rename(columns=column_mapping) + # Remove unnamed columns (columns without headers) + # df = df.loc[:, ~df.columns.str.startswith('Unnamed')] + + # Check if the required column exists + if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: + errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " + f"exist in {sheet_name}.") + return [], df + + # Filter rows where biomaterial_id is not null + df = df[df['differentiated_cell_line.biomaterial_core.biomaterial_id'].notna()] + df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) + # Define columns to check for values starting with 'ABC' or 'XYZ' + cols_to_check = ['differentiated_cell_line.biomaterial_core.biomaterial_id'] + # Create a mask to filter rows where any of the specified columns start with 'ABC' or 'XYZ' + mask = df[cols_to_check].apply(lambda x: ~x.astype(str).str.startswith( + ('FILL OUT INFORMATION BELOW THIS ROW', 'A unique ID for the biomaterial.', + 'differentiated_cell_line.biomaterial_core.biomaterial_id'))).all(axis=1) + # Apply the mask to filter out rows + df_filtered = df[mask] + # Check for mandatory fields and create Differentiated CellLine objects + undifferentiated_cell_lines = [] + + for _, row in df_filtered.iterrows(): + differentiated_biomaterial_id = row['differentiated_cell_line.biomaterial_core.biomaterial_id'] + biomaterial_id = row.get('cell_line.biomaterial_core.biomaterial_id') + + # Check if biomaterial_id is null + if pd.isnull(differentiated_biomaterial_id): + errors.append("Differentiated Cell line ID cannot be null in any row of the Differentiated Cell line " + "sheet.") + # raise MissingMandatoryFieldError("Differentiated Cell line ID cannot be null in any row.") + + # Check if derived_accession and cell_type are present + if pd.isnull(biomaterial_id): + errors.append(f"Input Cell line ID cannot be null for Differentiated Cell line: " + f"{differentiated_biomaterial_id}") + """ + raise MissingMandatoryFieldError( + "Input Cell line ID cannot be null. " + differentiated_biomaterial_id) + """ + + # Create DifferentiatedCellLine objects from filtered DataFrame rows + undifferentiated_cell_lines.append( + DifferentiatedCellLine( + biomaterial_id=differentiated_biomaterial_id, + description=row.get('differentiated_cell_line.biomaterial_core.biomaterial_description'), + input_biomaterial_id=biomaterial_id, + protocol_id=row.get('differentiation_protocol.protocol_core.protocol_id'), + timepoint_value=row.get('differentiated_cell_line.timepoint_value'), + timepoint_unit=row.get('differentiated_cell_line.timepoint_unit.text'), + terminally_differentiated=row.get('differentiated_cell_line.terminally_differentiated'), + model_system=row.get('differentiated_cell_line.model_organ.text'), + id=row.get('Id') + ) + ) + + return undifferentiated_cell_lines, df_filtered + + def parse_library_preparations(self, + sheet_name, + action, + errors): """ Parses data related to library preparations from a specified sheet in the Excel file. @@ -795,7 +967,10 @@ def parse_library_preparations(self, sheet_name, action, errors): return library_preparations, df_filtered - def parse_sequencing_files(self, sheet_name, action, errors): + def parse_sequencing_files(self, + sheet_name, + action, + errors): """ Parses data related to sequencing files from a specified sheet in the Excel file. @@ -884,7 +1059,10 @@ def parse_sequencing_files(self, sheet_name, action, errors): return sequencing_files, df_filtered - def parse_expression_alteration(self, sheet_name, action, errors): + def parse_expression_alteration(self, + sheet_name, + action, + errors): """ Parses data related to expression alterations from a specified sheet in the Excel file. @@ -965,7 +1143,10 @@ def parse_expression_alteration(self, sheet_name, action, errors): # Return the list of objects, the filtered DataFrame, and a flag indicating success return expression_alterations, df_filtered - def get_cell_lines(self, sheet_name, action, errors): + def get_cell_lines(self, + sheet_name, + action, + errors): """ Retrieves parsed cell lines data from a specified sheet in the Excel file. @@ -984,7 +1165,10 @@ def get_cell_lines(self, sheet_name, action, errors): cell_lines, cell_lines_df, parent_cell_line_name = self.parse_cell_lines(sheet_name, action, errors) return cell_lines, cell_lines_df, parent_cell_line_name - def get_differentiated_cell_lines(self, sheet_name, action, errors): + def get_differentiated_cell_lines(self, + sheet_name, + action, + errors): """ Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. @@ -1004,7 +1188,34 @@ def get_differentiated_cell_lines(self, sheet_name, action, errors): action, errors) return differentiated_cell_lines, differentiated_cell_lines_df - def get_library_preparations(self, sheet_name, action, errors): + def get_undifferentiated_cell_lines(self, + sheet_name, + action, + errors): + """ + Retrieves parsed differentiated cell lines data from a specified sheet in the Excel file. + + Parameters: + ----------- + sheet_name : str + The name of the sheet containing differentiated cell line data. + column_mapping : dict + A dictionary mapping column names in the sheet to expected attribute names. + + Returns: + -------- + list + A list of DifferentiatedCellLine objects parsed from the specified sheet. + """ + undifferentiated_cell_lines, undifferentiated_cell_lines_df = self.parse_undifferentiated_cell_lines(sheet_name, + action, + errors) + return undifferentiated_cell_lines, undifferentiated_cell_lines_df + + def get_library_preparations(self, + sheet_name, + action, + errors): """ Retrieves parsed library preparations data from a specified sheet in the Excel file. @@ -1024,7 +1235,10 @@ def get_library_preparations(self, sheet_name, action, errors): action, errors) return library_preparations, df_filtered - def get_sequencing_files(self, sheet_name, action, errors): + def get_sequencing_files(self, + sheet_name, + action, + errors): """ Retrieves parsed sequencing files data from a specified sheet in the Excel file. @@ -1043,6 +1257,9 @@ def get_sequencing_files(self, sheet_name, action, errors): sequencing_files, df_filtered = self.parse_sequencing_files(sheet_name, action, errors) return sequencing_files, df_filtered - def get_expression_alterations(self, sheet_name, action, errors): + def get_expression_alterations(self, + sheet_name, + action, + errors): expression_alterations, df_filtered = self.parse_expression_alteration(sheet_name, action, errors) return expression_alterations, df_filtered From f15c6f2a691849ce20b5a7e23b5052e4d2e29619 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 5 Sep 2024 14:51:51 +0100 Subject: [PATCH 5/8] better error handling --- ait/commons/util/command/submit_file.py | 7 ++++--- ait/commons/util/spreadsheet_util.py | 15 ++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 9079a4e..405d9e1 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -234,7 +234,7 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) return self._delete_actions(self.submission_envelope_id, submission_instance, None) except ValidationError as e: print(f"Validation Error: {e.errors}") - self._delete_actions(self.submission_envelope_id, submission_instance, e) + # self._delete_actions(self.submission_envelope_id, submission_instance, e) sys.exit(1) except SubmissionError as e: print(f"Submission Error: {e.errors}") @@ -389,9 +389,10 @@ def _validate_and_upload(self, parsed_data, list_of_files_in_upload_area): sys.exit(1) else: # Print the error message - print(f"Validation Error: {e.errors}") + # print(f"Validation Error: {e.errors}") # Exit the program with a non-zero status code to indicate an error - sys.exit(1) + # sys.exit(1) + raise ValidationError(self.validation_errors) print(f"File {self.file} is validated successfully. Initiating submission") print(f"File {self.file} being uploaded to storage") diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 23e4595..8c050a6 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -649,8 +649,8 @@ def parse_cell_lines(self, # Check if the required column exists if 'cell_line.biomaterial_core.biomaterial_id' not in df.columns: errors.append( - "The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in the Cell line sheet. " - "The rest of the file will not be processed") + f"The column 'cell_line.biomaterial_core.biomaterial_id' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null @@ -740,7 +740,7 @@ def parse_differentiated_cell_lines(self, # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " - f"exist in {sheet_name}.") + f"exist in {sheet_name} name. The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null @@ -821,7 +821,7 @@ def parse_undifferentiated_cell_lines(self, # Check if the required column exists if 'differentiated_cell_line.biomaterial_core.biomaterial_id' not in df.columns: errors.append(f"The column 'differentiated_cell_line.biomaterial_core.biomaterial_id' does not " - f"exist in {sheet_name}.") + f"exist in {sheet_name}. The rest of the file will not be processed") return [], df # Filter rows where biomaterial_id is not null @@ -906,7 +906,8 @@ def parse_library_preparations(self, for col in required_columns: if col not in df.columns: - errors.append(f"The column '{col}' does not exist in the Library Preparation sheet.") + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") return [], df @@ -1001,7 +1002,8 @@ def parse_sequencing_files(self, for col in required_columns: if col not in df.columns: - errors.append(f"The column '{col}' does not exist in the Sequencing File sheet.") + errors.append(f"The column '{col}' does not exist in the {sheet_name} sheet. " + f"The rest of the file will not be processed") return [], df @@ -1104,7 +1106,6 @@ def parse_expression_alteration(self, # Filter rows where 'expression_alteration_id' is not null df = df[df['expression_alteration_id'].notna()] - # Replace invalid float values (e.g., NaN, infinite) with None df = df.map(lambda x: None if isinstance(x, float) and (np.isnan(x) or not np.isfinite(x)) else x) From 24d946f1681d03f9cca9b8c6c6ee389196498367 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 6 Sep 2024 17:18:23 +0100 Subject: [PATCH 6/8] correct name for expression_alteration_id while object construction --- ait/commons/util/spreadsheet_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/spreadsheet_util.py b/ait/commons/util/spreadsheet_util.py index 8c050a6..ff9fec2 100644 --- a/ait/commons/util/spreadsheet_util.py +++ b/ait/commons/util/spreadsheet_util.py @@ -148,7 +148,7 @@ def __repr__(self): def to_dict(self): return { "content": { - "label": self.expression_alteration_id, + "expression_alteration_label": self.expression_alteration_id, "protocol_id": self.protocol_id, "allele_specific": self.allele_specific, "altered_gene_symbols": self.altered_gene_symbols, From ea8b62f7535a4e225b94238b4144890e0c04823c Mon Sep 17 00:00:00 2001 From: dgupta Date: Tue, 10 Sep 2024 15:55:35 +0100 Subject: [PATCH 7/8] increment version --- ait/commons/util/settings/morphic_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ait/commons/util/settings/morphic_util.py b/ait/commons/util/settings/morphic_util.py index 2655adf..382b391 100644 --- a/ait/commons/util/settings/morphic_util.py +++ b/ait/commons/util/settings/morphic_util.py @@ -1,7 +1,7 @@ from pathlib import Path NAME = 'morphic-util' -VERSION = '1.0.0' +VERSION = '1.0.1' DESC = 'CLI tool for submitting your analysis data and metadata' AUTHOR = 'dgupta' AUTHOR_EMAIL = 'dgupta@ebi.ac.uk' From 76ff747795e11adac8475f2eb0cad19581e120ea Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Mon, 30 Sep 2024 16:56:40 +0100 Subject: [PATCH 8/8] improvements --- ait/commons/util/aws_client.py | 1 - ait/commons/util/command/submit.py | 164 +++++++++++++----------- ait/commons/util/command/submit_file.py | 127 +++++++++++++----- 3 files changed, 189 insertions(+), 103 deletions(-) diff --git a/ait/commons/util/aws_client.py b/ait/commons/util/aws_client.py index 60deab1..dea6c66 100755 --- a/ait/commons/util/aws_client.py +++ b/ait/commons/util/aws_client.py @@ -1,7 +1,6 @@ import json import boto3 -import botocore from ait.commons.util.aws_cognito_authenticator import AwsCognitoAuthenticator from ait.commons.util.settings import AWS_SECRET_NAME_AK_BUCKET, AWS_SECRET_NAME_SK_BUCKET, \ diff --git a/ait/commons/util/command/submit.py b/ait/commons/util/command/submit.py index 6d0b5c5..ad17000 100644 --- a/ait/commons/util/command/submit.py +++ b/ait/commons/util/command/submit.py @@ -18,7 +18,7 @@ def matching_expression_alteration_and_cell_line(cell_line, expression_alteratio " ", "").strip() -def get_id_from_url(url): +def get_entity_id_from_hal_link(url): """ Extracts and returns the ID from a given URL. @@ -136,7 +136,7 @@ def post_to_provider_api_and_get_entity_id(url, data, access_token): response_data = response.json() entity_url = response_data['_links']['self']['href'] - return get_id_from_url(entity_url) + return get_entity_id_from_hal_link(entity_url) def post_to_provider_api(url, data_type_in_hal_link, data, access_token): @@ -169,9 +169,9 @@ class CmdSubmit: A class to handle submission of studies, datasets, and biomaterials to a server. Attributes: - base_url (str): The base URL for the server. - submission_envelope_create_url (str): URL for creating submission envelopes. - submission_envelope_base_url (str): Base URL for submission envelopes. + BASE_URL (str): The base URL for the server. + SUBMISSION_ENVELOPE_CREATE_URL (str): URL for creating submission envelopes. + SUBMISSION_ENVELOPE_BASE_URL (str): Base URL for submission envelopes. args (Namespace): Command-line arguments. access_token (str): Access token for authorization. type (str): Type of submission (study, dataset, or biomaterial). @@ -191,9 +191,9 @@ class CmdSubmit: transform(file): Transforms the input file to a JSON object. put_to_provider_api(url, access_token): Sends a PUT request to the provider API. """ - base_url = 'https://api.ingest.dev.archive.morphic.bio' - submission_envelope_create_url = f"{base_url}/submissionEnvelopes/updateSubmissions" - submission_envelope_base_url = f"{base_url}/submissionEnvelopes" + BASE_URL = 'http://localhost:8080' + SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" + SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" def __init__(self, args): """ @@ -206,7 +206,7 @@ def __init__(self, args): self.access_token = get_profile('morphic-util').access_token self.type = getattr(self.args, 'type', None) self.file = getattr(self.args, 'file', None) - self.provider_api = APIProvider(self.base_url) + self.provider_api = APIProvider(self.BASE_URL) def run(self): """ @@ -311,11 +311,11 @@ def link_cell_line_with_expression_alterations(self, for expression_alteration in expression_alterations: if cell_line.expression_alteration_id is not None: if matching_expression_alteration_and_cell_line(cell_line, expression_alteration): - print(f"Linking cell line {cell_line_entity_id} " + print(f"Linking cell line {cell_line.biomaterial_id} " f"as derived by process of {expression_alteration.expression_alteration_id}") self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/derivedByProcesses", + f"{self.BASE_URL}/biomaterials/{cell_line_entity_id}/derivedByProcesses", expression_alteration.id, 'processes', access_token ) @@ -447,8 +447,8 @@ def create_differentiated_cell_line_entity(self, def link_cell_line_and_differentiated_cell_line(self, access_token, - cell_line_entity_id, - differentiated_entity_id, + cell_line, + differentiated_or_undifferentiated_cell_line, dataset_id, submission_envelope_id, action, @@ -475,9 +475,13 @@ def link_cell_line_and_differentiated_cell_line(self, The ID of the differentiation process entity created. """ if action.lower() != 'modify': + cell_line_biomaterial_id = cell_line.biomaterial_id + differentiated_or_undifferentiated_cell_line_biomaterial_id = differentiated_or_undifferentiated_cell_line.biomaterial_id + try: + print( - f"Cell line {cell_line_entity_id} has differentiated cell lines, creating differentiation process " + f"Cell line {cell_line_biomaterial_id} has differentiated cell lines, creating differentiation process " f"to link them") # Create a differentiation process entity @@ -489,27 +493,31 @@ def link_cell_line_and_differentiated_cell_line(self, ) print( - f"Linking Cell Line Biomaterial: {cell_line_entity_id} as input to process : {differentiation_process_entity_id}") + f"Linking Cell Line Biomaterial: {cell_line_biomaterial_id} as input to process : {differentiation_process_entity_id}") # Link the cell line entity as input to the differentiation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{cell_line_entity_id}/inputToProcesses", + f"{self.BASE_URL}/biomaterials/{cell_line.id}/inputToProcesses", differentiation_process_entity_id, 'processes', access_token ) - print(f"Linking Differentiated cell line Biomaterial: {differentiated_entity_id} " - f"as derived by process : {differentiation_process_entity_id}") + print( + f"Linking Differentiated cell line Biomaterial: " + f"{differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"as derived by process : {differentiation_process_entity_id}") # Link the differentiated cell line entity as derived by the differentiation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/derivedByProcesses", + f"{self.BASE_URL}/biomaterials/{differentiated_or_undifferentiated_cell_line.id}" + f"/derivedByProcesses", differentiation_process_entity_id, 'processes', access_token ) return differentiation_process_entity_id except Exception as e: errors.append( - f"Failed to update relations between Cell line {cell_line_entity_id} and Differentiated cell line {differentiated_entity_id}") + f"Failed to update relations between Cell line {cell_line_biomaterial_id} " + f"and Differentiated cell line {differentiated_or_undifferentiated_cell_line_biomaterial_id}") raise SubmissionError(errors, e) def handle_library_preparation(self, @@ -637,8 +645,8 @@ def create_library_preparation_entity(self, def link_differentiated_and_library_preparation(self, access_token, - differentiated_entity_id, - library_preparation_entity_id, + differentiated_or_undifferentiated_cell_line, + library_preparation, dataset_id, submission_envelope_id, action, @@ -665,8 +673,12 @@ def link_differentiated_and_library_preparation(self, The ID of the library preparation process entity created. """ if action.lower() != 'modify': + differentiated_or_undifferentiated_cell_line_biomaterial_id = differentiated_or_undifferentiated_cell_line.biomaterial_id + library_preparation_biomaterial_id = library_preparation.biomaterial_id + try: - print(f"Differentiated cell line {differentiated_entity_id} has library preparations, creating library " + print(f"Differentiated cell line {differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"has library preparations, creating library " f"preparation process to link them") # Create a library preparation process entity @@ -678,22 +690,22 @@ def link_differentiated_and_library_preparation(self, ) print( - f"Linking Differentiated Cell Line Biomaterial: {differentiated_entity_id} as input to library " - f"preparation process") + f"Linking Differentiated Cell Line Biomaterial: {differentiated_or_undifferentiated_cell_line_biomaterial_id} " + f"as input to library preparation process") # Link the differentiated cell line entity as input to the library preparation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{differentiated_entity_id}/inputToProcesses", + f"{self.BASE_URL}/biomaterials/{differentiated_or_undifferentiated_cell_line.id}/inputToProcesses", library_preparation_process_entity_id, 'processes', access_token ) print( - f"Linking Library Preparation Biomaterial: {library_preparation_entity_id} as derived by library " - f"preparation process") + f"Linking Library Preparation Biomaterial: {library_preparation_biomaterial_id} " + f"as derived by library preparation process") # Link the library preparation entity as derived by the library preparation process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/derivedByProcesses", + f"{self.BASE_URL}/biomaterials/{library_preparation.id}/derivedByProcesses", library_preparation_process_entity_id, 'processes', access_token ) @@ -701,8 +713,8 @@ def link_differentiated_and_library_preparation(self, except Exception as e: errors.append( f"Failed to update relations between Differentiated Cell line " - f"{differentiated_entity_id} and Library preparation" - f" {library_preparation_entity_id}") + f"{differentiated_or_undifferentiated_cell_line_biomaterial_id} and Library preparation" + f" {library_preparation_biomaterial_id}") raise SubmissionError(errors, e) def handle_sequencing_file(self, library_preparation_entity_id, sequencing_file, @@ -799,8 +811,8 @@ def create_sequencing_file_entity(self, access_token, dataset_id, library_prepar def link_library_preparation_and_sequencing_file(self, access_token, - library_preparation_entity_id, - sequencing_file_entity_id, + library_preparation, + sequencing_file, dataset_id, submission_envelope_id, action, @@ -827,9 +839,12 @@ def link_library_preparation_and_sequencing_file(self, The ID of the sequencing process entity created. """ if action.lower() != 'modify': + library_preparation_biomaterial_id = library_preparation.biomaterial_id + sequence_file_name = sequencing_file.file_name + try: - print(f"Library preparation {library_preparation_entity_id} has generated sequencing files." - f"Creating sequencing process to link the sequencing file") + print(f"Library preparation {library_preparation_biomaterial_id} has " + f"generated sequencing files. Creating sequencing process to link the sequencing file") # Create a sequencing process entity sequencing_process_entity_id = self.create_process(access_token, @@ -838,20 +853,22 @@ def link_library_preparation_and_sequencing_file(self, submission_envelope_id) print( - f"Linking Library preparation Biomaterial: {library_preparation_entity_id} as input to process: {sequencing_process_entity_id}") + f"Linking Library preparation Biomaterial: {library_preparation_biomaterial_id} " + f"as input to process: {sequencing_process_entity_id}") # Link the library preparation entity as input to the sequencing process self.perform_hal_linkage( - f"{self.base_url}/biomaterials/{library_preparation_entity_id}/inputToProcesses", + f"{self.BASE_URL}/biomaterials/{library_preparation.id}/inputToProcesses", sequencing_process_entity_id, 'processes', access_token ) print( - f"Linking Sequencing file: {sequencing_file_entity_id} as derived by process: {sequencing_process_entity_id}") + f"Linking Sequencing file: {sequence_file_name} as derived by process: " + f"{sequencing_process_entity_id}") # Link the sequencing file entity as derived by the sequencing process self.perform_hal_linkage( - f"{self.base_url}/files/{sequencing_file_entity_id}/derivedByProcesses", + f"{self.BASE_URL}/files/{sequencing_file.id}/derivedByProcesses", sequencing_process_entity_id, 'processes', access_token ) @@ -859,8 +876,8 @@ def link_library_preparation_and_sequencing_file(self, except Exception as e: errors.append( f"Failed to update relations between Library Preparation " - f"{library_preparation_entity_id} and Sequencing file" - f" {sequencing_file_entity_id}") + f"{library_preparation_biomaterial_id} and Sequencing file" + f" {sequence_file_name}") raise SubmissionError(errors, e) def create_process(self, access_token, dataset_id, process_data, submission_envelope_id): @@ -881,8 +898,8 @@ def create_process(self, access_token, dataset_id, process_data, submission_enve def establish_links(self, cell_lines, cell_lines_df, - differentiated_cell_lines, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, library_preparations, library_preparations_df, sequencing_files, @@ -911,22 +928,22 @@ def establish_links(self, """ try: for cell_line in cell_lines: - for differentiated_cell_line in differentiated_cell_lines: - if cell_line.biomaterial_id == differentiated_cell_line.input_biomaterial_id: + for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: + if cell_line.biomaterial_id == differentiated_or_undifferentiated_cell_line.input_biomaterial_id: self.link_cell_line_and_differentiated_cell_line(access_token, - cell_line.id, - differentiated_cell_line.id, + cell_line, + differentiated_or_undifferentiated_cell_line, dataset_id, submission_envelope_id, action, errors) - for differentiated_cell_line in differentiated_cell_lines: + for differentiated_or_undifferentiated_cell_line in differentiated_or_undifferentiated_cell_lines: for library_preparation in library_preparations: - if differentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: + if differentiated_or_undifferentiated_cell_line.biomaterial_id == library_preparation.differentiated_biomaterial_id: self.link_differentiated_and_library_preparation( access_token, - differentiated_cell_line.id, - library_preparation.id, + differentiated_or_undifferentiated_cell_line, + library_preparation, dataset_id, submission_envelope_id, action, @@ -936,8 +953,8 @@ def establish_links(self, for sequencing_file in sequencing_files: if library_preparation.biomaterial_id == sequencing_file.library_preparation_id: self.link_library_preparation_and_sequencing_file(access_token, - library_preparation.id, - sequencing_file.id, + library_preparation, + sequencing_file, dataset_id, submission_envelope_id, action, @@ -955,7 +972,7 @@ def establish_links(self, # sequencing_files_df = None return ([cell_lines_df, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines_df, library_preparations_df, sequencing_files_df], message) @@ -1027,15 +1044,16 @@ def create_new_envelope_and_submit_entity(self, input_entity_type, data, access_ 'biomaterial': 'biomaterials', 'process': 'processes' } + hal_entity = entity_map.get(input_entity_type) if not hal_entity: return None - entity_create_url = post_to_provider_api(self.submission_envelope_create_url, hal_entity, data, + entity_create_url = post_to_provider_api(self.SUBMISSION_ENVELOPE_CREATE_URL, hal_entity, data, access_token) entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) - entity_id = get_id_from_url(entity_self_hal_link) + entity_id = get_entity_id_from_hal_link(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -1054,7 +1072,7 @@ def patch_entity(self, input_entity_type, id, data, access_token): if not hal_entity: return False - entity_patch_url = f"{self.base_url}/{hal_entity}/{id}" + entity_patch_url = f"{self.BASE_URL}/{hal_entity}/{id}" return self.patch_to_provider_api(entity_patch_url, data, access_token) def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token): @@ -1068,7 +1086,7 @@ def link_to_dataset(self, input_entity_type, dataset_id, entity_id, access_token if not hal_entity: return False - put_url = f"{self.base_url}/datasets/{dataset_id}/{hal_entity}/{entity_id}" + put_url = f"{self.BASE_URL}/datasets/{dataset_id}/{hal_entity}/{entity_id}" return self.provider_api.put(put_url, access_token) def patch_to_provider_api(self, entity_patch_url, data, access_token): @@ -1105,9 +1123,9 @@ def use_existing_envelope_and_submit_entity(self, input_entity_type, data, submi if not hal_entity: return None - entity_create_url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/{hal_entity}" + entity_create_url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/{hal_entity}" entity_self_hal_link = post_to_provider_api(entity_create_url, 'self', data, access_token) - entity_id = get_id_from_url(entity_self_hal_link) + entity_id = get_entity_id_from_hal_link(entity_self_hal_link) print(f"{input_entity_type.capitalize()} created successfully: {entity_id}") @@ -1124,7 +1142,7 @@ def link_dataset_to_study(self, dataset_id, study_id, access_token): """ print(f"Linking dataset {dataset_id} to study {study_id}") - url = f"{self.base_url}/studies/{study_id}/datasets/{dataset_id}" + url = f"{self.BASE_URL}/studies/{study_id}/datasets/{dataset_id}" self.provider_api.put(url, access_token) print(f"Dataset linked successfully to study: {study_id}") @@ -1140,7 +1158,7 @@ def link_biomaterial_to_dataset(self, biomaterial_id, dataset_id, access_token): """ print(f"Linking biomaterial {biomaterial_id} to dataset {dataset_id}") - url = f"{self.base_url}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" + url = f"{self.BASE_URL}/datasets/{dataset_id}/biomaterials/{biomaterial_id}" self.provider_api.put(url, access_token) print(f"Biomaterial linked successfully to dataset: {dataset_id}") @@ -1156,7 +1174,7 @@ def link_biomaterial_to_process(self, biomaterial_id, process_id, access_token): """ print(f"Linking biomaterial {biomaterial_id} to process {process_id}") - url = f"{self.base_url}/biomaterials/{biomaterial_id}/inputToProcesses" + url = f"{self.BASE_URL}/biomaterials/{biomaterial_id}/inputToProcesses" self.perform_hal_linkage(url, process_id, 'processes', access_token) def delete_submission(self, submission_envelope_id, access_token, force_delete=False): @@ -1171,7 +1189,7 @@ def delete_submission(self, submission_envelope_id, access_token, force_delete=F Returns: bool: True if the deletion was successful, False otherwise. """ - url = f"{self.submission_envelope_base_url}/{submission_envelope_id}" + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}" headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {access_token}' @@ -1201,7 +1219,7 @@ def perform_hal_linkage(self, url, input_id, link_to, access_token): 'Authorization': f'Bearer {access_token}' } - response = requests.post(url, headers=headers, data=f"{self.base_url}/{link_to}/{input_id}") + response = requests.post(url, headers=headers, data=f"{self.BASE_URL}/{link_to}/{input_id}") if response.status_code != 200: raise Exception(f"Failed to link biomaterial to process {input_id}. " @@ -1210,7 +1228,7 @@ def perform_hal_linkage(self, url, input_id, link_to, access_token): print("Linkage successful") def create_child_biomaterial(self, cell_line_entity_id, body, access_token): - url = f"{self.base_url}/biomaterials/{cell_line_entity_id}/childBiomaterials" + url = f"{self.BASE_URL}/biomaterials/{cell_line_entity_id}/childBiomaterials" entity_id = post_to_provider_api_and_get_entity_id(url, body, access_token) return entity_id @@ -1226,10 +1244,10 @@ def link_entity_to_envelope(self, type, entity_id, submission_envelope_id, acces access_token (str): Access token for authorization. """ if type == 'biomaterial': - url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/biomaterials/{entity_id}" + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/biomaterials/{entity_id}" self.provider_api.put(url, access_token) elif type == 'file': - url = f"{self.submission_envelope_base_url}/{submission_envelope_id}/files/{entity_id}" + url = f"{self.SUBMISSION_ENVELOPE_BASE_URL}/{submission_envelope_id}/files/{entity_id}" self.provider_api.put(url, access_token) def delete_dataset(self, dataset, access_token): @@ -1240,7 +1258,7 @@ def delete_dataset(self, dataset, access_token): dataset (str): The ID of the dataset to delete. access_token (str): Access token for authorization. """ - fetched_dataset = self.provider_api.get(f"{self.base_url}/datasets/{dataset}", access_token) + fetched_dataset = self.provider_api.get(f"{self.BASE_URL}/datasets/{dataset}", access_token) print(f"Dataset fetched successfully: {dataset}") print(f"Initiating delete of {dataset}") @@ -1251,17 +1269,17 @@ def delete_dataset(self, dataset, access_token): print("Deleting Biomaterials:") for biomaterial in biomaterials: print(f"Deleting {biomaterial}") - self.provider_api.delete(f"{self.base_url}/biomaterials/{biomaterial}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/biomaterials/{biomaterial}", access_token) print("\nDeleting Processes:") for process in processes: print(f"Deleting {process}") - self.provider_api.delete(f"{self.base_url}/processes/{process}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/processes/{process}", access_token) print("\nDeleting Data Files:") for data_file in data_files: print(f"Deleting {data_file}") - self.provider_api.delete(f"{self.base_url}/files/{data_file}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/files/{data_file}", access_token) print(f"\nDeleting the dataset: {dataset}") - self.provider_api.delete(f"{self.base_url}/datasets/{dataset}", access_token) + self.provider_api.delete(f"{self.BASE_URL}/datasets/{dataset}", access_token) diff --git a/ait/commons/util/command/submit_file.py b/ait/commons/util/command/submit_file.py index 405d9e1..2ac893f 100644 --- a/ait/commons/util/command/submit_file.py +++ b/ait/commons/util/command/submit_file.py @@ -7,7 +7,7 @@ import pandas as pd from ait.commons.util.aws_client import Aws from ait.commons.util.command.list import CmdList -from ait.commons.util.command.submit import CmdSubmit, get_id_from_url, create_new_submission_envelope +from ait.commons.util.command.submit import CmdSubmit, get_entity_id_from_hal_link, create_new_submission_envelope from ait.commons.util.command.upload import CmdUpload from ait.commons.util.user_profile import get_profile from ait.commons.util.provider_api_util import APIProvider @@ -73,7 +73,7 @@ def _create_expression_alterations(submission_instance, class CmdSubmitFile: - BASE_URL = 'https://api.ingest.dev.archive.morphic.bio' + BASE_URL = 'http://localhost:8080' SUBMISSION_ENVELOPE_CREATE_URL = f"{BASE_URL}/submissionEnvelopes/updateSubmissions" SUBMISSION_ENVELOPE_BASE_URL = f"{BASE_URL}/submissionEnvelopes" @@ -98,7 +98,7 @@ def __init__(self, args): self.dataset = self._get_required_arg('dataset', ( "Dataset is mandatory to be registered before submitting dataset metadata. " "Please submit your study using the submit option, register your dataset using " - "the same option, and link your dataset to your study before proceeding with this submission." + "the submit option, and link your dataset to your study before proceeding with this submission." )) # Validate file argument only if action is not DELETE @@ -183,23 +183,39 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) cell_lines_df = parsed_data['cell_lines_df'] differentiated_cell_lines = parsed_data['differentiated_cell_lines'] differentiated_cell_lines_df = parsed_data['differentiated_cell_lines_df'] + undifferentiated_cell_lines = parsed_data['undifferentiated_cell_lines'] + undifferentiated_cell_lines_df = parsed_data['undifferentiated_cell_lines_df'] library_preparations = parsed_data['library_preparations'] library_preparations_df = parsed_data['library_preparations_df'] sequencing_files = parsed_data['sequencing_files'] sequencing_files_df = parsed_data['sequencing_files_df'] + differentiated = parsed_data['differentiated'] + cell_line_sheet_name = parsed_data['cell_line_sheet_name'] + + if differentiated: + differentiated_or_undifferentiated_cell_line_sheet_name = parsed_data[ + 'differentiated_cell_line_sheet_name'] + else: + differentiated_or_undifferentiated_cell_line_sheet_name = parsed_data[ + 'undifferentiated_cell_line_sheet_name'] # Initialize lists for created entities created_expression_alterations = [] created_cell_lines = [] - created_differentiated_cell_lines = [] + created_differentiated_or_undifferentiated_cell_lines = [] created_library_preparations = [] created_sequencing_files = [] if self._is_add_action(): self._create_submission_envelope() - parent_cell_line_id = self._handle_parent_cell_line(submission_instance, parent_cell_line_name) + parent_cell_line_id = self._handle_parent_cell_line(submission_instance, + parent_cell_line_name) created_expression_alterations = self._handle_expression_alterations( - submission_instance, expression_alterations, expression_alterations_df, parent_cell_line_id + submission_instance, + expression_alterations, + expression_alterations_df, + parent_cell_line_name, + parent_cell_line_id ) if cell_lines and cell_lines_df is not None: @@ -207,9 +223,14 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) submission_instance, cell_lines, cell_lines_df, created_expression_alterations) if differentiated_cell_lines and differentiated_cell_lines_df is not None: - created_differentiated_cell_lines = self._create_differentiated_cell_lines( + created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( submission_instance, differentiated_cell_lines, differentiated_cell_lines_df) + if (undifferentiated_cell_lines and undifferentiated_cell_lines_df is not None + and not differentiated): + created_differentiated_or_undifferentiated_cell_lines = self._create_differentiated_cell_lines( + submission_instance, undifferentiated_cell_lines, undifferentiated_cell_lines_df) + if library_preparations and library_preparations_df is not None: created_library_preparations = self._create_library_preparations( submission_instance, library_preparations, library_preparations_df) @@ -221,17 +242,22 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) updated_dfs, message = self._establish_links(submission_instance, created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, + created_differentiated_or_undifferentiated_cell_lines, + differentiated_cell_lines_df if differentiated_cell_lines_df is not None else undifferentiated_cell_lines_df, created_library_preparations, library_preparations_df, created_sequencing_files, sequencing_files_df) if message == 'SUCCESS': - self._save_and_upload_results(updated_dfs, expression_alterations_df) + self._save_and_upload_results(updated_dfs, + expression_alterations_df, + cell_line_sheet_name, + differentiated_or_undifferentiated_cell_line_sheet_name) else: - return self._delete_actions(self.submission_envelope_id, submission_instance, None) + return self._delete_actions(self.submission_envelope_id, + submission_instance, + None) except ValidationError as e: print(f"Validation Error: {e.errors}") # self._delete_actions(self.submission_envelope_id, submission_instance, e) @@ -248,39 +274,52 @@ def _process_submission(self, submission_instance, list_of_files_in_upload_area) def _handle_parent_cell_line(self, submission_instance, parent_cell_line_name): """Handles the creation of a parent cell line.""" parent_cell_line_id = None + if parent_cell_line_name: print(f"Creating parental cell line with name {parent_cell_line_name}") parent_cell_line_id = self._submit_parent_cell_line(submission_instance, parent_cell_line_name) print(f"Parental cell line with name {parent_cell_line_name} created with id: {parent_cell_line_id}") + return parent_cell_line_id def _handle_expression_alterations(self, submission_instance, expression_alterations, expression_alterations_df, + parent_cell_line_name, parent_cell_line_id): """Handles the creation of expression alterations and links them to the parent cell line if needed.""" created_expression_alterations = [] + if expression_alterations and expression_alterations_df is not None: created_expression_alterations = self._submit_expression_alterations( submission_instance, expression_alterations, expression_alterations_df ) + if created_expression_alterations and parent_cell_line_id: self._link_parent_cell_line_expression_alteration( - submission_instance, self.access_token, parent_cell_line_id, created_expression_alterations + submission_instance, + self.access_token, + parent_cell_line_name, + parent_cell_line_id, + created_expression_alterations ) + return created_expression_alterations def _parse_spreadsheet(self, parser): try: # Determine the necessary sheet names tab_names = parser.list_sheets() + cell_line_sheet_name = next( (name for name in ["Cell line", "Clonal cell line"] if name in tab_names), None ) + differentiated_cell_line_sheet_name = next( (name for name in ["Differentiated cell line", "Differentiated product"] if name in tab_names), None ) + undifferentiated_cell_line_sheet_name = ( "Undifferentiated product" if "Undifferentiated product" in tab_names else None ) @@ -288,10 +327,16 @@ def _parse_spreadsheet(self, parser): undifferentiated_cell_lines = [] undifferentiated_cell_lines_df = None + differentiated_cell_lines = [] + differentiated_cell_lines_df = None + + differentiated = False + # Validate the presence of required sheets if not cell_line_sheet_name: self.validation_errors.append("Spreadsheet must contain a " "'Cell line' or 'Clonal cell line' sheet.") + if not (differentiated_cell_line_sheet_name or undifferentiated_cell_line_sheet_name): self.validation_errors.append( "Spreadsheet must contain a " @@ -303,12 +348,15 @@ def _parse_spreadsheet(self, parser): expression_alterations, expression_alterations_df = parser.get_expression_alterations( 'Expression alteration strategy', self.action, self.validation_errors ) + cell_lines, cell_lines_df, parent_cell_line_name = parser.get_cell_lines( cell_line_sheet_name, self.action, self.validation_errors ) - differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( - differentiated_cell_line_sheet_name, self.action, self.validation_errors - ) + + if differentiated_cell_line_sheet_name: + differentiated_cell_lines, differentiated_cell_lines_df = parser.get_differentiated_cell_lines( + differentiated_cell_line_sheet_name, self.action, self.validation_errors + ) if undifferentiated_cell_line_sheet_name: undifferentiated_cell_lines, undifferentiated_cell_lines_df = parser.get_undifferentiated_cell_lines( @@ -318,25 +366,36 @@ def _parse_spreadsheet(self, parser): # Check for errors and merge data if differentiated_cell_lines and undifferentiated_cell_lines: self.validation_errors.append( - "A spreadsheet cannot contain rows in both differentiated and undifferentiated cell lines/products" + "A spreadsheet cannot contain rows in both differentiated and undifferentiated cell lines/ products" ) if differentiated_cell_lines: + differentiated = True merge_cell_line_and_differentiated_cell_line(cell_lines, differentiated_cell_lines, self.validation_errors) - if undifferentiated_cell_lines: + + if undifferentiated_cell_lines and not differentiated: merge_cell_line_and_differentiated_cell_line(cell_lines, undifferentiated_cell_lines, self.validation_errors) library_preparations, library_preparations_df = parser.get_library_preparations( 'Library preparation', self.action, self.validation_errors ) - merge_differentiated_cell_line_and_library_preparation( - differentiated_cell_lines, library_preparations, self.validation_errors - ) + + if differentiated_cell_lines: + merge_differentiated_cell_line_and_library_preparation( + differentiated_cell_lines, library_preparations, self.validation_errors + ) + + if undifferentiated_cell_lines and not differentiated: + merge_differentiated_cell_line_and_library_preparation( + undifferentiated_cell_lines, library_preparations, self.validation_errors + ) + sequencing_files, sequencing_files_df = parser.get_sequencing_files( 'Sequence file', self.action, self.validation_errors ) + merge_library_preparation_sequencing_file(library_preparations, sequencing_files, self.validation_errors) # Return the parsed data as a dictionary @@ -354,6 +413,10 @@ def _parse_spreadsheet(self, parser): "library_preparations_df": library_preparations_df, "sequencing_files": sequencing_files, "sequencing_files_df": sequencing_files_df, + "differentiated": differentiated, + "cell_line_sheet_name": cell_line_sheet_name, + "differentiated_cell_line_sheet_name": differentiated_cell_line_sheet_name, + "undifferentiated_cell_line_sheet_name": undifferentiated_cell_line_sheet_name } except Exception: self.validation_errors.append(f"Spreadsheet is invalid {self.file}") @@ -414,7 +477,8 @@ def _create_submission_envelope(self): self.SUBMISSION_ENVELOPE_CREATE_URL, access_token=self.access_token ) if status_code in (200, 201): - self.submission_envelope_id = get_id_from_url(submission_envelope_response['_links']['self']['href']) + self.submission_envelope_id = get_entity_id_from_hal_link( + submission_envelope_response['_links']['self']['href']) print(f"Submission envelope for this submission is: {self.submission_envelope_id}") else: raise SubmissionError(f"Failed to create submission envelope. Status code: {status_code}") @@ -505,8 +569,8 @@ def _establish_links(self, submission_instance, created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, created_library_preparations, library_preparations_df, created_sequencing_files, @@ -516,8 +580,8 @@ def _establish_links(self, updated_dfs, message = submission_instance.establish_links( created_cell_lines, cell_lines_df, - created_differentiated_cell_lines, - differentiated_cell_lines_df, + differentiated_or_undifferentiated_cell_lines, + differentiated_or_undifferentiated_cell_lines_df, created_library_preparations, library_preparations_df, created_sequencing_files, @@ -531,15 +595,19 @@ def _establish_links(self, return updated_dfs, message - def _save_and_upload_results(self, updated_dfs, expression_alteration_df): + def _save_and_upload_results(self, + updated_dfs, + expression_alteration_df, + cell_line_sheet_name, + differentiated_or_undifferentiated_cell_line_sheet_name): """Save the updated dataframes and upload the results.""" current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') output_file = f"submission_result_{current_time}.xlsx" try: # List of updated DataFrames and corresponding sheet names dataframes = [ - (updated_dfs[0], 'Cell line'), - (updated_dfs[1], 'Differentiated cell line'), + (updated_dfs[0], cell_line_sheet_name), + (updated_dfs[1], differentiated_or_undifferentiated_cell_line_sheet_name), (updated_dfs[2], 'Library preparation'), (updated_dfs[3], 'Sequence file'), (expression_alteration_df, 'Expression alteration strategy') @@ -597,10 +665,11 @@ def _handle_modify_action_failure(self, error): def _link_parent_cell_line_expression_alteration(self, submission_instance, access_token, + parent_cell_line_name, parent_cell_line_id, created_expression_alterations): for expression_alteration in created_expression_alterations: - print(f"Linking parent cell line {parent_cell_line_id} " + print(f"Linking parent cell line {parent_cell_line_name} " f"as input to process of {expression_alteration.expression_alteration_id}") submission_instance.perform_hal_linkage( f"{self.BASE_URL}/biomaterials/{parent_cell_line_id}/inputToProcesses",