23 commits
db1aab0
Merge pull request #6 from ebi-ait/feature/submission-envelop-uuid-fo…
dipayan1985 Oct 1, 2024
b4c1f97
check if valid dataset is provided
dipayan1985 Oct 3, 2024
fed8578
upgrade version
dipayan1985 Oct 8, 2024
b84ccf9
adapt as per v7 of spreadsheet
dipayan1985 Oct 18, 2024
18b808a
prod
dipayan1985 Nov 8, 2024
010f7d0
md5 sums computation while listing files
dipayan1985 Dec 2, 2024
fa13fd0
don't delete the dataset object
dipayan1985 Dec 9, 2024
fad84a4
Merge pull request #8 from ebi-ait/feature/36-md5-sums-compute-while-…
dipayan1985 Dec 9, 2024
f789643
prod recording related changes
dipayan1985 Mar 19, 2025
013aeb4
Support MSK dataset ingestion
KociOrges Mar 21, 2025
2e3248e
Support UCSF dataset ingestion
KociOrges Mar 25, 2025
bf89adc
Add a --context argument to support the new "unperturbed_multiple" pr…
KociOrges Mar 26, 2025
1227165
Update linking logic to branch between legacy and new behaviour based…
KociOrges Mar 26, 2025
2a2c38a
Integrate retry logic to handle ConnectionError and Timeout in HTTP r…
KociOrges Mar 26, 2025
139b6ab
Update comments section for establish_links()
KociOrges Mar 27, 2025
7c223f3
Add parsing for expression_alteration_genes in pooled datasets
KociOrges Apr 15, 2025
c20bada
Update README to document --context option and command help
KociOrges May 7, 2025
6ca6ff5
Add support for new dataset submission arguments: --study, --dataset-…
KociOrges Jun 5, 2025
d4072eb
Validate dataset type and derived-from relationships
KociOrges Jun 5, 2025
7b88f5b
Add PATCH helpers for dataset linkage and updates
KociOrges Jun 5, 2025
7dcd3a0
Add dataset type and derived-from options and validation rules
KociOrges Jun 5, 2025
3cec3d5
Query ingest DB and reuse existing clonal cell lines
KociOrges Jul 11, 2025
8617f96
Document automatic clonal-cell-line reuse & INGEST_API_BASE override
KociOrges Jul 11, 2025
59 changes: 51 additions & 8 deletions README.md
@@ -38,7 +38,8 @@ command:
{config,submit,submit-file,create,select,list,upload,download,delete}
config configure AWS credentials
submit submit your study, dataset or biomaterials metadata (incomplete as not all metadata types are supported yet; expected to be completed in August 2024)
submit-file submit your metadata file containing your cell lines, differentiated cell lines, library preparations and sequencing files
submit-file submit your metadata file with cell lines, differentiated products, library preparations, sequencing files,
and optionally context-specific data (e.g., pooled or unperturbed experiments)
create create an upload area (authorised users only)
select select or show the active upload area
list list contents of the area
@@ -59,6 +60,16 @@ Use the tool by specifying a command (`cmd` - see list below) to run, any mandatory arguments (e.g. `ARG1`
and `ARG2` - see positional args for each command), and any optional arguments (e.g. `-o1` and `o2` - see options for
each command).

### What’s new

**Automatic clonal-cell-line reuse** – if a clonal cell-line label in your
spreadsheet already exists in the ingest database, `morphic-util` will detect
it and link to the existing record instead of creating a duplicate.

**Configurable ingest endpoint** – set the environment variable
`INGEST_API_BASE` (read by `spreadsheet_util.py:parse_cell_lines`; defaults to
`https://api.ingest.archive.morphic.bio`) to target a different ingest deployment
without editing code.
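
For example, to target a non-default deployment (the URL below is a placeholder,
not a real endpoint):

```shell script
export INGEST_API_BASE=https://ingest.staging.example.org
morphic-util submit-file --file my_file.xlsx --action ADD --dataset <DATASET_ID>
```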

## Commands

Help with specific command:
@@ -88,22 +99,48 @@ Submit your study and dataset metadata and create your AWS upload area for uploading data files

```shell script
positional arguments:
$ morphic-util submit --type <TYPE> --file <PATH_TO_FILE>

--type type of metadata being submitted (e.g. study or dataset)
--file path to the file containing the metadata
$ morphic-util submit --type <TYPE> --file <PATH_TO_FILE> [--study <STUDY_ID>] --dataset-type <DATASET_TYPE> [--derived-from <PARENT_IDS>]

Required:
--type: type of metadata being submitted (e.g. study or dataset)
--file: path to the file containing the metadata

Required for datasets:
--dataset-type: Dataset type (e.g., raw, filtered, processed, analysis)

Conditionally required for datasets:
--derived-from: Comma-separated list of dataset IDs this dataset is derived from

Optional (for datasets):
--study: Link the dataset to an existing study

Validation rules (for datasets):
raw: Must not include --derived-from
filtered, processed: Must be derived from a raw dataset
analysis: Must be derived from a processed dataset
```
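
Example usage (the spreadsheet names and dataset IDs below are placeholders):

```shell script
# raw datasets have no parent
morphic-util submit --type dataset --file raw_ds.xlsx --study <STUDY_ID> --dataset-type raw

# filtered/processed datasets must be derived from a raw dataset
morphic-util submit --type dataset --file processed_ds.xlsx --dataset-type processed --derived-from <RAW_DATASET_ID>

# analysis datasets must be derived from a processed dataset
morphic-util submit --type dataset --file analysis_ds.xlsx --dataset-type analysis --derived-from <PROCESSED_DATASET_ID>
```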

## `submit-file` command
Submit your metadata spreadsheet (cell lines, differentiated products, library preparations, sequencing files) for an existing dataset

```shell script
positional arguments:
$ morphic-util submit-file --file <PATH_TO_FILE> --action <SUBMISSION_ACTION> --dataset <the analysis which has generated the data and the metadata>
$ morphic-util submit-file --file <PATH_TO_FILE> --action <SUBMISSION_ACTION> --dataset <the analysis which has generated the data and the metadata> [--context <CONTEXT>]

positional arguments:
--file path to the file containing the metadata
--action ADD, MODIFY or DELETE based on the type of submission
--dataset the identifier for the analysis

optional arguments:
--context optional ingestion context, e.g.:
'pooled_differentiated' → for MSK pooled datasets
'unperturbed_multiple' → for UCSF datasets
If omitted, legacy behavior is used
```
Example usage:
```shell script
morphic-util submit-file --file my_file.xlsx --action ADD --dataset 67f8519e68005a3744c40fcf --context pooled_differentiated
```

## `create` command
@@ -205,11 +242,17 @@ $ morphic-util submit --type study --file <PATH_TO_STUDY_METADATA_FILE>
### Create your dataset and link it to your study
```shell script
positional arguments:
$ morphic-util submit --type dataset --file <PATH_TO_DATASET_METADATA_FILE> --study <STUDY_ID>

$ morphic-util submit --type dataset --file <PATH_TO_DATASET_METADATA_FILE> [--study <STUDY_ID>] [--dataset-type <TYPE>] [--derived-from <PARENT_IDS>]
--type type of metadata being submitted (here it is dataset)
--file path to the file containing the metadata (optional)
--study STUDY_ID obtained in the last step
--dataset-type: One of raw, filtered, processed, or analysis (required)
--derived-from: Comma-separated list of dataset IDs this dataset is derived from (required for all except raw)

Validation rules:
raw: Must not have --derived-from
filtered or processed: Must be derived from raw
analysis: Must be derived from one or more processed datasets
```
### `select` your upload area to upload your data files (the upload area name is the same as your DATASET_ID)
Show or select the data file upload area
46 changes: 38 additions & 8 deletions ait/commons/util/__main__.py
@@ -79,17 +79,27 @@ def parse_args(args):
parser_config.add_argument('PASSWORD', help='AWS Cognito password', nargs='?')
parser_config.add_argument('--bucket', help='use BUCKET instead of default bucket')

parser_config = cmd_parser.add_parser('submit', help='submit your metadata')
parser_config.add_argument('--type', help='data type you are submitting, e.g. study, dataset')
parser_config.add_argument('--file', help='your metadata')
parser_config.add_argument('--study', help='your study reference')
parser_config.add_argument('--dataset', help='your dataset reference')
parser_config.add_argument('--process', help='your process/analysis reference')
parser_submit = cmd_parser.add_parser('submit', help='submit your metadata')
parser_submit.add_argument('--type', required=True, choices=['study', 'dataset'], help='data type you are submitting')
parser_submit.add_argument('--file', required=True, help='your metadata file path')
parser_submit.add_argument('--study', help='your study reference')
parser_submit.add_argument('--dataset', help='your dataset reference')
parser_submit.add_argument('--process', help='your process/analysis reference')
parser_submit.add_argument('--dataset-type', choices=['raw', 'processed', 'filtered', 'analysis'],
help='dataset type (required if --type=dataset)')
parser_submit.add_argument('--derived-from', help='Comma-separated dataset IDs this dataset is derived from')

parser_config = cmd_parser.add_parser('submit-file', help='submit your file containing your dataset metadata')
parser_config.add_argument('--file', help='spreadsheet containing your dataset metadata')
parser_config.add_argument('--action', help='action you want to perform (ADD/MODIFY/DELETE)')
parser_config.add_argument('--dataset', help='your dataset reference')
parser_config.add_argument(
'--context',
help="Optional context for ingestion (e.g. 'pooled_differentiated' for MSK pooled mode or "
"'unperturbed_multiple' for UCSF mode)."
"If omitted, legacy behavior is used.",
default=None
)

parser_config = cmd_parser.add_parser('view', help='view your dataset')
parser_config.add_argument('--dataset', help='your dataset reference')
@@ -113,7 +123,8 @@ def parse_args(args):
# parser_clear.add_argument('-a', action='store_true', help='clear all - selection and known dirs')

parser_list = cmd_parser.add_parser('list', help='list contents of the area')
parser_list.add_argument('-b', action='store_true', help='list all areas in the S3 bucket (authorised users only)')
parser_list.add_argument('-processing', action='store_true', help='access the processed data (authorised users '
'only)')

# parser_upload = cmd_parser.add_parser('upload', help='upload files to the area')
# group_upload = parser_upload.add_mutually_exclusive_group(required=True)
@@ -143,7 +154,8 @@ def parse_args(args):
group_delete.add_argument('-d', action='store_true', help='delete upload area and contents (authorised users only)')

parser_sync = cmd_parser.add_parser('sync',
help='copy data from selected upload area to ingest upload area (authorised users only)')
help='copy data from selected upload area to ingest upload area (authorised '
'users only)')
parser_sync.add_argument('INGEST_UPLOAD_AREA', help='Ingest upload area', type=valid_ingest_upload_area)

ps = [parser]
@@ -170,6 +182,24 @@ def parse_args(args):
def main():
try:
parsed_args = parse_args(sys.argv[1:])

if parsed_args.command == 'submit' and parsed_args.type == 'dataset':
if not parsed_args.dataset_type:
print("Error: --dataset-type is required when submitting a dataset", file=sys.stderr)
sys.exit(1)

if parsed_args.dataset_type == 'raw' and parsed_args.derived_from:
print("Error: --derived-from is not allowed for 'raw' datasets", file=sys.stderr)
sys.exit(1)

if parsed_args.dataset_type in ['processed', 'filtered'] and not parsed_args.derived_from:
print("Error: --derived-from is required for 'processed' or 'filtered' datasets", file=sys.stderr)
sys.exit(1)

if parsed_args.dataset_type == 'analysis' and not parsed_args.derived_from:
print("Error: --derived-from is required for 'analysis' datasets", file=sys.stderr)
sys.exit(1)

Cmd(parsed_args)
except KeyboardInterrupt:
# If SIGINT is triggered whilst threads are active (upload/download) we kill the entire process to give the
96 changes: 87 additions & 9 deletions ait/commons/util/command/list.py
@@ -1,5 +1,10 @@
import hashlib
import csv

from ait.commons.util.common import format_err
from ait.commons.util.local_state import get_selected_area
from ait.commons.util.user_profile import get_profile
from urllib.parse import urlparse


def print_area(k, area):
@@ -20,6 +25,34 @@ def print_area(k, area):
print()


def get_s3_path():
while True:
s3_path = input("Enter the S3 path (e.g., s3://bucket-name/folder/): ").strip()
parsed_url = urlparse(s3_path)

if parsed_url.scheme == 's3' and parsed_url.netloc:
return s3_path
else:
print("Invalid S3 path. Please enter a valid S3 path starting with 's3://'.")


def calculate_md5(s3_client, bucket_name, key):
md5_hash = hashlib.md5()

try:
# Stream the object in chunks
response = s3_client.get_object(Bucket=bucket_name, Key=key)

for chunk in response['Body'].iter_chunks(chunk_size=8192):
md5_hash.update(chunk)

return md5_hash.hexdigest()
except Exception as e:
print(f"Failed to compute MD5 for {key}: {e}")

return None
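
# Note: the MD5 is computed client-side by streaming each object because S3
# ETags are not guaranteed to be MD5 digests (e.g. for multipart uploads).
# Usage sketch (client, bucket and key are placeholders):
#   s3 = boto3.client('s3')
#   digest = calculate_md5(s3, 'my-bucket', 'path/to/file.fastq.gz')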


class CmdList:
"""
admin and user
@@ -29,22 +62,67 @@ class CmdList:
def __init__(self, aws, args):
self.aws = aws
self.args = args
self.user = get_profile('morphic-util').username
self.processing = getattr(self.args, 'processing', None)

self.s3_cli = self.aws.common_session.client('s3')

def run(self):
selected_area = get_selected_area() # the selected area is an S3 bucket
if self.processing:
if self.user != 'morphic-admin':
return False, "Admin function only"
else:
print("Access granted")

s3_path = get_s3_path()
self.list_s3_files(s3_path)

return True, None

else:
selected_area = get_selected_area() # the selected area is an S3 bucket

if not selected_area:
return False, 'No area selected'

try:
self.list_bucket_contents(selected_area)
# print_count(folder_count + files_count)
return True, None

except Exception as e:
return False, format_err(e, 'list')

def list_s3_files(self, s3_path):
parsed_url = urlparse(s3_path)
bucket_name = parsed_url.netloc
prefix = parsed_url.path.lstrip('/')
output_file = 's3_file_md5s.tsv'

with open(output_file, 'w', newline='') as csvfile:
tsv_writer = csv.writer(csvfile, delimiter='\t') # tab delimiter to match the .tsv extension
tsv_writer.writerow(['File Name', 'MD5 Hash']) # Write header row

try:
response = self.s3_cli.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
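# Note: list_objects_v2 returns at most 1000 keys per request; a larger
# prefix would need pagination (e.g. via a paginator or ContinuationToken).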

if 'Contents' in response:
print(f"\nFiles in '{s3_path}'")

if not selected_area:
return False, 'No area selected'
for obj in response['Contents']:
file_key = obj['Key']
if not file_key.endswith('/'): # Skip folders
md5_hash = calculate_md5(self.s3_cli, bucket_name, file_key)

try:
self.list_bucket_contents(selected_area)
# print_count(folder_count + files_count)
return True, None
if md5_hash:
print(f"{file_key} - MD5: {md5_hash}")
tsv_writer.writerow([file_key, md5_hash]) # Write to file
else:
print("\nNo files found.")
except Exception as e:
print(f"\nError: {e}")

except Exception as e:
return False, format_err(e, 'list')
print(f"\nResults saved to {output_file}")

def list_bucket_contents(self, selected_area, prefix=''):
result = self.s3_cli.list_objects_v2(Bucket=selected_area, Delimiter='/', Prefix=prefix)