hed-standard · VisLab · Sep 27, 2025 · Sep 27, 2025 · Sep 27, 2025 · Sep 27, 2025
diff --git a/.github/workflows/ci_windows.yaml b/.github/workflows/ci_windows.yaml
@@ -39,4 +39,4 @@ jobs:
         env:
           HED_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          python -m unittest
+          python -m unittest discover tests
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+
+
 #################
 ## Eclipse
 #################
@@ -110,8 +112,12 @@ site/
 # Pycharm
 .idea/
 /venv/
+/.venv/
 config.py
 
+# VS Code
+.vscode/
+
 ############
 ## Windows
 ############
@@ -128,3 +134,6 @@ hed_cache/
 spec_tests/hed-specification/tests
 spec_tests/hed-examples
 spec_tests/*.json
+
+# GitHub Copilot instructions (project-specific)
+.github/copilot-instructions.md
diff --git a/hed/scripts/validate_bids.py b/hed/scripts/validate_bids.py
@@ -1,5 +1,24 @@
+#!/usr/bin/env python3
+"""
+Command-line script for validating BIDS datasets with HED annotations.
+
+Logging Options:
+- Default: WARNING level logs go to stderr (quiet unless there are issues)
+- --verbose or --log-level INFO: Show informational messages about progress
+- --log-level DEBUG: Show detailed debugging information
+- --log-file FILE: Also save logs to FILE (overwritten each run); stderr output continues unless --log-quiet is given
+- --log-quiet: When using --log-file, suppress stderr output (file only)
+
+Examples:
+  validate_bids /path/to/dataset                    # Quiet validation
+  validate_bids /path/to/dataset --verbose         # Show progress
+  validate_bids /path/to/dataset --log-level DEBUG # Detailed debugging
+  validate_bids /path/to/dataset --log-file log.txt --log-quiet  # Log to file only
+"""
+
 import argparse
 import json
+import logging
 import sys
 
 def get_parser():
@@ -21,8 +40,15 @@ def get_parser():
                         help = "Optional list of suffixes (no under_bar) of tsv files to validate." +
                                " If -s with no values, will use all possible suffixes as with single argument '*'.")
 
+    parser.add_argument("-l", "--log-level", type=str.upper,  # upper-cased before the choices check
+                        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+                        default="WARNING", help="Log level (case insensitive). Default: WARNING")
+    parser.add_argument("-lf", "--log-file", dest="log_file", default=None,
+                        help="Full path to save log output to file. If not specified, logs go to stderr.")
+    parser.add_argument("-lq", "--log-quiet", action='store_true', dest="log_quiet",
+                        help="If present, suppress log output to stderr (only applies if --log-file is used).")
     parser.add_argument("-v", "--verbose", action='store_true',
-                        help="If present, output informative messages as computation progresses.")
+                        help="If present, output informative messages as computation progresses (equivalent to --log-level INFO).")
     parser.add_argument("-w", "--check_for_warnings", action='store_true', dest="check_for_warnings",
                         help="If present, check for warnings as well as errors.")
     parser.add_argument("-x", "--exclude-dirs", nargs="*", default=['sourcedata', 'derivatives', 'code', 'stimuli'],
@@ -37,7 +63,51 @@ def main(arg_list=None):
 
     # Parse the arguments
     args = parser.parse_args(arg_list)
-    issue_list = validate_dataset(args)
+
+    # Resolve the numeric log level: --verbose guarantees at least INFO output but
+    # must never make an explicit --log-level DEBUG less verbose.
+    level = getattr(logging, (args.log_level or 'WARNING').upper())
+    if args.verbose:
+        level = min(level, logging.INFO)
+    log_level = logging.getLevelName(level)
+
+    # Configure logging format
+    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    date_format = '%Y-%m-%d %H:%M:%S'
+    formatter = logging.Formatter(log_format, datefmt=date_format)
+
+    # Clear any existing handlers from root logger; its level does the actual filtering
+    root_logger = logging.getLogger()
+    for handler in root_logger.handlers[:]:
+        root_logger.removeHandler(handler)
+    root_logger.setLevel(level)
+
+    # Handlers owned by this call: the log file (if requested) and stderr unless quieted
+    handlers = []
+    if args.log_file:
+        handlers.append(logging.FileHandler(args.log_file, mode='w', encoding='utf-8'))
+    if not (args.log_quiet and args.log_file):
+        handlers.append(logging.StreamHandler(sys.stderr))
+    for handler in handlers:
+        handler.setLevel(level)
+        handler.setFormatter(formatter)
+        root_logger.addHandler(handler)
+
+    logger = logging.getLogger('validate_bids')
+    logger.info(f"Starting BIDS validation with log level: {log_level}")
+    if args.log_file:
+        logger.info(f"Log output will be saved to: {args.log_file}")
+
+    try:
+        issue_list = validate_dataset(args)
+    except Exception as e:
+        logger.exception(f"Validation failed with exception: {e}")
+        raise
+    finally:
+        # Detach and close our handlers so the log file is flushed and released
+        for handler in handlers:
+            root_logger.removeHandler(handler)
+            handler.close()
 
     # Return 1 if there are issues, 0 otherwise
     return int(bool(issue_list))
@@ -49,15 +119,31 @@ def validate_dataset(args):
     from hed.tools import BidsDataset
     from hed import _version as vr
 
-    if args.verbose:
-        print(f"Data directory: {args.data_path}")
+    logger = logging.getLogger('validate_bids')  # same logger name that main() uses
+    logger.info(f"Data directory: {args.data_path}")
+    logger.info(f"HED tools version: {str(vr.get_versions())}")
+    logger.debug(f"Exclude directories: {args.exclude_dirs}")
+    logger.debug(f"File suffixes: {args.suffixes}")
+    logger.debug(f"Check for warnings: {args.check_for_warnings}")
 
     if args.suffixes == ['*'] or args.suffixes == []:
         args.suffixes = None
+        logger.debug("Using all available suffixes")
 
     # Validate the dataset
-    bids = BidsDataset(args.data_path, suffixes=args.suffixes, exclude_dirs=args.exclude_dirs)
-    issue_list = bids.validate(check_for_warnings=args.check_for_warnings)
+    try:
+        logger.info("Creating BIDS dataset object...")
+        bids = BidsDataset(args.data_path, suffixes=args.suffixes, exclude_dirs=args.exclude_dirs)
+        logger.info(f"BIDS dataset created with schema versions: {bids.schema.get_schema_versions() if bids.schema else 'None'}")
+        logger.info(f"Found file groups: {list(bids.file_groups.keys())}")
+
+        logger.info("Starting validation...")
+        issue_list = bids.validate(check_for_warnings=args.check_for_warnings)
+        logger.info(f"Validation completed. Found {len(issue_list)} issues")
+    except Exception as e:  # NOTE(review): main() logs this failure again via logger.exception
+        logger.error(f"Error during dataset validation: {e}")
+        logger.debug("Full exception details:", exc_info=True)  # traceback only at DEBUG level
+        raise  # propagate so the caller still sees the failure
 
     # Output based on format
     output = ""

diff --git a/hed/tools/bids/bids_dataset.py b/hed/tools/bids/bids_dataset.py
@@ -1,6 +1,7 @@
 """ The contents of a BIDS dataset. """
 
 import os
+import logging
 from hed.schema.hed_schema import HedSchema
 from hed.schema.hed_schema_group import HedSchemaGroup
 from hed.tools.bids.bids_file_group import BidsFileGroup
@@ -30,16 +31,32 @@ def __init__(self, root_path, schema=None, suffixes=['events', 'participants'],
             exclude_dirs=['sourcedata', 'derivatives', 'code', 'phenotype']:
 
         """
+        logger = logging.getLogger('hed.bids_dataset')
+        logger.debug(f"Initializing BidsDataset for path: {root_path}")
+
         self.root_path = os.path.realpath(root_path)
+        logger.debug(f"Real root path resolved to: {self.root_path}")
+        # A caller-supplied schema takes precedence over the dataset description.
         if schema:
             self.schema = schema
+            logger.debug(f"Using provided schema: {schema.get_schema_versions() if hasattr(schema, 'get_schema_versions') else 'custom'}")
         else:
+            logger.debug("Loading schema from dataset description...")
             self.schema = bids_util.get_schema_from_description(self.root_path)
+            if self.schema:
+                logger.info(f"Loaded schema from dataset: {self.schema.get_schema_versions()}")
+            else:
+                logger.warning("No valid schema found in dataset description")
 
         self.exclude_dirs = exclude_dirs
         self.suffixes = suffixes
+        logger.debug(f"Using suffixes: {suffixes}, excluding directories: {exclude_dirs}")
+        # NOTE: _set_file_groups() also resets self.suffixes to the suffixes it actually found.
+        logger.info("Setting up file groups...")
         self.file_groups = self._set_file_groups()
         self.bad_files = []
+
+        logger.info(f"BidsDataset initialized with {len(self.file_groups)} file groups: {list(self.file_groups.keys())}")
 
     def get_file_group(self, suffix):
         """ Return the file group of files with the specified suffix.
@@ -64,17 +81,32 @@ def validate(self, check_for_warnings=False, schema=None):
             list:  List of issues encountered during validation. Each issue is a dictionary.
 
         """
+        logger = logging.getLogger('hed.bids_dataset')
+        logger.info(f"Starting validation of {len(self.file_groups)} file groups")
+        logger.debug(f"Check for warnings: {check_for_warnings}")
+        # Schema precedence: the explicit argument first, then the dataset's own schema.
         issues = []
         if schema:
             this_schema = schema
+            logger.debug("Using provided schema for validation")
         elif self.schema:
             this_schema = self.schema
+            logger.debug(f"Using dataset schema for validation: {this_schema.get_schema_versions()}")
         else:
+            logger.error("No valid schema available for validation")  # returned as an issue, not raised
             return [{"code": "SCHEMA_LOAD_FAILED",
-                     "message": "BIDS dataset_description.json has invalid HEDVersion and passed schema was invalid}"}]
+                     "message": "BIDS dataset_description.json has invalid HEDVersion and passed schema was invalid"}]
+
         for suffix, group in self.file_groups.items():
             if group.has_hed:
-                issues += group.validate(this_schema, check_for_warnings=check_for_warnings)
+                logger.info(f"Validating file group: {suffix} ({len(group.datafile_dict)} files)")
+                group_issues = group.validate(this_schema, check_for_warnings=check_for_warnings)
+                logger.info(f"File group {suffix} validation completed: {len(group_issues)} issues found")
+                issues += group_issues
+            else:
+                logger.debug(f"Skipping file group {suffix} - no HED content")
+
+        logger.info(f"Dataset validation completed: {len(issues)} total issues found")
         return issues
 
     def get_summary(self):
@@ -85,15 +117,26 @@ def get_summary(self):
         return summary
 
     def _set_file_groups(self):
+        logger = logging.getLogger('hed.bids_dataset')
+        logger.debug(f"Searching for files with extensions ['.tsv', '.json'] and suffixes {self.suffixes}")
+        # Gather candidate .tsv/.json files under root_path, honoring exclude_dirs and the suffix filter.
         file_paths = io_util.get_file_list(self.root_path, extensions=['.tsv', '.json'],
                                            exclude_dirs=self.exclude_dirs, name_suffix=self.suffixes)
+        logger.debug(f"Found {len(file_paths)} files matching criteria")
+
         file_dict = bids_util.group_by_suffix(file_paths)
+        logger.debug(f"Files grouped by suffix: {[(suffix, len(files)) for suffix, files in file_dict.items()]}")
 
         file_groups = {}
         for suffix, files in file_dict.items():
+            logger.debug(f"Creating file group for suffix '{suffix}' with {len(files)} files")
             file_group = BidsFileGroup.create_file_group(self.root_path, files, suffix)
             if file_group:
                 file_groups[suffix] = file_group
+                logger.debug(f"Successfully created file group for '{suffix}'")
+            else:
+                logger.warning(f"Failed to create file group for suffix '{suffix}'")  # falsy result from create_file_group
 
         self.suffixes = list(file_groups.keys())
+        logger.info(f"Created {len(file_groups)} file groups: {list(file_groups.keys())}")
         return file_groups
diff --git a/hed/tools/bids/bids_file_group.py b/hed/tools/bids/bids_file_group.py
@@ -1,6 +1,7 @@
 """ A group of BIDS files with specified suffix name. """
 
 import os
+import logging
 import pandas as pd
 
 from hed.errors.error_reporter import ErrorHandler
@@ -30,17 +31,31 @@ def __init__(self, root_path, file_list, suffix="events"):
             file_list (list):  List of paths to the relevant tsv and json files.
             suffix (str):     Suffix indicating the type this group represents (e.g. events, or channels, etc.).
         """
-
+        logger = logging.getLogger('hed.bids_file_group')
+        logger.debug(f"Creating BidsFileGroup for suffix '{suffix}' with {len(file_list)} files")
+
         self.suffix = suffix
         ext_dict = io_util.separate_by_ext(file_list)
+        logger.debug(f"Files by extension: .json={len(ext_dict.get('.json', []))}, .tsv={len(ext_dict.get('.tsv', []))}")
+
         self.bad_files = {}
         self.sidecar_dict = {}
         self.sidecar_dir_dict = {}
         self.datafile_dict = {}
         self.has_hed = False
+        # NOTE(review): the _make_* helpers below presumably fill the containers above and set has_hed - confirm.
+        logger.debug(f"Processing {len(ext_dict.get('.json', []))} JSON sidecar files...")
         self._make_sidecar_dict(ext_dict.get('.json', []))
+
+        logger.debug("Creating directory mapping...")
         self._make_dir_dict(root_path)
+
+        logger.debug(f"Processing {len(ext_dict.get('.tsv', []))} TSV data files...")
         self._make_datafile_dict(root_path, ext_dict.get('.tsv', []))
+
+        logger.info(f"BidsFileGroup '{suffix}' created: {len(self.sidecar_dict)} sidecars, {len(self.datafile_dict)} data files, has_hed={self.has_hed}")
+        if self.bad_files:
+            logger.warning(f"Found {len(self.bad_files)} bad files in group '{suffix}'")
 
     def summarize(self, value_cols=None, skip_cols=None):
         """ Return a BidsTabularSummary of group files.
@@ -72,11 +87,23 @@ def validate(self, hed_schema, extra_def_dicts=None, check_for_warnings=False):
         Returns:
             list:  A list of validation issues found. Each issue is a dictionary.
         """
+        logger = logging.getLogger('hed.bids_file_group')
+        logger.info(f"Starting validation of file group '{self.suffix}' (sidecars: {len(self.sidecar_dict)}, data files: {len(self.datafile_dict)})")
+        # A single ErrorHandler is shared by the sidecar pass and the data-file pass.
         error_handler = ErrorHandler(check_for_warnings)
         issues = []
-        issues += self.validate_sidecars(hed_schema, extra_def_dicts=extra_def_dicts,  error_handler=error_handler)
-        issues += self.validate_datafiles(hed_schema, extra_def_dicts=extra_def_dicts,
-                                          error_handler=error_handler)
+
+        logger.debug(f"Validating {len(self.sidecar_dict)} sidecars...")
+        sidecar_issues = self.validate_sidecars(hed_schema, extra_def_dicts=extra_def_dicts,  error_handler=error_handler)
+        logger.info(f"Sidecar validation completed: {len(sidecar_issues)} issues found")
+        issues += sidecar_issues
+        # NOTE(review): the HED-enabled count below is built only for this message, even when DEBUG is off.
+        logger.debug(f"Validating {len([f for f in self.datafile_dict.values() if f.has_hed])} HED-enabled data files...")
+        datafile_issues = self.validate_datafiles(hed_schema, extra_def_dicts=extra_def_dicts, error_handler=error_handler)
+        logger.info(f"Data file validation completed: {len(datafile_issues)} issues found")
+        issues += datafile_issues
+
+        logger.info(f"File group '{self.suffix}' validation completed: {len(issues)} total issues")
         return issues
 
     def validate_sidecars(self, hed_schema, extra_def_dicts=None, error_handler=None):
@@ -113,19 +140,31 @@ def validate_datafiles(self, hed_schema, extra_def_dicts=None, error_handler=Non
 
         Notes: This will clear the contents of the datafiles if they were not previously set.
         """
-
+        logger = logging.getLogger('hed.bids_file_group')
+
         if not error_handler:
             error_handler = ErrorHandler(False)
         issues = []
-        for data_obj in self.datafile_dict.values():
-            if not data_obj.has_hed:
-                continue
+        # Pre-filter to files with HED (replaces the former in-loop `continue`) so progress reads i/N.
+        hed_files = [f for f in self.datafile_dict.values() if f.has_hed]
+        logger.debug(f"Processing {len(hed_files)} out of {len(self.datafile_dict)} data files with HED annotations")
+
+        for i, data_obj in enumerate(hed_files, 1):
+            logger.debug(f"Validating data file {i}/{len(hed_files)}: {os.path.basename(data_obj.file_path)}")
+            # Remember whether contents were already loaded; only contents loaded here are cleared below.
             had_contents = data_obj.contents
             data_obj.set_contents(overwrite=False)
-            issues += data_obj.contents.validate(hed_schema, extra_def_dicts=extra_def_dicts, name=data_obj.file_path,
-                                                 error_handler=error_handler)
+            file_issues = data_obj.contents.validate(hed_schema, extra_def_dicts=extra_def_dicts, name=data_obj.file_path,
+                                                     error_handler=error_handler)
+
+            if file_issues:
+                logger.debug(f"File {os.path.basename(data_obj.file_path)}: {len(file_issues)} issues found")
+            issues += file_issues
+
             if not had_contents:
                 data_obj.clear_contents()
+
+        logger.debug(f"Data file validation completed: {len(issues)} total issues from {len(hed_files)} files")
         return issues
 
     def _make_dir_dict(self, root_path):
@@ -254,7 +293,14 @@ def _make_sidecar_dict(self, json_files):
 
     @staticmethod
     def create_file_group(root_path, file_list, suffix):
+        logger = logging.getLogger('hed.bids_file_group')
+        logger.debug(f"Creating file group for suffix '{suffix}' from {len(file_list)} files")
+
         file_group = BidsFileGroup(root_path, file_list, suffix=suffix)
+        # A group with neither sidecars nor data files is reported to the caller as None.
         if not file_group.sidecar_dict and not file_group.datafile_dict:
+            logger.debug(f"File group '{suffix}' is empty (no sidecars or data files), returning None")
             return None
+
+        logger.debug(f"File group '{suffix}' created successfully")
         return file_group