Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci_windows.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ jobs:
env:
HED_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m unittest
python -m unittest discover tests
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@


#################
## Eclipse
#################
Expand Down Expand Up @@ -110,8 +112,12 @@ site/
# Pycharm
.idea/
/venv/
/.venv/
config.py

# VS Code
.vscode/

############
## Windows
############
Expand All @@ -128,3 +134,6 @@ hed_cache/
spec_tests/hed-specification/tests
spec_tests/hed-examples
spec_tests/*.json

# GitHub Copilot instructions (project-specific)
.github/copilot-instructions.md
98 changes: 92 additions & 6 deletions hed/scripts/validate_bids.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
#!/usr/bin/env python3
"""
Command-line script for validating BIDS datasets with HED annotations.

Logging Options:
- Default: WARNING level logs go to stderr (quiet unless there are issues)
- --verbose or --log-level INFO: Show informational messages about progress
- --log-level DEBUG: Show detailed debugging information
- --log-file FILE: Save logs to a file, in addition to stderr unless --log-quiet is also given
- --log-quiet: When using --log-file, suppress stderr output (file only)

Examples:
validate_bids /path/to/dataset # Quiet validation
validate_bids /path/to/dataset --verbose # Show progress
validate_bids /path/to/dataset --log-level DEBUG # Detailed debugging
validate_bids /path/to/dataset --log-file log.txt --log-quiet # Log to file only
"""

import argparse
import json
import logging
import sys

def get_parser():
Expand All @@ -21,8 +40,15 @@ def get_parser():
help = "Optional list of suffixes (no under_bar) of tsv files to validate." +
" If -s with no values, will use all possible suffixes as with single argument '*'.")

parser.add_argument("-l", "--log-level",
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
default="WARNING", help="Log level (case insensitive). Default: WARNING")
parser.add_argument("-lf", "--log-file", dest="log_file", default=None,
help="Full path to save log output to file. If not specified, logs go to stderr.")
parser.add_argument("-lq", "--log-quiet", action='store_true', dest="log_quiet",
help="If present, suppress log output to stderr (only applies if --log-file is used).")
parser.add_argument("-v", "--verbose", action='store_true',
help="If present, output informative messages as computation progresses.")
help="If present, output informative messages as computation progresses (equivalent to --log-level INFO).")
parser.add_argument("-w", "--check_for_warnings", action='store_true', dest="check_for_warnings",
help="If present, check for warnings as well as errors.")
parser.add_argument("-x", "--exclude-dirs", nargs="*", default=['sourcedata', 'derivatives', 'code', 'stimuli'],
Expand All @@ -37,7 +63,51 @@ def main(arg_list=None):

# Parse the arguments
args = parser.parse_args(arg_list)
issue_list = validate_dataset(args)

# Setup logging configuration
log_level = args.log_level.upper() if args.log_level else 'WARNING'
if args.verbose:
log_level = 'INFO'

# Configure logging format
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
date_format = '%Y-%m-%d %H:%M:%S'

# Clear any existing handlers from root logger
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)

# Set the root logger level - this is crucial for filtering
root_logger.setLevel(getattr(logging, log_level))

# Create and configure handlers
formatter = logging.Formatter(log_format, datefmt=date_format)

# File handler if log file specified
if args.log_file:
file_handler = logging.FileHandler(args.log_file, mode='w', encoding='utf-8')
file_handler.setLevel(getattr(logging, log_level))
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)

# Console handler (stderr) unless explicitly quieted and file logging is used
if not args.log_quiet or not args.log_file:
console_handler = logging.StreamHandler(sys.stderr)
console_handler.setLevel(getattr(logging, log_level))
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)

logger = logging.getLogger('validate_bids')
logger.info(f"Starting BIDS validation with log level: {log_level}")
if args.log_file:
logger.info(f"Log output will be saved to: {args.log_file}")

try:
issue_list = validate_dataset(args)
except Exception as e:
logger.exception(f"Validation failed with exception: {e}")
raise

# Return 1 if there are issues, 0 otherwise
return int(bool(issue_list))
Expand All @@ -49,15 +119,31 @@ def validate_dataset(args):
from hed.tools import BidsDataset
from hed import _version as vr

if args.verbose:
print(f"Data directory: {args.data_path}")
logger = logging.getLogger('validate_bids')
logger.info(f"Data directory: {args.data_path}")
logger.info(f"HED tools version: {str(vr.get_versions())}")
logger.debug(f"Exclude directories: {args.exclude_dirs}")
logger.debug(f"File suffixes: {args.suffixes}")
logger.debug(f"Check for warnings: {args.check_for_warnings}")

if args.suffixes == ['*'] or args.suffixes == []:
args.suffixes = None
logger.debug("Using all available suffixes")

# Validate the dataset
bids = BidsDataset(args.data_path, suffixes=args.suffixes, exclude_dirs=args.exclude_dirs)
issue_list = bids.validate(check_for_warnings=args.check_for_warnings)
try:
logger.info("Creating BIDS dataset object...")
bids = BidsDataset(args.data_path, suffixes=args.suffixes, exclude_dirs=args.exclude_dirs)
logger.info(f"BIDS dataset created with schema versions: {bids.schema.get_schema_versions() if bids.schema else 'None'}")
logger.info(f"Found file groups: {list(bids.file_groups.keys())}")

logger.info("Starting validation...")
issue_list = bids.validate(check_for_warnings=args.check_for_warnings)
logger.info(f"Validation completed. Found {len(issue_list)} issues")
except Exception as e:
logger.error(f"Error during dataset validation: {e}")
logger.debug("Full exception details:", exc_info=True)
raise

# Output based on format
output = ""
Expand Down
45 changes: 44 additions & 1 deletion hed/tools/bids/bids_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" The contents of a BIDS dataset. """

import os
import logging
from hed.schema.hed_schema import HedSchema
from hed.schema.hed_schema_group import HedSchemaGroup
from hed.tools.bids.bids_file_group import BidsFileGroup
Expand Down Expand Up @@ -30,16 +31,32 @@ def __init__(self, root_path, schema=None, suffixes=['events', 'participants'],
exclude_dirs=['sourcedata', 'derivatives', 'code', 'phenotype']:

"""
logger = logging.getLogger('hed.bids_dataset')
logger.debug(f"Initializing BidsDataset for path: {root_path}")

self.root_path = os.path.realpath(root_path)
logger.debug(f"Real root path resolved to: {self.root_path}")

if schema:
self.schema = schema
logger.debug(f"Using provided schema: {schema.get_schema_versions() if hasattr(schema, 'get_schema_versions') else 'custom'}")
else:
logger.debug("Loading schema from dataset description...")
self.schema = bids_util.get_schema_from_description(self.root_path)
if self.schema:
logger.info(f"Loaded schema from dataset: {self.schema.get_schema_versions()}")
else:
logger.warning("No valid schema found in dataset description")

self.exclude_dirs = exclude_dirs
self.suffixes = suffixes
logger.debug(f"Using suffixes: {suffixes}, excluding directories: {exclude_dirs}")

logger.info("Setting up file groups...")
self.file_groups = self._set_file_groups()
self.bad_files = []

logger.info(f"BidsDataset initialized with {len(self.file_groups)} file groups: {list(self.file_groups.keys())}")

def get_file_group(self, suffix):
""" Return the file group of files with the specified suffix.
Expand All @@ -64,17 +81,32 @@ def validate(self, check_for_warnings=False, schema=None):
list: List of issues encountered during validation. Each issue is a dictionary.

"""
logger = logging.getLogger('hed.bids_dataset')
logger.info(f"Starting validation of {len(self.file_groups)} file groups")
logger.debug(f"Check for warnings: {check_for_warnings}")

issues = []
if schema:
this_schema = schema
logger.debug("Using provided schema for validation")
elif self.schema:
this_schema = self.schema
logger.debug(f"Using dataset schema for validation: {this_schema.get_schema_versions()}")
else:
logger.error("No valid schema available for validation")
return [{"code": "SCHEMA_LOAD_FAILED",
"message": "BIDS dataset_description.json has invalid HEDVersion and passed schema was invalid}"}]

for suffix, group in self.file_groups.items():
if group.has_hed:
issues += group.validate(this_schema, check_for_warnings=check_for_warnings)
logger.info(f"Validating file group: {suffix} ({len(group.datafile_dict)} files)")
group_issues = group.validate(this_schema, check_for_warnings=check_for_warnings)
logger.info(f"File group {suffix} validation completed: {len(group_issues)} issues found")
issues += group_issues
else:
logger.debug(f"Skipping file group {suffix} - no HED content")

logger.info(f"Dataset validation completed: {len(issues)} total issues found")
return issues

def get_summary(self):
Expand All @@ -85,15 +117,26 @@ def get_summary(self):
return summary

def _set_file_groups(self):
    """ Build the suffix-to-file-group mapping for this dataset.

    Collects the '.tsv' and '.json' files under self.root_path (honoring self.exclude_dirs
    and self.suffixes), groups them by suffix, and creates one file group per suffix.
    Suffixes whose group comes back falsy are logged as warnings and left out.

    Returns:
        dict: Maps each suffix to the file group created for it.

    Notes: Overwrites self.suffixes with the suffixes that actually produced a file group.

    """
    log = logging.getLogger('hed.bids_dataset')
    log.debug(f"Searching for files with extensions ['.tsv', '.json'] and suffixes {self.suffixes}")

    candidates = io_util.get_file_list(self.root_path, extensions=['.tsv', '.json'],
                                       exclude_dirs=self.exclude_dirs, name_suffix=self.suffixes)
    log.debug(f"Found {len(candidates)} files matching criteria")

    by_suffix = bids_util.group_by_suffix(candidates)
    log.debug(f"Files grouped by suffix: {[(key, len(members)) for key, members in by_suffix.items()]}")

    groups = {}
    for key, members in by_suffix.items():
        log.debug(f"Creating file group for suffix '{key}' with {len(members)} files")
        created = BidsFileGroup.create_file_group(self.root_path, members, key)
        if not created:
            # Nothing usable for this suffix; report it and move on.
            log.warning(f"Failed to create file group for suffix '{key}'")
            continue
        groups[key] = created
        log.debug(f"Successfully created file group for '{key}'")

    # Narrow the recorded suffixes to those that actually yielded a group.
    self.suffixes = list(groups)
    log.info(f"Created {len(groups)} file groups: {list(groups)}")
    return groups
66 changes: 56 additions & 10 deletions hed/tools/bids/bids_file_group.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" A group of BIDS files with specified suffix name. """

import os
import logging
import pandas as pd

from hed.errors.error_reporter import ErrorHandler
Expand Down Expand Up @@ -30,17 +31,31 @@ def __init__(self, root_path, file_list, suffix="events"):
file_list (list): List of paths to the relevant tsv and json files.
suffix (str): Suffix indicating the type this group represents (e.g. events, or channels, etc.).
"""

logger = logging.getLogger('hed.bids_file_group')
logger.debug(f"Creating BidsFileGroup for suffix '{suffix}' with {len(file_list)} files")

self.suffix = suffix
ext_dict = io_util.separate_by_ext(file_list)
logger.debug(f"Files by extension: .json={len(ext_dict.get('.json', []))}, .tsv={len(ext_dict.get('.tsv', []))}")

self.bad_files = {}
self.sidecar_dict = {}
self.sidecar_dir_dict = {}
self.datafile_dict = {}
self.has_hed = False

logger.debug(f"Processing {len(ext_dict.get('.json', []))} JSON sidecar files...")
self._make_sidecar_dict(ext_dict.get('.json', []))

logger.debug("Creating directory mapping...")
self._make_dir_dict(root_path)

logger.debug(f"Processing {len(ext_dict.get('.tsv', []))} TSV data files...")
self._make_datafile_dict(root_path, ext_dict.get('.tsv', []))

logger.info(f"BidsFileGroup '{suffix}' created: {len(self.sidecar_dict)} sidecars, {len(self.datafile_dict)} data files, has_hed={self.has_hed}")
if self.bad_files:
logger.warning(f"Found {len(self.bad_files)} bad files in group '{suffix}'")

def summarize(self, value_cols=None, skip_cols=None):
""" Return a BidsTabularSummary of group files.
Expand Down Expand Up @@ -72,11 +87,23 @@ def validate(self, hed_schema, extra_def_dicts=None, check_for_warnings=False):
Returns:
list: A list of validation issues found. Each issue is a dictionary.
"""
logger = logging.getLogger('hed.bids_file_group')
logger.info(f"Starting validation of file group '{self.suffix}' (sidecars: {len(self.sidecar_dict)}, data files: {len(self.datafile_dict)})")

error_handler = ErrorHandler(check_for_warnings)
issues = []
issues += self.validate_sidecars(hed_schema, extra_def_dicts=extra_def_dicts, error_handler=error_handler)
issues += self.validate_datafiles(hed_schema, extra_def_dicts=extra_def_dicts,
error_handler=error_handler)

logger.debug(f"Validating {len(self.sidecar_dict)} sidecars...")
sidecar_issues = self.validate_sidecars(hed_schema, extra_def_dicts=extra_def_dicts, error_handler=error_handler)
logger.info(f"Sidecar validation completed: {len(sidecar_issues)} issues found")
issues += sidecar_issues

logger.debug(f"Validating {len([f for f in self.datafile_dict.values() if f.has_hed])} HED-enabled data files...")
datafile_issues = self.validate_datafiles(hed_schema, extra_def_dicts=extra_def_dicts, error_handler=error_handler)
logger.info(f"Data file validation completed: {len(datafile_issues)} issues found")
issues += datafile_issues

logger.info(f"File group '{self.suffix}' validation completed: {len(issues)} total issues")
return issues

def validate_sidecars(self, hed_schema, extra_def_dicts=None, error_handler=None):
Expand Down Expand Up @@ -113,19 +140,31 @@ def validate_datafiles(self, hed_schema, extra_def_dicts=None, error_handler=Non

Notes: This will clear the contents of the datafiles if they were not previously set.
"""

logger = logging.getLogger('hed.bids_file_group')

if not error_handler:
error_handler = ErrorHandler(False)
issues = []
for data_obj in self.datafile_dict.values():
if not data_obj.has_hed:
continue

hed_files = [f for f in self.datafile_dict.values() if f.has_hed]
logger.debug(f"Processing {len(hed_files)} out of {len(self.datafile_dict)} data files with HED annotations")

for i, data_obj in enumerate(hed_files, 1):
logger.debug(f"Validating data file {i}/{len(hed_files)}: {os.path.basename(data_obj.file_path)}")

had_contents = data_obj.contents
data_obj.set_contents(overwrite=False)
issues += data_obj.contents.validate(hed_schema, extra_def_dicts=extra_def_dicts, name=data_obj.file_path,
error_handler=error_handler)
file_issues = data_obj.contents.validate(hed_schema, extra_def_dicts=extra_def_dicts, name=data_obj.file_path,
error_handler=error_handler)

if file_issues:
logger.debug(f"File {os.path.basename(data_obj.file_path)}: {len(file_issues)} issues found")
issues += file_issues

if not had_contents:
data_obj.clear_contents()

logger.debug(f"Data file validation completed: {len(issues)} total issues from {len(hed_files)} files")
return issues

def _make_dir_dict(self, root_path):
Expand Down Expand Up @@ -254,7 +293,14 @@ def _make_sidecar_dict(self, json_files):

@staticmethod
def create_file_group(root_path, file_list, suffix):
    """ Create a BidsFileGroup for a suffix, or None if the group turns out to be empty.

    Parameters:
        root_path (str): Root path passed through to the BidsFileGroup constructor.
        file_list (list): Paths of the files to place in the group.
        suffix (str): Suffix identifying the group.

    Returns:
        Union[BidsFileGroup, None]: The new group, or None when it has neither sidecars nor data files.

    """
    log = logging.getLogger('hed.bids_file_group')
    log.debug(f"Creating file group for suffix '{suffix}' from {len(file_list)} files")

    group = BidsFileGroup(root_path, file_list, suffix=suffix)

    # A group is worth keeping if it holds at least one sidecar or one data file.
    if group.sidecar_dict or group.datafile_dict:
        log.debug(f"File group '{suffix}' created successfully")
        return group

    log.debug(f"File group '{suffix}' is empty (no sidecars or data files), returning None")
    return None
Loading
Loading