diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
index 62c02fb5c..8756ddd09 100644
--- a/src/mdio/converters/segy.py
+++ b/src/mdio/converters/segy.py
@@ -280,6 +280,40 @@ def _populate_coordinates(
     return dataset, drop_vars_delayed
 
 
+def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:
+    text_header = segy_file.text_header.splitlines()
+    # Validate:
+    # text_header should be a list of 40 strings, each 80 characters wide.
+    item_count = 40
+    if len(text_header) != item_count:
+        err = f"Invalid text header count: expected {item_count}, got {len(text_header)}"
+        raise ValueError(err)
+    char_count = 80
+    for i, line in enumerate(text_header):
+        if len(line) != char_count:
+            err = f"Invalid text header line {i} length: expected {char_count}, got {len(line)}"
+            raise ValueError(err)
+    ext_text_header = segy_file.ext_text_header
+
+    # If using SegyFile.ext_text_header, it should have a minimum of 40 elements and must
+    # capture all textual information (text_header should be a subset of ext_text_header).
+    if ext_text_header is not None:
+        for ext_hdr in ext_text_header:
+            text_header.extend(ext_hdr.splitlines())
+
+    # Handle the case where the dataset does not have any attributes yet
+    if dataset.metadata.attributes is None:
+        dataset.attrs["attributes"] = {}
+
+    # Update the attributes with the text and binary headers.
+    dataset.metadata.attributes.update(
+        {
+            "textHeader": text_header,
+            "binaryHeader": segy_file.binary_header.to_dict(),
+        }
+    )
+
+
 def segy_to_mdio(
     segy_spec: SegySpec,
     mdio_template: AbstractDatasetTemplate,
@@ -324,6 +358,8 @@ def segy_to_mdio(
         name=mdio_template.name, sizes=shape, horizontal_coord_unit=horizontal_unit, headers=headers
     )
 
+    _add_text_binary_headers(dataset=mdio_ds, segy_file=segy_file)
+
     xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
 
     xr_dataset, drop_vars_delayed = _populate_coordinates(
diff --git a/tests/integration/test_segy_import_export.py b/tests/integration/test_segy_import_export.py
index 40e10b489..e80028df1 100644
--- a/tests/integration/test_segy_import_export.py
+++ b/tests/integration/test_segy_import_export.py
@@ -13,6 +13,8 @@ import xarray as xr
 from segy import SegyFile
 from segy.standards import get_segy_standard
 
+from tests.integration.testing_data import binary_header_teapot_dome
+from tests.integration.testing_data import text_header_teapot_dome
 from tests.integration.testing_helpers import customize_segy_specs
 from tests.integration.testing_helpers import get_inline_header_values
 from tests.integration.testing_helpers import get_values
@@ -266,8 +268,8 @@ def test_3d_import(
     segy_to_mdio(
         segy_spec=segy_spec,
         mdio_template=TemplateRegistry().get("PostStack3DTime"),
-        input_location=StorageLocation(segy_input.__str__()),
-        output_location=StorageLocation(zarr_tmp.__str__()),
+        input_location=StorageLocation(str(segy_input)),
+        output_location=StorageLocation(str(zarr_tmp)),
         overwrite=True,
     )
 
@@ -278,11 +280,9 @@ class TestReader:
 
     def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
         """Metadata reading tests."""
-        path = zarr_tmp.__str__()
-        # path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
         # NOTE: If mask_and_scale is not set,
         # Xarray will convert int to float and replace _FillValue with NaN
-        ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
+        ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
         expected_attrs = {
             "apiVersion": "1.0.0a1",
             "createdOn": "2025-08-06 16:21:54.747880+00:00",
@@ -297,13 +297,25 @@ def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
             else:
                 assert actual_attrs_json[key] == value
 
+        attributes = ds.attrs["attributes"]
+        assert attributes is not None
+
+        # Validate attributes provided by the template
+        assert attributes["surveyDimensionality"] == "3D"
+        assert attributes["ensembleType"] == "line"
+        assert attributes["processingStage"] == "post-stack"
+
+        # Validate text header
+        assert attributes["textHeader"] == text_header_teapot_dome()
+
+        # Validate binary header
+        assert attributes["binaryHeader"] == binary_header_teapot_dome()
+
     def test_meta_variable_read(self, zarr_tmp: Path) -> None:
         """Metadata reading tests."""
-        path = zarr_tmp.__str__()
         # NOTE: If mask_and_scale is not set,
         # Xarray will convert int to float and replace _FillValue with NaN
-        # path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
-        ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
+        ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
         expected_attrs = {
             "count": 97354860,
             "sum": -8594.551666259766,
@@ -318,11 +330,9 @@ def test_meta_variable_read(self, zarr_tmp: Path) -> None:
     def test_grid(self, zarr_tmp: Path) -> None:
         """Test validating MDIO variables."""
         # Load Xarray dataset from the MDIO file
-        path = zarr_tmp.__str__()
-        # path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
         # NOTE: If mask_and_scale is not set,
         # Xarray will convert int to float and replace _FillValue with NaN
-        ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
+        ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
 
         # Note: in order to create the dataset we used the Time template, so the
         # sample dimension is called "time"
@@ -366,22 +376,18 @@ def test_grid(self, zarr_tmp: Path) -> None:
 
     def test_inline(self, zarr_tmp: Path) -> None:
         """Read and compare every 75 inlines' mean and std. dev."""
-        path = zarr_tmp.__str__()
-        # path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
         # NOTE: If mask_and_scale is not set,
         # Xarray will convert int to float and replace _FillValue with NaN
-        ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
+        ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
         inlines = ds["amplitude"][::75, :, :]
         mean, std = inlines.mean(), inlines.std()
         npt.assert_allclose([mean, std], [1.0555277e-04, 6.0027051e-01])
 
     def test_crossline(self, zarr_tmp: Path) -> None:
         """Read and compare every 75 crosslines' mean and std. dev."""
-        path = zarr_tmp.__str__()
-        # path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
         # NOTE: If mask_and_scale is not set,
         # Xarray will convert int to float and replace _FillValue with NaN
-        ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
+        ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
         xlines = ds["amplitude"][:, ::75, :]
 
         mean, std = xlines.mean(), xlines.std()
@@ -389,11 +395,9 @@ def test_crossline(self, zarr_tmp: Path) -> None:
 
     def test_zslice(self, zarr_tmp: Path) -> None:
         """Read and compare every 225 z-slices' mean and std. dev."""
-        path = zarr_tmp.__str__()
-        # path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
         # NOTE: If mask_and_scale is not set,
         # Xarray will convert int to float and replace _FillValue with NaN
-        ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
+        ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
         slices = ds["amplitude"][:, :, ::225]
         mean, std = slices.mean(), slices.std()
         npt.assert_allclose([mean, std], [0.005236923, 0.61279935])
diff --git a/tests/integration/testing_data.py b/tests/integration/testing_data.py
new file mode 100644
index 000000000..3696a7cf1
--- /dev/null
+++ b/tests/integration/testing_data.py
@@ -0,0 +1,84 @@
+"""Integration test data for the Teapot Dome SEG-Y file."""
+
+
+def text_header_teapot_dome() -> list[str]:
+    """Return the expected Teapot Dome text header."""
+    return [
+        "C 1 CLIENT: ROCKY MOUNTAIN OILFIELD TESTING CENTER ",
+        "C 2 PROJECT: NAVAL PETROLEUM RESERVE #3 (TEAPOT DOME); NATRONA COUNTY, WYOMING ",
+        "C 3 LINE: 3D ",
+        "C 4 ",
+        "C 5 THIS IS THE FILTERED POST STACK MIGRATION ",
+        "C 6 ",
+        "C 7 INLINE 1, XLINE 1: X COORDINATE: 788937 Y COORDINATE: 938845 ",
+        "C 8 INLINE 1, XLINE 188: X COORDINATE: 809501 Y COORDINATE: 939333 ",
+        "C 9 INLINE 188, XLINE 1: X COORDINATE: 788039 Y COORDINATE: 976674 ",
+        "C10 INLINE NUMBER: MIN: 1 MAX: 345 TOTAL: 345 ",
+        "C11 CROSSLINE NUMBER: MIN: 1 MAX: 188 TOTAL: 188 ",
+        "C12 TOTAL NUMBER OF CDPS: 64860 BIN DIMENSION: 110' X 110' ",
+        "C13 ",
+        "C14 ",
+        "C15 ",
+        "C16 ",
+        "C17 ",
+        "C18 ",
+        "C19 GENERAL SEGY INFORMATION ",
+        "C20 RECORD LENGHT (MS): 3000 ",
+        "C21 SAMPLE RATE (MS): 2.0 ",
+        "C22 DATA FORMAT: 4 BYTE IBM FLOATING POINT ",
+        "C23 BYTES 13- 16: CROSSLINE NUMBER (TRACE) ",
+        "C24 BYTES 17- 20: INLINE NUMBER (LINE) ",
+        "C25 BYTES 81- 84: CDP_X COORD ",
+        "C26 BYTES 85- 88: CDP_Y COORD ",
+        "C27 BYTES 181-184: INLINE NUMBER (LINE) ",
+        "C28 BYTES 185-188: CROSSLINE NUMBER (TRACE) ",
+        "C29 BYTES 189-192: CDP_X COORD ",
+        "C30 BYTES 193-196: CDP_Y COORD ",
+        "C31 ",
+        "C32 ",
+        "C33 ",
+        "C34 ",
+        "C35 ",
+        "C36 Processed by: Excel Geophysical Services, Inc. ",
+        "C37 8301 East Prentice Ave. Ste. 402 ",
+        "C38 Englewood, Colorado 80111 ",
+        "C39 (voice) 303.694.9629 (fax) 303.771.1646 ",
+        "C40 END EBCDIC ",
+    ]
+
+
+def binary_header_teapot_dome() -> dict[str, int]:
+    """Return the expected Teapot Dome binary header."""
+    return {
+        "job_id": 9999,
+        "line_num": 9999,
+        "reel_num": 1,
+        "data_traces_per_ensemble": 188,
+        "aux_traces_per_ensemble": 0,
+        "sample_interval": 2000,
+        "orig_sample_interval": 0,
+        "samples_per_trace": 1501,
+        "orig_samples_per_trace": 1501,
+        "data_sample_format": 1,
+        "ensemble_fold": 57,
+        "trace_sorting_code": 4,
+        "vertical_sum_code": 1,
+        "sweep_freq_start": 0,
+        "sweep_freq_end": 0,
+        "sweep_length": 0,
+        "sweep_type_code": 0,
+        "sweep_trace_num": 0,
+        "sweep_taper_start": 0,
+        "sweep_taper_end": 0,
+        "taper_type_code": 0,
+        "correlated_data_code": 2,
+        "binary_gain_code": 1,
+        "amp_recovery_code": 4,
+        "measurement_system_code": 2,
+        "impulse_polarity_code": 1,
+        "vibratory_polarity_code": 0,
+        "fixed_length_trace_flag": 0,
+        "num_extended_text_headers": 0,
+        "segy_revision_major": 0,
+        "segy_revision_minor": 0,
+    }
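
For illustration, a minimal sketch of reading back the headers written by _add_text_binary_headers from a converted store. It mirrors the reads in the tests above; the store path "out.mdio" is a hypothetical placeholder for an output produced by segy_to_mdio.

    import xarray as xr

    # mask_and_scale=False keeps integer dtypes and _FillValue values intact, as in the tests.
    ds = xr.open_dataset("out.mdio", engine="zarr", mask_and_scale=False)

    attributes = ds.attrs["attributes"]
    text_header = attributes["textHeader"]      # card-image lines, 80 characters each
    binary_header = attributes["binaryHeader"]  # SEG-Y binary header fields as a dict

    print("\n".join(text_header[:3]))
    print("samples_per_trace:", binary_header["samples_per_trace"])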