Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions src/mdio/converters/segy.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,40 @@ def _populate_coordinates(
return dataset, drop_vars_delayed


def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:
    """Store the SEG-Y text and binary headers in the dataset metadata attributes.

    The text header is split into lines and validated: it must contain exactly 40 lines
    and every line must be exactly 80 characters wide. The lines of any extended text
    headers are then added after it. The combined lines are stored under the
    ``textHeader`` key and the binary header dictionary under ``binaryHeader``.

    Args:
        dataset: Dataset whose ``metadata.attributes`` mapping is updated in place.
        segy_file: Source of ``text_header``, ``ext_text_header`` and ``binary_header``.

    Raises:
        ValueError: If the text header does not have 40 lines, or if any of its lines
            is not 80 characters long.
    """
    text_header = segy_file.text_header.splitlines()

    # Validate the main text header: 40 lines, each 80 characters wide.
    item_count = 40
    if len(text_header) != item_count:
        err = f"Invalid text header count: expected {item_count}, got {len(text_header)}"
        raise ValueError(err)
    char_count = 80
    for i, line in enumerate(text_header):
        if len(line) != char_count:
            err = f"Invalid text header {i} line length: expected {char_count}, got {len(line)}"
            raise ValueError(err)

    # Extended text headers are optional. Their lines are not width/count validated here.
    # NOTE(review): assumes each extended header is a single string -- confirm against SegyFile.
    ext_text_header = segy_file.ext_text_header
    if ext_text_header is not None:
        for ext_hdr in ext_text_header:
            # extend (not append) so "textHeader" stays a flat list of strings
            # instead of gaining nested lists.
            text_header.extend(ext_hdr.splitlines())

    # Handle the case where the dataset has no attributes yet. The mapping must be created
    # on dataset.metadata, because that is the object updated below.
    if dataset.metadata.attributes is None:
        dataset.metadata.attributes = {}

    # Update the attributes with the text and binary headers.
    dataset.metadata.attributes.update(
        {
            "textHeader": text_header,
            "binaryHeader": segy_file.binary_header.to_dict(),
        }
    )


def segy_to_mdio(
segy_spec: SegySpec,
mdio_template: AbstractDatasetTemplate,
Expand Down Expand Up @@ -324,6 +358,8 @@ def segy_to_mdio(
name=mdio_template.name, sizes=shape, horizontal_coord_unit=horizontal_unit, headers=headers
)

_add_text_binary_headers(dataset=mdio_ds, segy_file=segy_file)

xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)

xr_dataset, drop_vars_delayed = _populate_coordinates(
Expand Down
44 changes: 24 additions & 20 deletions tests/integration/test_segy_import_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import xarray as xr
from segy import SegyFile
from segy.standards import get_segy_standard
from tests.integration.testing_data import binary_header_teapot_dome
from tests.integration.testing_data import text_header_teapot_dome
from tests.integration.testing_helpers import customize_segy_specs
from tests.integration.testing_helpers import get_inline_header_values
from tests.integration.testing_helpers import get_values
Expand Down Expand Up @@ -266,8 +268,8 @@ def test_3d_import(
segy_to_mdio(
segy_spec=segy_spec,
mdio_template=TemplateRegistry().get("PostStack3DTime"),
input_location=StorageLocation(segy_input.__str__()),
output_location=StorageLocation(zarr_tmp.__str__()),
input_location=StorageLocation(str(segy_input)),
output_location=StorageLocation(str(zarr_tmp)),
overwrite=True,
)

Expand All @@ -278,11 +280,9 @@ class TestReader:

def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
"""Metadata reading tests."""
path = zarr_tmp.__str__()
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
# NOTE: If mask_and_scale is not set,
# Xarray will convert int to float and replace _FillValue with NaN
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
expected_attrs = {
"apiVersion": "1.0.0a1",
"createdOn": "2025-08-06 16:21:54.747880+00:00",
Expand All @@ -297,13 +297,25 @@ def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
else:
assert actual_attrs_json[key] == value

attributes = ds.attrs["attributes"]
assert attributes is not None

# Validate attributes provided by the template
assert attributes["surveyDimensionality"] == "3D"
assert attributes["ensembleType"] == "line"
assert attributes["processingStage"] == "post-stack"

# Validate text header
assert attributes["textHeader"] == text_header_teapot_dome()

# Validate binary header
assert attributes["binaryHeader"] == binary_header_teapot_dome()

def test_meta_variable_read(self, zarr_tmp: Path) -> None:
"""Metadata reading tests."""
path = zarr_tmp.__str__()
# NOTE: If mask_and_scale is not set,
# Xarray will convert int to float and replace _FillValue with NaN
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
expected_attrs = {
"count": 97354860,
"sum": -8594.551666259766,
Expand All @@ -318,11 +330,9 @@ def test_meta_variable_read(self, zarr_tmp: Path) -> None:
def test_grid(self, zarr_tmp: Path) -> None:
"""Test validating MDIO variables."""
# Load Xarray dataset from the MDIO file
path = zarr_tmp.__str__()
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
# NOTE: If mask_and_scale is not set,
# Xarray will convert int to float and replace _FillValue with NaN
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)

# Note: in order to create the dataset we used the Time template, so the
# sample dimension is called "time"
Expand Down Expand Up @@ -366,34 +376,28 @@ def test_grid(self, zarr_tmp: Path) -> None:

def test_inline(self, zarr_tmp: Path) -> None:
"""Read and compare every 75 inlines' mean and std. dev."""
path = zarr_tmp.__str__()
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
# NOTE: If mask_and_scale is not set,
# Xarray will convert int to float and replace _FillValue with NaN
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
inlines = ds["amplitude"][::75, :, :]
mean, std = inlines.mean(), inlines.std()
npt.assert_allclose([mean, std], [1.0555277e-04, 6.0027051e-01])

def test_crossline(self, zarr_tmp: Path) -> None:
"""Read and compare every 75 crosslines' mean and std. dev."""
path = zarr_tmp.__str__()
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
# NOTE: If mask_and_scale is not set,
# Xarray will convert int to float and replace _FillValue with NaN
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
xlines = ds["amplitude"][:, ::75, :]
mean, std = xlines.mean(), xlines.std()

npt.assert_allclose([mean, std], [-5.0329847e-05, 5.9406823e-01])

def test_zslice(self, zarr_tmp: Path) -> None:
"""Read and compare every 225 z-slices' mean and std. dev."""
path = zarr_tmp.__str__()
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
# NOTE: If mask_and_scale is not set,
# Xarray will convert int to float and replace _FillValue with NaN
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
slices = ds["amplitude"][:, :, ::225]
mean, std = slices.mean(), slices.std()
npt.assert_allclose([mean, std], [0.005236923, 0.61279935])
Expand Down
84 changes: 84 additions & 0 deletions tests/integration/testing_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Integration tests data for teapot dome SEG-Y."""


def text_header_teapot_dome() -> list[str]:
    """Build the expected teapot dome text header.

    Returns:
        A new list with the 40 expected header lines, ``C 1`` through ``C40``, in order.
    """
    # NOTE: the strings are compared verbatim by the integration test; the spelling
    # "LENGHT" on line C20 is kept as-is on purpose (presumably mirrors the file content).
    expected_lines = [
        # C 1 - C 6: client / project description
        "C 1 CLIENT: ROCKY MOUNTAIN OILFIELD TESTING CENTER ",
        "C 2 PROJECT: NAVAL PETROLEUM RESERVE #3 (TEAPOT DOME); NATRONA COUNTY, WYOMING ",
        "C 3 LINE: 3D ",
        "C 4 ",
        "C 5 THIS IS THE FILTERED POST STACK MIGRATION ",
        "C 6 ",
        # C 7 - C12: survey geometry
        "C 7 INLINE 1, XLINE 1: X COORDINATE: 788937 Y COORDINATE: 938845 ",
        "C 8 INLINE 1, XLINE 188: X COORDINATE: 809501 Y COORDINATE: 939333 ",
        "C 9 INLINE 188, XLINE 1: X COORDINATE: 788039 Y COORDINATE: 976674 ",
        "C10 INLINE NUMBER: MIN: 1 MAX: 345 TOTAL: 345 ",
        "C11 CROSSLINE NUMBER: MIN: 1 MAX: 188 TOTAL: 188 ",
        "C12 TOTAL NUMBER OF CDPS: 64860 BIN DIMENSION: 110' X 110' ",
        "C13 ",
        "C14 ",
        "C15 ",
        "C16 ",
        "C17 ",
        "C18 ",
        # C19 - C30: general SEG-Y information and byte locations
        "C19 GENERAL SEGY INFORMATION ",
        "C20 RECORD LENGHT (MS): 3000 ",
        "C21 SAMPLE RATE (MS): 2.0 ",
        "C22 DATA FORMAT: 4 BYTE IBM FLOATING POINT ",
        "C23 BYTES 13- 16: CROSSLINE NUMBER (TRACE) ",
        "C24 BYTES 17- 20: INLINE NUMBER (LINE) ",
        "C25 BYTES 81- 84: CDP_X COORD ",
        "C26 BYTES 85- 88: CDP_Y COORD ",
        "C27 BYTES 181-184: INLINE NUMBER (LINE) ",
        "C28 BYTES 185-188: CROSSLINE NUMBER (TRACE) ",
        "C29 BYTES 189-192: CDP_X COORD ",
        "C30 BYTES 193-196: CDP_Y COORD ",
        "C31 ",
        "C32 ",
        "C33 ",
        "C34 ",
        "C35 ",
        # C36 - C40: processing contractor and end marker
        "C36 Processed by: Excel Geophysical Services, Inc. ",
        "C37 8301 East Prentice Ave. Ste. 402 ",
        "C38 Englewood, Colorado 80111 ",
        "C39 (voice) 303.694.9629 (fax) 303.771.1646 ",
        "C40 END EBCDIC ",
    ]
    return expected_lines


def binary_header_teapot_dome() -> dict[str, int]:
    """Build the expected teapot dome binary header.

    Returns:
        A new dictionary mapping each binary header field name to its expected integer value.
    """
    expected_fields = {
        # Identification
        "job_id": 9999,
        "line_num": 9999,
        "reel_num": 1,
        # Trace counts and sampling
        "data_traces_per_ensemble": 188,
        "aux_traces_per_ensemble": 0,
        "sample_interval": 2000,
        "orig_sample_interval": 0,
        "samples_per_trace": 1501,
        "orig_samples_per_trace": 1501,
        "data_sample_format": 1,
        "ensemble_fold": 57,
        "trace_sorting_code": 4,
        "vertical_sum_code": 1,
        # Sweep and taper fields
        "sweep_freq_start": 0,
        "sweep_freq_end": 0,
        "sweep_length": 0,
        "sweep_type_code": 0,
        "sweep_trace_num": 0,
        "sweep_taper_start": 0,
        "sweep_taper_end": 0,
        "taper_type_code": 0,
        # Remaining codes and flags
        "correlated_data_code": 2,
        "binary_gain_code": 1,
        "amp_recovery_code": 4,
        "measurement_system_code": 2,
        "impulse_polarity_code": 1,
        "vibratory_polarity_code": 0,
        "fixed_length_trace_flag": 0,
        "num_extended_text_headers": 0,
        "segy_revision_major": 0,
        "segy_revision_minor": 0,
    }
    return expected_fields
Loading