Skip to content

Concrete Implementation of MDIO Seismic Templates #564

@tasansal

Description

@tasansal

Overview

To be used with the template registry in #563.

We need to define the pre-defined MDIO dataset types to be used in seismic dataset ingestion from SEG-Y. The MDIO v1 schema allows us to define this clearly in a type-safe manner. An initial (probably bad!) implementation with a limited set of types is shown here:

class MDIOSchemaType(Enum):
    """Identifiers for the pre-defined MDIO dataset templates.

    Values are assigned by ``auto()`` and therefore follow declaration
    order; do not reorder members if the numeric values matter to callers.
    """

    # --- 3D seismic: post-stack ---
    SEISMIC_3D_POST_STACK_TIME = auto()
    SEISMIC_3D_POST_STACK_DEPTH = auto()

    # --- 3D seismic: pre-stack CDP gathers ---
    SEISMIC_3D_PRE_STACK_CDP_TIME = auto()
    SEISMIC_3D_PRE_STACK_CDP_TIME_IRREGULAR = auto()
    SEISMIC_3D_PRE_STACK_CDP_DEPTH = auto()
    SEISMIC_3D_PRE_STACK_CDP_DEPTH_IRREGULAR = auto()

    # --- 3D seismic: shot gathers ---
    SEISMIC_3D_STREAMER_SHOT = auto()

    # --- 2D seismic: post-stack ---
    SEISMIC_2D_POST_STACK_TIME = auto()
    SEISMIC_2D_POST_STACK_DEPTH = auto()

    # --- 2D seismic: pre-stack CDP gathers ---
    SEISMIC_2D_PRE_STACK_CDP_TIME = auto()
    SEISMIC_2D_PRE_STACK_CDP_DEPTH = auto()

    # --- 2D seismic: shot gathers ---
    SEISMIC_2D_STREAMER_SHOT = auto()

    # --- Non-seismic ---
    WIND_WRF = auto()
def get_approx_chunks(
    shape: list[int],
    dtype: DTypeLike,
    limit: str = "4M",
) -> tuple[int, ...]:
    """Suggest chunk sizes for an array of ``shape`` that stay within ``limit``.

    Every dimension is marked ``"auto"`` and the decision is delegated to
    ``auto_chunks``. The array shape itself is handed over as
    ``previous_chunks``.

    Args:
        shape: Size of each array dimension.
        dtype: Element type; converted with ``np.dtype`` before use.
        limit: Size limit forwarded unchanged to ``auto_chunks``.

    Returns:
        Whatever ``auto_chunks`` returns for these arguments (annotated as
        one chunk size per dimension).
    """
    # NOTE(review): auto_chunks is presumably dask's helper — confirm import.
    every_dim_auto = tuple("auto" for _ in shape)
    element_type = np.dtype(dtype)
    return auto_chunks(
        chunks=every_dim_auto,
        shape=shape,
        limit=limit,
        dtype=element_type,
        previous_chunks=shape,
    )
class AbstractSeismic(ABC):
"""Abstract class for specific seismic schemas."""
_trace_domain: str = "unknown"
_sample_format: str = "float32"
_dim_names: list[str] = []
_chunks: list[int] = []
_coords: dict[str, tuple[str, tuple[int, ...], dict[str, str]]]
_dataset_attrs: dict[str, str]
_sample_compressor: dict[str, Any] | None = {"name": "blosc", "algorithm": "zstd"}
_meta_compressor: dict[str, Any] | None = {"name": "blosc"}
@classmethod
def create_dimension_coords(
cls: type[AbstractSeismic],
shape: list[int],
z_units: dict[str, str],
) -> list[Variable]:
"""Create schema for dimension coordinates."""
dim_coords = []
for dim_name, dim_size in zip(cls._dim_names, shape):
dim_builder = VariableBuilder()
dim_builder.set_name(dim_name)
dim_builder.set_format("uint16")
dim_builder.add_dimension({dim_name: dim_size})
dim_builder.set_compressor(cls._meta_compressor)
if dim_name in cls._trace_domain:
dim_builder.set_units(z_units)
dim_coord = dim_builder.build()
dim_coords.append(dim_coord)
return dim_coords
@classmethod
def create_seismic_variables( # noqa: PLR0913
cls: type[AbstractSeismic],
sample_format: ScalarType,
header_fields: dict[str, Any],
shape: list[int],
chunks: list[int],
sample_units: dict[str, str],
coord_names: list[str],
) -> tuple[Variable, Variable, Variable]:
"""Build seismic variables based on user input."""
mask_chunks = list(get_approx_chunks(shape[:-1], "bool", limit="8M"))
mask_builder = VariableBuilder()
mask_builder.set_name("trace_mask")
mask_builder.set_format("bool")
mask_builder.set_chunks(mask_chunks)
mask_builder.add_dimension(*cls._dim_names[:-1])
mask_builder.set_compressor(cls._meta_compressor)
sample_builder = VariableBuilder()
sample_builder.set_name("seismic")
sample_builder.set_format(sample_format)
sample_builder.set_chunks(chunks)
sample_builder.set_units(sample_units)
sample_builder.add_dimension(*cls._dim_names)
sample_builder.set_compressor(cls._sample_compressor)
sample_builder.add_coordinate("trace_mask")
header_builder = VariableBuilder()
header_builder.set_name("headers")
header_builder.set_format(header_fields)
header_builder.set_chunks(chunks[:-1])
header_builder.add_dimension(*cls._dim_names[:-1])
header_builder.set_compressor(cls._meta_compressor)
header_builder.add_coordinate("trace_mask")
if coord_names is not None:
mask_builder.add_coordinate(*coord_names)
sample_builder.add_coordinate(*coord_names)
header_builder.add_coordinate(*coord_names)
trace_mask = mask_builder.build()
samples = sample_builder.build()
headers = header_builder.build()
return trace_mask, samples, headers
@classmethod
def create_seismic_coordinates(
cls: type[AbstractSeismic],
coords_dict: dict[str, tuple[str, dict[str, str], list[str]]],
shape: list[int],
) -> list[Variable]:
"""Build seismic coordinates based on user input."""
coord_vars = []
for name, (format_, unit, coord_dims) in coords_dict.items():
dim_indices = [cls._dim_names.index(dim) for dim in coord_dims]
coord_shape = [shape[idx] for idx in dim_indices]
coord_chunks = list(get_approx_chunks(coord_shape, format_, limit="8M"))
coord_builder = VariableBuilder()
coord_builder.set_name(name)
coord_builder.set_format(format_)
coord_builder.set_chunks(coord_chunks)
coord_builder.set_units(unit)
coord_builder.add_dimension(*coord_dims)
coord_builder.set_compressor(cls._meta_compressor)
coord_vars.append(coord_builder.build())
return coord_vars
@classmethod
def create( # noqa: PLR0913
cls: type[AbstractSeismic],
name: str,
shape: list[int],
header_fields: dict[str, str],
create_coords: bool = False,
sample_format: str | None = None,
chunks: list[int] | None = None,
sample_units: dict[str, str] | None = None,
z_units: dict[str, str] | None = None,
) -> Dataset:
"""Create a seismic dataset schema based on user input."""
chunks = chunks or cls._chunks
sample_format = sample_format or cls._sample_format
n_dim = len(cls._dim_names)
if len(shape) != n_dim:
msg = f"Shape must be {n_dim} dimensional but got {shape}."
raise ValueError(msg)
if len(chunks) != n_dim:
msg = f"Chunks must be {n_dim} dimensional but got {chunks}."
raise ValueError(msg)
dim_coords = cls.create_dimension_coords(shape, z_units)
dataset_vars = dim_coords
coord_names = None
if create_coords:
coord_names = list(cls._coords.keys())
coord_vars = cls.create_seismic_coordinates(cls._coords, shape)
dataset_vars += coord_vars
trace_mask, samples, headers = cls.create_seismic_variables(
sample_format,
header_fields,
shape,
chunks,
sample_units,
coord_names,
)
dataset_vars += [trace_mask, samples, headers]
dataset_meta = DatasetMetadata(
name=name,
created_on=datetime.now(UTC).isoformat(),
api_version=MDIO_VERSION,
)
dataset_meta.attributes = cls._dataset_attrs
dataset_builder = DatasetBuilder()
dataset_builder.set_name(name)
return Dataset(variables=dataset_vars, metadata=dataset_meta)
class Seismic3DPostStackTime(AbstractSeismic):
    """Template for 3D post-stack seismic with a time sample axis.

    Dimensions are (inline, crossline, time); the CDP X/Y coordinates span
    the inline and crossline dimensions.
    """

    _trace_domain = "time"
    _dim_names = ["inline", "crossline", "time"]
    # 128**3 samples per chunk: 8 mb at 4 bytes per sample.
    _chunks = [128, 128, 128]
    _dataset_attrs = dict(
        surveyDimensionality="3D",
        ensembleType="line",
        processingStage="post-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "cdp-y": ("float64", {"length": "m"}, ["inline", "crossline"]),
    }
class Seismic3DPostStackDepth(AbstractSeismic):
    """Template for 3D post-stack seismic with a depth sample axis.

    Dimensions are (inline, crossline, depth); the CDP X/Y coordinates span
    the inline and crossline dimensions.
    """

    _trace_domain = "depth"
    _dim_names = ["inline", "crossline", "depth"]
    # 128**3 samples per chunk: 8 mb at 4 bytes per sample.
    _chunks = [128, 128, 128]
    _dataset_attrs = dict(
        surveyDimensionality="3D",
        ensembleType="line",
        processingStage="post-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "cdp-y": ("float64", {"length": "m"}, ["inline", "crossline"]),
    }
class Seismic3DPreStackCdpTime(AbstractSeismic):
    """Template for 3D CDP gathers with a time sample axis.

    Dimensions are (inline, crossline, offset, time); the CDP X/Y
    coordinates span the inline and crossline dimensions.
    """

    _trace_domain = "time"
    _dim_names = ["inline", "crossline", "offset", "time"]
    # One gather per chunk, 512 x 4096 samples: 8 mb at 4 bytes per sample.
    _chunks = [1, 1, 512, 4096]
    _dataset_attrs = dict(
        surveyDimensionality="3D",
        ensembleType="cdp",
        processingStage="pre-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "cdp-y": ("float64", {"length": "m"}, ["inline", "crossline"]),
    }
class Seismic3DPreStackCdpTimeIrregular(AbstractSeismic):
    """Template for 3D time-domain CDP gathers with non-regularized offsets.

    Dimensions are (inline, crossline, trace, time). Offset is not a
    dimension here; it is a coordinate over (inline, crossline, trace).
    """

    _trace_domain = "time"
    _dim_names = ["inline", "crossline", "trace", "time"]
    # One gather per chunk, 512 x 4096 samples: 8 mb at 4 bytes per sample.
    _chunks = [1, 1, 512, 4096]
    _dataset_attrs = dict(
        surveyDimensionality="3D",
        ensembleType="cdp",
        processingStage="pre-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "cdp-y": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "offset": ("float32", {"length": "m"}, ["inline", "crossline", "trace"]),
    }
class Seismic3DPreStackCdpDepth(AbstractSeismic):
    """Template for 3D CDP gathers with a depth sample axis.

    Dimensions are (inline, crossline, offset, depth); the CDP X/Y
    coordinates span the inline and crossline dimensions.
    """

    _trace_domain = "depth"
    _dim_names = ["inline", "crossline", "offset", "depth"]
    # One gather per chunk, 512 x 4096 samples: 8 mb at 4 bytes per sample.
    _chunks = [1, 1, 512, 4096]
    _dataset_attrs = dict(
        surveyDimensionality="3D",
        ensembleType="cdp",
        processingStage="pre-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "cdp-y": ("float64", {"length": "m"}, ["inline", "crossline"]),
    }
class Seismic3DPreStackCdpDepthIrregular(AbstractSeismic):
    """Template for 3D depth-domain CDP gathers with non-regularized offsets.

    Dimensions are (inline, crossline, trace, depth). Offset is not a
    dimension here; it is a coordinate over (inline, crossline, trace).
    """

    _trace_domain = "depth"
    _dim_names = ["inline", "crossline", "trace", "depth"]
    # One gather per chunk, 512 x 4096 samples: 8 mb at 4 bytes per sample.
    _chunks = [1, 1, 512, 4096]
    _dataset_attrs = dict(
        surveyDimensionality="3D",
        ensembleType="cdp",
        processingStage="pre-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "cdp-y": ("float64", {"length": "m"}, ["inline", "crossline"]),
        "offset": ("float32", {"length": "m"}, ["inline", "crossline", "trace"]),
    }
class Seismic3DStreamerShot(AbstractSeismic):
    """Template for 3D streamer shot gathers with a time sample axis.

    Dimensions are (shot_point, cable, channel, time). Gun and shot
    positions are defined per shot point; receiver positions are defined
    per (shot_point, cable, channel).
    """

    _trace_domain = "time"
    _dim_names = ["shot_point", "cable", "channel", "time"]
    # 128 x 4096 samples per chunk: 2 mb at 4 bytes per sample.
    _chunks = [1, 1, 128, 4096]
    _dataset_attrs = dict(
        surveyDimensionality="3D",
        ensembleType="shot",
        processingStage="pre-stack",
    )
    # name -> (format, units, dimensions); "gun" carries no units.
    _coords = {
        "gun": ("uint8", None, ["shot_point"]),
        "shot-x": ("float64", {"length": "m"}, ["shot_point"]),
        "shot-y": ("float64", {"length": "m"}, ["shot_point"]),
        "receiver-x": ("float64", {"length": "m"}, ["shot_point", "cable", "channel"]),
        "receiver-y": ("float64", {"length": "m"}, ["shot_point", "cable", "channel"]),
    }
class Seismic2DPostStackTime(AbstractSeismic):
    """Template for 2D post-stack seismic with a time sample axis.

    Dimensions are (cdp, time); the CDP X/Y coordinates span the cdp
    dimension.
    """

    _trace_domain = "time"
    _dim_names = ["cdp", "time"]
    # 512 x 2048 samples per chunk: 4 mb at 4 bytes per sample.
    _chunks = [512, 2048]
    _dataset_attrs = dict(
        surveyDimensionality="2D",
        ensembleType="line",
        processingStage="post-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["cdp"]),
        "cdp-y": ("float64", {"length": "m"}, ["cdp"]),
    }
class Seismic2DPostStackDepth(AbstractSeismic):
    """Template for 2D post-stack seismic with a depth sample axis.

    Dimensions are (cdp, depth); the CDP X/Y coordinates span the cdp
    dimension.
    """

    _trace_domain = "depth"
    _dim_names = ["cdp", "depth"]
    # 512 x 2048 samples per chunk: 4 mb at 4 bytes per sample.
    _chunks = [512, 2048]
    _dataset_attrs = dict(
        surveyDimensionality="2D",
        ensembleType="line",
        processingStage="post-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["cdp"]),
        "cdp-y": ("float64", {"length": "m"}, ["cdp"]),
    }
class Seismic2DPreStackCdpTime(AbstractSeismic):
    """2D seismic CDP gathers in time domain.

    Dimensions are (cdp, offset, time); the CDP X/Y coordinates span the
    cdp dimension.
    """

    _dataset_attrs = {
        "surveyDimensionality": "2D",
        "ensembleType": "cdp",
        "processingStage": "pre-stack",
    }
    _trace_domain = "time"
    _dim_names = ["cdp", "offset", _trace_domain]
    _chunks = [1, 512, 2048]  # 4 mb
    # CDP positions are float64 like in every other template in this file
    # (including the depth twin of this class); they were float32 here.
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-2]),
    }
class Seismic2DPreStackCdpDepth(AbstractSeismic):
    """Template for 2D CDP gathers with a depth sample axis.

    Dimensions are (cdp, offset, depth); the CDP X/Y coordinates span the
    cdp dimension.
    """

    _trace_domain = "depth"
    _dim_names = ["cdp", "offset", "depth"]
    # One gather per chunk, 512 x 2048 samples: 4 mb at 4 bytes per sample.
    _chunks = [1, 512, 2048]
    _dataset_attrs = dict(
        surveyDimensionality="2D",
        ensembleType="cdp",
        processingStage="pre-stack",
    )
    # name -> (format, units, dimensions)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, ["cdp"]),
        "cdp-y": ("float64", {"length": "m"}, ["cdp"]),
    }
class Seismic2DStreamerShot(AbstractSeismic):
    """Template for 2D streamer shot gathers with a time sample axis.

    Dimensions are (shot_point, channel, time). Gun and shot positions are
    defined per shot point; receiver positions are defined per
    (shot_point, channel).
    """

    _trace_domain = "time"
    _dim_names = ["shot_point", "channel", "time"]
    # 128 x 4096 samples per chunk: 2 mb at 4 bytes per sample.
    _chunks = [1, 128, 4096]
    _dataset_attrs = dict(
        surveyDimensionality="2D",
        ensembleType="shot",
        processingStage="pre-stack",
    )
    # name -> (format, units, dimensions); "gun" carries no units.
    _coords = {
        "gun": ("uint8", None, ["shot_point"]),
        "shot-x": ("float64", {"length": "m"}, ["shot_point"]),
        "shot-y": ("float64", {"length": "m"}, ["shot_point"]),
        "receiver-x": ("float64", {"length": "m"}, ["shot_point", "channel"]),
        "receiver-y": ("float64", {"length": "m"}, ["shot_point", "channel"]),
    }
# Lookup from schema type to the template class that builds it.
# NOTE: MDIOSchemaType.WIND_WRF has no entry; looking it up raises KeyError.
SCHEMA_TEMPLATE_MAP = {
    # 3D Seismic Post Stack
    MDIOSchemaType.SEISMIC_3D_POST_STACK_TIME: Seismic3DPostStackTime,
    MDIOSchemaType.SEISMIC_3D_POST_STACK_DEPTH: Seismic3DPostStackDepth,
    # 3D Seismic Pre-Stack
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_TIME: Seismic3DPreStackCdpTime,
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_TIME_IRREGULAR: Seismic3DPreStackCdpTimeIrregular,
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_DEPTH: Seismic3DPreStackCdpDepth,
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_DEPTH_IRREGULAR: Seismic3DPreStackCdpDepthIrregular,
    # 3D Seismic Shot
    MDIOSchemaType.SEISMIC_3D_STREAMER_SHOT: Seismic3DStreamerShot,
    # 2D Seismic Post Stack
    MDIOSchemaType.SEISMIC_2D_POST_STACK_TIME: Seismic2DPostStackTime,
    MDIOSchemaType.SEISMIC_2D_POST_STACK_DEPTH: Seismic2DPostStackDepth,
    # 2D Seismic Pre-Stack (these previously pointed at the post-stack
    # classes, which lack the offset dimension)
    MDIOSchemaType.SEISMIC_2D_PRE_STACK_CDP_TIME: Seismic2DPreStackCdpTime,
    MDIOSchemaType.SEISMIC_2D_PRE_STACK_CDP_DEPTH: Seismic2DPreStackCdpDepth,
    # 2D Seismic Shot
    MDIOSchemaType.SEISMIC_2D_STREAMER_SHOT: Seismic2DStreamerShot,
}

Definition of Done

  1. Design a way to define templates concisely without boilerplate (e.g. original implementation but better)
  2. Implement the template definition logic
  3. Write out pre-defined templates for at least the 3D post-stack types
  4. Register the pre-defined templates with registry pattern from MDIO Schema Template Registry #563
  5. Support for attaching/enabling/disabling grid overrides

Requirements

The template basically defines the mapping from arbitrary SEG-Y data to structured MDIO datasets. It must satisfy some key requirements:

  1. Templates must define key dimension names.
  2. Variables must define their dimensions, coordinates, chunk sizes, and compression etc.
  3. Each template must define its required keys for dimensions and coordinates.
  4. Templates must define chunk sizes for all variables to be created.
  5. Templates must define grid overrides to be applied during ingestion.

The following templates are required. Some of them will also have depth/time variants.

Post-Stack types

  1. PostStack3D chunks: (128, 128, 128)
  2. PostStack2D chunks: (1024, 1024)

Pre-Stack types

  1. PreStackCdpGathers3D
  2. PreStackShotGathers3D
  3. PreStackCdpGathers2D
  4. PreStackShotGathers2D

Metadata

Metadata

Assignees

No one assigned

    Labels

    Projects

    Status

    Done

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions