-
Notifications
You must be signed in to change notification settings - Fork 16
Labels
Description
Overview
To be used with the template registry in #563.
We need to define the pre-defined MDIO dataset types to be used in seismic dataset ingestion from SEG-Y. The MDIO v1 schema allows us to define this clearly in a type-safe manner. An initial (probably bad!) implementation with a limited set of types is here:
mdio-python/src/mdio/factory.py
Lines 58 to 493 in f8d7607
class MDIOSchemaType(Enum):
    """Identifiers for the pre-defined MDIO dataset templates.

    Member values are assigned by ``auto()`` in declaration order, so the
    order of the members below must not change.
    """

    # 3D seismic
    SEISMIC_3D_POST_STACK_TIME = auto()
    SEISMIC_3D_POST_STACK_DEPTH = auto()
    SEISMIC_3D_PRE_STACK_CDP_TIME = auto()
    SEISMIC_3D_PRE_STACK_CDP_TIME_IRREGULAR = auto()
    SEISMIC_3D_PRE_STACK_CDP_DEPTH = auto()
    SEISMIC_3D_PRE_STACK_CDP_DEPTH_IRREGULAR = auto()
    SEISMIC_3D_STREAMER_SHOT = auto()

    # 2D seismic
    SEISMIC_2D_POST_STACK_TIME = auto()
    SEISMIC_2D_POST_STACK_DEPTH = auto()
    SEISMIC_2D_PRE_STACK_CDP_TIME = auto()
    SEISMIC_2D_PRE_STACK_CDP_DEPTH = auto()
    SEISMIC_2D_STREAMER_SHOT = auto()

    # Non-seismic
    WIND_WRF = auto()
def get_approx_chunks(
    shape: list[int],
    dtype: DTypeLike,
    limit: str = "4M",
) -> tuple[int, ...]:
    """Suggest chunk sizes for an array so that one chunk fits within ``limit``.

    Every axis is marked ``"auto"`` and the array shape itself is handed to
    ``auto_chunks`` as ``previous_chunks``.

    Args:
        shape: Size of each array dimension.
        dtype: Anything ``np.dtype`` accepts; describes the element type.
        limit: Chunk size limit forwarded to ``auto_chunks`` (default "4M").

    Returns:
        Whatever ``auto_chunks`` returns for these arguments.
    """
    # One "auto" placeholder per dimension lets the helper size every axis.
    auto_spec = tuple("auto" for _ in shape)
    element_type = np.dtype(dtype)
    return auto_chunks(
        chunks=auto_spec,
        shape=shape,
        limit=limit,
        dtype=element_type,
        previous_chunks=shape,
    )
| class AbstractSeismic(ABC): | |
| """Abstract class for specific seismic schemas.""" | |
| _trace_domain: str = "unknown" | |
| _sample_format: str = "float32" | |
| _dim_names: list[str] = [] | |
| _chunks: list[int] = [] | |
| _coords: dict[str, tuple[str, tuple[int, ...], dict[str, str]]] | |
| _dataset_attrs: dict[str, str] | |
| _sample_compressor: dict[str, Any] | None = {"name": "blosc", "algorithm": "zstd"} | |
| _meta_compressor: dict[str, Any] | None = {"name": "blosc"} | |
| @classmethod | |
| def create_dimension_coords( | |
| cls: type[AbstractSeismic], | |
| shape: list[int], | |
| z_units: dict[str, str], | |
| ) -> list[Variable]: | |
| """Create schema for dimension coordinates.""" | |
| dim_coords = [] | |
| for dim_name, dim_size in zip(cls._dim_names, shape): | |
| dim_builder = VariableBuilder() | |
| dim_builder.set_name(dim_name) | |
| dim_builder.set_format("uint16") | |
| dim_builder.add_dimension({dim_name: dim_size}) | |
| dim_builder.set_compressor(cls._meta_compressor) | |
| if dim_name in cls._trace_domain: | |
| dim_builder.set_units(z_units) | |
| dim_coord = dim_builder.build() | |
| dim_coords.append(dim_coord) | |
| return dim_coords | |
| @classmethod | |
| def create_seismic_variables( # noqa: PLR0913 | |
| cls: type[AbstractSeismic], | |
| sample_format: ScalarType, | |
| header_fields: dict[str, Any], | |
| shape: list[int], | |
| chunks: list[int], | |
| sample_units: dict[str, str], | |
| coord_names: list[str], | |
| ) -> tuple[Variable, Variable, Variable]: | |
| """Build seismic variables based on user input.""" | |
| mask_chunks = list(get_approx_chunks(shape[:-1], "bool", limit="8M")) | |
| mask_builder = VariableBuilder() | |
| mask_builder.set_name("trace_mask") | |
| mask_builder.set_format("bool") | |
| mask_builder.set_chunks(mask_chunks) | |
| mask_builder.add_dimension(*cls._dim_names[:-1]) | |
| mask_builder.set_compressor(cls._meta_compressor) | |
| sample_builder = VariableBuilder() | |
| sample_builder.set_name("seismic") | |
| sample_builder.set_format(sample_format) | |
| sample_builder.set_chunks(chunks) | |
| sample_builder.set_units(sample_units) | |
| sample_builder.add_dimension(*cls._dim_names) | |
| sample_builder.set_compressor(cls._sample_compressor) | |
| sample_builder.add_coordinate("trace_mask") | |
| header_builder = VariableBuilder() | |
| header_builder.set_name("headers") | |
| header_builder.set_format(header_fields) | |
| header_builder.set_chunks(chunks[:-1]) | |
| header_builder.add_dimension(*cls._dim_names[:-1]) | |
| header_builder.set_compressor(cls._meta_compressor) | |
| header_builder.add_coordinate("trace_mask") | |
| if coord_names is not None: | |
| mask_builder.add_coordinate(*coord_names) | |
| sample_builder.add_coordinate(*coord_names) | |
| header_builder.add_coordinate(*coord_names) | |
| trace_mask = mask_builder.build() | |
| samples = sample_builder.build() | |
| headers = header_builder.build() | |
| return trace_mask, samples, headers | |
| @classmethod | |
| def create_seismic_coordinates( | |
| cls: type[AbstractSeismic], | |
| coords_dict: dict[str, tuple[str, dict[str, str], list[str]]], | |
| shape: list[int], | |
| ) -> list[Variable]: | |
| """Build seismic coordinates based on user input.""" | |
| coord_vars = [] | |
| for name, (format_, unit, coord_dims) in coords_dict.items(): | |
| dim_indices = [cls._dim_names.index(dim) for dim in coord_dims] | |
| coord_shape = [shape[idx] for idx in dim_indices] | |
| coord_chunks = list(get_approx_chunks(coord_shape, format_, limit="8M")) | |
| coord_builder = VariableBuilder() | |
| coord_builder.set_name(name) | |
| coord_builder.set_format(format_) | |
| coord_builder.set_chunks(coord_chunks) | |
| coord_builder.set_units(unit) | |
| coord_builder.add_dimension(*coord_dims) | |
| coord_builder.set_compressor(cls._meta_compressor) | |
| coord_vars.append(coord_builder.build()) | |
| return coord_vars | |
| @classmethod | |
| def create( # noqa: PLR0913 | |
| cls: type[AbstractSeismic], | |
| name: str, | |
| shape: list[int], | |
| header_fields: dict[str, str], | |
| create_coords: bool = False, | |
| sample_format: str | None = None, | |
| chunks: list[int] | None = None, | |
| sample_units: dict[str, str] | None = None, | |
| z_units: dict[str, str] | None = None, | |
| ) -> Dataset: | |
| """Create a seismic dataset schema based on user input.""" | |
| chunks = chunks or cls._chunks | |
| sample_format = sample_format or cls._sample_format | |
| n_dim = len(cls._dim_names) | |
| if len(shape) != n_dim: | |
| msg = f"Shape must be {n_dim} dimensional but got {shape}." | |
| raise ValueError(msg) | |
| if len(chunks) != n_dim: | |
| msg = f"Chunks must be {n_dim} dimensional but got {chunks}." | |
| raise ValueError(msg) | |
| dim_coords = cls.create_dimension_coords(shape, z_units) | |
| dataset_vars = dim_coords | |
| coord_names = None | |
| if create_coords: | |
| coord_names = list(cls._coords.keys()) | |
| coord_vars = cls.create_seismic_coordinates(cls._coords, shape) | |
| dataset_vars += coord_vars | |
| trace_mask, samples, headers = cls.create_seismic_variables( | |
| sample_format, | |
| header_fields, | |
| shape, | |
| chunks, | |
| sample_units, | |
| coord_names, | |
| ) | |
| dataset_vars += [trace_mask, samples, headers] | |
| dataset_meta = DatasetMetadata( | |
| name=name, | |
| created_on=datetime.now(UTC).isoformat(), | |
| api_version=MDIO_VERSION, | |
| ) | |
| dataset_meta.attributes = cls._dataset_attrs | |
| dataset_builder = DatasetBuilder() | |
| dataset_builder.set_name(name) | |
| return Dataset(variables=dataset_vars, metadata=dataset_meta) | |
class Seismic3DPostStackTime(AbstractSeismic):
    """Template for 3D post-stack seismic sampled in time.

    Dimensions are inline, crossline and time. The CDP coordinates span the
    inline/crossline plane.
    """

    _trace_domain = "time"
    _dim_names = ["inline", "crossline", _trace_domain]
    # 128 * 128 * 128 samples: 8 MiB per chunk at the default float32 format.
    _chunks = [128, 128, 128]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-1]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "3D",
        "ensembleType": "line",
        "processingStage": "post-stack",
    }
class Seismic3DPostStackDepth(AbstractSeismic):
    """Template for 3D post-stack seismic sampled in depth.

    Dimensions are inline, crossline and depth. The CDP coordinates span the
    inline/crossline plane.
    """

    _trace_domain = "depth"
    _dim_names = ["inline", "crossline", _trace_domain]
    # 128 * 128 * 128 samples: 8 MiB per chunk at the default float32 format.
    _chunks = [128, 128, 128]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-1]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "3D",
        "ensembleType": "line",
        "processingStage": "post-stack",
    }
class Seismic3DPreStackCdpTime(AbstractSeismic):
    """Template for 3D CDP gathers sampled in time.

    Dimensions are inline, crossline, offset and time. The CDP coordinates
    span the inline/crossline plane only.
    """

    _trace_domain = "time"
    _dim_names = ["inline", "crossline", "offset", _trace_domain]
    # 1 * 1 * 512 * 4096 samples: 8 MiB per chunk at the default float32 format.
    _chunks = [1, 1, 512, 4096]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-2]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "3D",
        "ensembleType": "cdp",
        "processingStage": "pre-stack",
    }
class Seismic3DPreStackCdpTimeIrregular(AbstractSeismic):
    """Template for 3D CDP gathers in time with non-regularized offsets.

    The third dimension is a plain ``trace`` index; ``offset`` is stored as a
    coordinate over inline/crossline/trace instead of being a dimension.
    """

    _trace_domain = "time"
    _dim_names = ["inline", "crossline", "trace", _trace_domain]
    # 1 * 1 * 512 * 4096 samples: 8 MiB per chunk at the default float32 format.
    _chunks = [1, 1, 512, 4096]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-2]),
        "offset": ("float32", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "3D",
        "ensembleType": "cdp",
        "processingStage": "pre-stack",
    }
class Seismic3DPreStackCdpDepth(AbstractSeismic):
    """Template for 3D CDP gathers sampled in depth.

    Dimensions are inline, crossline, offset and depth. The CDP coordinates
    span the inline/crossline plane only.
    """

    _trace_domain = "depth"
    _dim_names = ["inline", "crossline", "offset", _trace_domain]
    # 1 * 1 * 512 * 4096 samples: 8 MiB per chunk at the default float32 format.
    _chunks = [1, 1, 512, 4096]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-2]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "3D",
        "ensembleType": "cdp",
        "processingStage": "pre-stack",
    }
class Seismic3DPreStackCdpDepthIrregular(AbstractSeismic):
    """Template for 3D CDP gathers in depth with non-regularized offsets.

    The third dimension is a plain ``trace`` index; ``offset`` is stored as a
    coordinate over inline/crossline/trace instead of being a dimension.
    """

    _trace_domain = "depth"
    _dim_names = ["inline", "crossline", "trace", _trace_domain]
    # 1 * 1 * 512 * 4096 samples: 8 MiB per chunk at the default float32 format.
    _chunks = [1, 1, 512, 4096]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-2]),
        "offset": ("float32", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "3D",
        "ensembleType": "cdp",
        "processingStage": "pre-stack",
    }
class Seismic3DStreamerShot(AbstractSeismic):
    """Template for 3D streamer shot gathers sampled in time.

    Dimensions are shot_point, cable, channel and time. Source-side
    coordinates (gun, shot-x, shot-y) span shot_point only; receiver
    coordinates span shot_point/cable/channel.
    """

    _trace_domain = "time"
    _dim_names = ["shot_point", "cable", "channel", _trace_domain]
    # 1 * 1 * 128 * 4096 samples: 2 MiB per chunk at the default float32 format.
    _chunks = [1, 1, 128, 4096]
    # name -> (format, units or None, dimensions the coordinate spans)
    _coords = {
        "gun": ("uint8", None, _dim_names[:-3]),
        "shot-x": ("float64", {"length": "m"}, _dim_names[:-3]),
        "shot-y": ("float64", {"length": "m"}, _dim_names[:-3]),
        "receiver-x": ("float64", {"length": "m"}, _dim_names[:-1]),
        "receiver-y": ("float64", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "3D",
        "ensembleType": "shot",
        "processingStage": "pre-stack",
    }
class Seismic2DPostStackTime(AbstractSeismic):
    """Template for 2D post-stack seismic sampled in time.

    Dimensions are cdp and time. The CDP coordinates span the cdp dimension.
    """

    _trace_domain = "time"
    _dim_names = ["cdp", _trace_domain]
    # 512 * 2048 samples: 4 MiB per chunk at the default float32 format.
    _chunks = [512, 2048]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-1]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "2D",
        "ensembleType": "line",
        "processingStage": "post-stack",
    }
class Seismic2DPostStackDepth(AbstractSeismic):
    """Template for 2D post-stack seismic sampled in depth.

    Dimensions are cdp and depth. The CDP coordinates span the cdp dimension.
    """

    _trace_domain = "depth"
    _dim_names = ["cdp", _trace_domain]
    # 512 * 2048 samples: 4 MiB per chunk at the default float32 format.
    _chunks = [512, 2048]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-1]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "2D",
        "ensembleType": "line",
        "processingStage": "post-stack",
    }
class Seismic2DPreStackCdpTime(AbstractSeismic):
    """Template for 2D CDP gathers sampled in time.

    Dimensions are cdp, offset and time. The CDP coordinates span the cdp
    dimension only.
    """

    _dataset_attrs = {
        "surveyDimensionality": "2D",
        "ensembleType": "cdp",
        "processingStage": "pre-stack",
    }
    _trace_domain = "time"
    _dim_names = ["cdp", "offset", _trace_domain]
    # 1 * 512 * 2048 samples: 4 MiB per chunk at the default float32 format.
    _chunks = [1, 512, 2048]
    # name -> (format, units, dimensions the coordinate spans)
    # float64 matches the depth-domain twin and every other template's CDP
    # coordinates; float32 carries only about 7 significant decimal digits.
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-2]),
    }
class Seismic2DPreStackCdpDepth(AbstractSeismic):
    """Template for 2D CDP gathers sampled in depth.

    Dimensions are cdp, offset and depth. The CDP coordinates span the cdp
    dimension only.
    """

    _trace_domain = "depth"
    _dim_names = ["cdp", "offset", _trace_domain]
    # 1 * 512 * 2048 samples: 4 MiB per chunk at the default float32 format.
    _chunks = [1, 512, 2048]
    # name -> (format, units, dimensions the coordinate spans)
    _coords = {
        "cdp-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "cdp-y": ("float64", {"length": "m"}, _dim_names[:-2]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "2D",
        "ensembleType": "cdp",
        "processingStage": "pre-stack",
    }
class Seismic2DStreamerShot(AbstractSeismic):
    """Template for 2D streamer shot gathers sampled in time.

    Dimensions are shot_point, channel and time. Source-side coordinates
    (gun, shot-x, shot-y) span shot_point only; receiver coordinates span
    shot_point/channel.
    """

    _trace_domain = "time"
    _dim_names = ["shot_point", "channel", _trace_domain]
    # 1 * 128 * 4096 samples: 2 MiB per chunk at the default float32 format.
    _chunks = [1, 128, 4096]
    # name -> (format, units or None, dimensions the coordinate spans)
    _coords = {
        "gun": ("uint8", None, _dim_names[:-2]),
        "shot-x": ("float64", {"length": "m"}, _dim_names[:-2]),
        "shot-y": ("float64", {"length": "m"}, _dim_names[:-2]),
        "receiver-x": ("float64", {"length": "m"}, _dim_names[:-1]),
        "receiver-y": ("float64", {"length": "m"}, _dim_names[:-1]),
    }
    _dataset_attrs = {
        "surveyDimensionality": "2D",
        "ensembleType": "shot",
        "processingStage": "pre-stack",
    }
# Lookup from each seismic schema type to the template class that builds it.
# MDIOSchemaType.WIND_WRF has no template in this mapping.
SCHEMA_TEMPLATE_MAP = {
    # 3D Seismic Post Stack
    MDIOSchemaType.SEISMIC_3D_POST_STACK_TIME: Seismic3DPostStackTime,
    MDIOSchemaType.SEISMIC_3D_POST_STACK_DEPTH: Seismic3DPostStackDepth,
    # 3D Seismic Pre-Stack
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_TIME: Seismic3DPreStackCdpTime,
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_TIME_IRREGULAR: Seismic3DPreStackCdpTimeIrregular,
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_DEPTH: Seismic3DPreStackCdpDepth,
    MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_DEPTH_IRREGULAR: Seismic3DPreStackCdpDepthIrregular,
    # 3D Seismic Shot
    MDIOSchemaType.SEISMIC_3D_STREAMER_SHOT: Seismic3DStreamerShot,
    # 2D Seismic Post Stack
    MDIOSchemaType.SEISMIC_2D_POST_STACK_TIME: Seismic2DPostStackTime,
    MDIOSchemaType.SEISMIC_2D_POST_STACK_DEPTH: Seismic2DPostStackDepth,
    # 2D Seismic Pre-Stack
    # These previously pointed at the 2D post-stack templates, which lack the
    # offset dimension of the CDP gather templates.
    MDIOSchemaType.SEISMIC_2D_PRE_STACK_CDP_TIME: Seismic2DPreStackCdpTime,
    MDIOSchemaType.SEISMIC_2D_PRE_STACK_CDP_DEPTH: Seismic2DPreStackCdpDepth,
    # 2D Seismic Shot
    MDIOSchemaType.SEISMIC_2D_STREAMER_SHOT: Seismic2DStreamerShot,
}
Definition of Done
- Design a way to define templates concisely without boilerplate (e.g. original implementation but better)
- Implement the template definition logic
- Write out pre-defined templates for at least the 3D post-stack types
- Register the pre-defined templates with registry pattern from MDIO Schema Template Registry #563
- Support for attaching/enabling/disabling grid overrides
Requirements
The template basically defines the mapping from arbitrary SEG-Y data to structured MDIO datasets. It must define some key requirements:
- Templates must define key dimension names.
- Variables must define their dimensions, coordinates, chunk sizes, and compression etc.
- Each template must define its required keys for dimensions and coordinates.
- Templates must define chunk sizes for all variables to be created.
- Templates must define grid overrides to be applied during ingestion.
The following templates are required. Some of them will also have depth/time variants.
Post-Stack types
- PostStack3D
  chunks: (128, 128, 128)
- PostStack2D
  chunks: (1024, 1024)
Pre-Stack types
- PreStackCdpGathers3D
- PreStackShotGathers3D
- PreStackCdpGathers2D
- PreStackShotGathers2D
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
Type
Projects
Status
Done