From e21897fafdd038f3aa9e7d10c80ef9d362620a6a Mon Sep 17 00:00:00 2001 From: Mark Roberts Date: Tue, 2 Jun 2026 14:41:26 +0000 Subject: [PATCH 1/3] Add sfmdioread and sfmdiowrite --- framework/configure.py | 82 ++++++++ system/generic/Mmdioread.cc | 295 ++++++++++++++++++++++++++ system/generic/Mmdiowrite.cc | 371 +++++++++++++++++++++++++++++++++ system/generic/SConstruct | 54 +++++ system/generic/mdio2segy.cc | 393 +++++++++++++++++++++++++++++++++++ system/generic/mdio2segy.hh | 97 +++++++++ 6 files changed, 1292 insertions(+) create mode 100644 system/generic/Mmdioread.cc create mode 100644 system/generic/Mmdiowrite.cc create mode 100644 system/generic/mdio2segy.cc create mode 100644 system/generic/mdio2segy.hh diff --git a/framework/configure.py b/framework/configure.py index 3554e13702..7f40c4ee25 100644 --- a/framework/configure.py +++ b/framework/configure.py @@ -173,6 +173,7 @@ def check_all(context): psp(context) #FDNSI sparse(context) #FDNSI pfft(context) + mdio(context) # FDNSI def identify_platform(context): global plat @@ -1908,6 +1909,80 @@ def pfft(context): context.Result(context_failure) #need_pkg('pfft',fatal=False) +pkg['mdio'] = {'ubuntu':'mdio-cpp (build from https://github.com/TGSAI/mdio-cpp)'} + +def mdio(context): + 'Detect a prebuilt mdio-cpp installation (https://github.com/TGSAI/mdio-cpp)' + context.Message("checking for MDIO (mdio-cpp) ... ") + + mdiohome = context.env.get('MDIO_HOME', os.environ.get('MDIO_HOME')) + + cpppath = context.env.get('MDIO_CPPPATH', os.environ.get('MDIO_CPPPATH')) + libpath = context.env.get('MDIO_LIBPATH', os.environ.get('MDIO_LIBPATH')) + libs = context.env.get('MDIO_LIBS', os.environ.get('MDIO_LIBS')) + linkflags = context.env.get('MDIO_LINKFLAGS', os.environ.get('MDIO_LINKFLAGS')) + cxxflags = context.env.get('MDIO_CXXFLAGS', os.environ.get('MDIO_CXXFLAGS')) + + if cpppath and type(cpppath) is not list: + cpppath = cpppath.split(',') + if libpath and type(libpath) is not list: + libpath = libpath.split(',') + if libs and type(libs) is not list: + libs = libs.split(',') + + if not cpppath: + cpppath = [os.path.join(mdiohome,'include'),mdiohome] if mdiohome else [] + if not libpath: + libpath = [os.path.join(mdiohome,'lib'), + os.path.join(mdiohome,'lib64')] if mdiohome else [] + if not libs: + libs = ['mdio'] + + if not (mdiohome or cpppath): + context.Result(context_failure) + context.env['MDIO'] = None + return + + oldpath = path_get(context,'CPPPATH') + oldcxxflags = context.env.get('CXXFLAGS','') + + context.env['CPPPATH'] = oldpath + cpppath + # mdio-cpp requires C++17 and the compile definitions captured at build + # time (notably -DMAX_NUM_SLICES, without which mdio.h fails to compile). + if cxxflags: + context.env['CXXFLAGS'] = oldcxxflags + ' ' + cxxflags + elif '-std=' not in oldcxxflags: + context.env['CXXFLAGS'] = oldcxxflags + ' -std=c++17' + + text = ''' + #include + #include + int main(int argc,char* argv[]) { + std::string p("demo.mdio"); + auto ds = mdio::Dataset::Open(p, mdio::constants::kOpen); + (void) ds; + return 0; + }\n''' + res = context.TryCompile(text,'.cc') + + context.env['CPPPATH'] = oldpath + context.env['CXXFLAGS'] = oldcxxflags + + if res: + context.Result(res) + context.env['MDIO'] = True + context.env['MDIO_CPPPATH'] = cpppath + context.env['MDIO_LIBPATH'] = libpath + context.env['MDIO_LIBS'] = libs + if linkflags: + context.env['MDIO_LINKFLAGS'] = linkflags + if cxxflags: + context.env['MDIO_CXXFLAGS'] = cxxflags + else: + context.Result(context_failure) + context.env['MDIO'] = None + need_pkg('mdio', fatal=False) + def ncpus(): 'Detects number of CPUs' if plat['OS'] in ('linux','posix'): @@ -2681,5 +2756,12 @@ def options(file): opts.Add('NUMPY','Existence of numpy package') opts.Add('SWIG','Location of SWIG') opts.Add('PFFT','The PFFT library') + opts.Add('MDIO_HOME','Root of a prebuilt mdio-cpp installation') + opts.Add('MDIO','The mdio-cpp library (https://github.com/TGSAI/mdio-cpp)') + opts.Add('MDIO_CPPPATH','mdio-cpp - path(s) to headers') + opts.Add('MDIO_LIBPATH','mdio-cpp - path(s) to libraries') + opts.Add('MDIO_LIBS','mdio-cpp - libraries to link (mdio + internal deps)') + opts.Add('MDIO_LINKFLAGS','mdio-cpp - extra linker flags (e.g. archive group)') + opts.Add('MDIO_CXXFLAGS','mdio-cpp - extra C++ flags (std + compile defines)') return opts diff --git a/system/generic/Mmdioread.cc b/system/generic/Mmdioread.cc new file mode 100644 index 0000000000..54f219e332 --- /dev/null +++ b/system/generic/Mmdioread.cc @@ -0,0 +1,295 @@ +/* Convert an MDIO dataset to RSF. + +Reads the MDIO (Zarr) format through the mdio-cpp library and writes the +amplitudes to an RSF file, the per-trace SEG-Y headers to a separate tfile +(compatible with sfsegyread/sfsegywrite), and optionally the SEG-Y EBCDIC text +header (hfile) and 400-byte binary header (bfile) when they are present in the +dataset metadata. + +The input may be a local path, or a gs:// or s3:// URL (when mdio-cpp was built +with the corresponding cloud drivers). + +The data can be windowed on read with the same parameters as sfwindow +(n#, f#, j#, min#, max#). Axis 1 is the fast (sample) axis; the remaining +axes index the trace grid, in the order RSF stores them (n2 is the fastest +trace dimension). Because mdio-cpp slices with unit stride, a contiguous +bounding box is read lazily and any j#>1 decimation is applied afterwards. + +Trace headers are read either from one MDIO variable per SEG-Y key, or from a +single structured "headers" variable. +*/ + +#include +#include + +#include +#include + +#include "mdio2segy.hh" + +/* Decode a row-major (last axis fastest) linear index into a multi-index. */ +static void decode(long lin, int ndim, const long* shape, long* idx) +{ + for (int a = ndim - 1; a >= 0; a--) { + idx[a] = lin % shape[a]; + lin /= shape[a]; + } +} + +int main(int argc, char* argv[]) +{ + sf_init(argc, argv); + + bool verb; + if (!sf_getbool("verb", &verb)) verb = false; + /* Verbosity flag */ + + char* path = sf_getstring("mdio"); + /* input MDIO dataset (path or gs://, s3:// URL) */ + if (NULL == path) sf_error("Need mdio="); + + char* dataname = sf_getstring("data"); + /* name of the MDIO data variable (default: auto-detect) */ + char* hdrsname = sf_getstring("headers"); + /* name of the MDIO trace-headers variable (default: auto-detect) */ + + const char* read = sf_getstring("read"); + /* what to read: h - header, d - data, b - both (default) */ + if (NULL == read) read = "b"; + + /* ---- open dataset (lazy) ---- */ + auto dsFut = mdio::Dataset::Open(std::string(path), mdio::constants::kOpen); + if (!dsFut.status().ok()) sf_error("Cannot open MDIO \"%s\"", path); + mdio::Dataset ds = dsFut.value(); + + std::string datavar = mdio_data_variable(ds, dataname); + if (datavar.empty()) sf_error("Could not find a data variable in \"%s\"", path); + std::string headersVar = mdio_headers_variable(ds, hdrsname); + + if (verb) sf_warning("data variable: %s", datavar.c_str()); + + std::vector axes = mdio_axes(ds, datavar); + int rank = (int) axes.size(); + if (rank < 1) sf_error("Data variable \"%s\" has no dimensions", datavar.c_str()); + + /* ---- resolve windowing per axis (window.c convention) ---- */ + std::vector f(rank), m(rank), jp(rank), mc(rank); + for (int i = 1; i <= rank; i++) { /* RSF axis i -> MDIO axis a */ + int a = rank - i; + long n = axes[a].size; + double o = axes[a].o, d0 = axes[a].d; + char key[8]; + int iv; + float av; + off_t lv; + + /* jump j# (or sampling d#) */ + long j = 1; + snprintf(key, sizeof(key), "j%d", i); + if (sf_getint(key, &iv)) { + j = iv; + } else { + snprintf(key, sizeof(key), "d%d", i); + if (sf_getfloat(key, &av) && d0 != 0.) j = (long)(0.5 + av / d0); + } + if (j < 1) j = 1; + + /* start f# (or minimum min#) */ + long ff = 0; + snprintf(key, sizeof(key), "f%d", i); + if (sf_getlargeint(key, &lv)) { + ff = lv; + } else { + snprintf(key, sizeof(key), "min%d", i); + if (sf_getfloat(key, &av) && d0 != 0.) ff = (long)(0.5 + (av - o) / d0); + } + if (ff < 0) ff = n + ff; + if (ff < 0) sf_error("Negative f%d", i); + + double onew = o + ff * d0; + double dnew = d0 * j; + + /* count n# (or maximum max#) */ + long mm; + snprintf(key, sizeof(key), "n%d", i); + if (sf_getlargeint(key, &lv)) { + mm = lv; + } else { + snprintf(key, sizeof(key), "max%d", i); + if (sf_getfloat(key, &av) && dnew != 0.) + mm = (long)(1.5 + (av - onew) / dnew); + else + mm = 1 + (n - 1 - ff) / j; + } + if (mm < 1) mm = 1; + if (ff + (mm - 1) * j > n - 1) + sf_error("n%d=%ld is too big (axis %s, size %ld)", + i, (long) mm, axes[a].label.c_str(), n); + + f[a] = ff; + m[a] = mm; + jp[a] = j; + mc[a] = (mm - 1) * j + 1; + + axes[a].o = onew; /* carry windowed geometry to the output */ + axes[a].d = dnew; + } + + /* ---- contiguous slice over the bounding box ---- */ + std::vector > slices; + for (int a = 0; a < rank; a++) { + mdio::RangeDescriptor r = {axes[a].label.c_str(), + (mdio::Index) f[a], + (mdio::Index)(f[a] + mc[a]), + 1}; + slices.push_back(r); + } + + mdio::Dataset sliced = ds; + { + auto r = ds.isel(slices); + if (!r.status().ok()) sf_error("Failed to slice MDIO dataset"); + sliced = r.value(); + } + + /* sizes */ + int sa = rank - 1; /* MDIO sample axis */ + long ns = m[sa]; + long ntr = 1; + for (int a = 0; a < rank; a++) if (a != sa) ntr *= m[a]; + long total = ns * ntr; + + if (verb) sf_warning("Reading %ld samples x %ld traces", ns, ntr); + + /* ---- outputs ---- */ + sf_file out = NULL, hdr = NULL; + + if (read[0] != 'h') { /* data (and possibly headers) */ + out = sf_output("out"); + sf_settype(out, SF_FLOAT); + /* RSF axis i corresponds to MDIO axis rank-i */ + for (int i = 1; i <= rank; i++) { + int a = rank - i; + char key[8]; + snprintf(key, sizeof(key), "n%d", i); + sf_putint(out, key, (int) m[a]); + snprintf(key, sizeof(key), "d%d", i); + sf_putfloat(out, key, (float) axes[a].d); + snprintf(key, sizeof(key), "o%d", i); + sf_putfloat(out, key, (float) axes[a].o); + snprintf(key, sizeof(key), "label%d", i); + sf_putstring(out, key, + axes[a].label.empty() ? + (i == 1 ? "Time" : "Trace") : axes[a].label.c_str()); + if (!axes[a].unit.empty()) { + snprintf(key, sizeof(key), "unit%d", i); + sf_putstring(out, key, axes[a].unit.c_str()); + } + } + } + + if (read[0] != 'd') { /* headers */ + hdr = sf_output("tfile"); + sf_putint(hdr, "n1", SF_NKEYS); + sf_putint(hdr, "n2", (int) ntr); + sf_settype(hdr, SF_INT); + segy_init(SF_NKEYS, NULL); + segy2hist(hdr, SF_NKEYS); + + char* tname = sf_getstring("tfile"); + /* output trace header file */ + if (NULL != out) sf_putstring(out, "head", (NULL != tname) ? tname : "tfile"); + } else { + segy_init(SF_NKEYS, NULL); + } + + if (NULL != out) sf_fileflush(out, NULL); + + /* ---- text / binary SEG-Y headers ---- */ + char* hname = sf_getstring("hfile"); + /* output SEG-Y EBCDIC text header file */ + if (NULL != hname) { + char ahead[SF_EBCBYTES]; + if (mdio_get_text_header(ds, ahead)) { + FILE* fp = fopen(hname, "w"); + if (NULL == fp) sf_error("Cannot open hfile \"%s\"", hname); + fwrite(ahead, 1, SF_EBCBYTES, fp); + fclose(fp); + if (verb) sf_warning("Text header written to \"%s\"", hname); + } else if (verb) { + sf_warning("No text header in dataset; hfile not written"); + } + } + + char* bname = sf_getstring("bfile"); + /* output SEG-Y binary header file */ + if (NULL != bname) { + char bhead[SF_BNYBYTES]; + if (mdio_get_binary_header(ds, bhead)) { + FILE* fp = fopen(bname, "wb"); + if (NULL == fp) sf_error("Cannot open bfile \"%s\"", bname); + fwrite(bhead, 1, SF_BNYBYTES, fp); + fclose(fp); + if (verb) sf_warning("Binary header written to \"%s\"", bname); + } else if (verb) { + sf_warning("No binary header in dataset; bfile not written"); + } + } + + /* ---- data ---- */ + if (NULL != out) { + std::vector draw; + if (!mdio_read_field(sliced, datavar, draw)) + sf_error("Failed to read data variable \"%s\"", datavar.c_str()); + + float* obuf = sf_floatalloc(total); + std::vector idx(rank); + for (long t = 0; t < total; t++) { + decode(t, rank, &m[0], &idx[0]); + long s = 0; + for (int a = 0; a < rank; a++) s = s * mc[a] + idx[a] * jp[a]; + obuf[t] = (s < (long) draw.size()) ? (float) draw[s] : 0.0f; + } + sf_floatwrite(obuf, total, out); + free(obuf); + } + + /* ---- headers / tfile ---- */ + if (NULL != hdr) { + /* trace-grid geometry (MDIO axes except the sample axis) */ + int tr = rank - 1; + std::vector mt(tr), mct(tr), jt(tr); + for (int a = 0; a < tr; a++) { mt[a] = m[a]; mct[a] = mc[a]; jt[a] = jp[a]; } + + std::vector > keyvals(SF_NKEYS); + for (int k = 0; k < SF_NKEYS; k++) { + const char* nm = segykeyword(k); + std::vector vals; + if (!mdio_read_header_key(sliced, std::string(path), slices, + headersVar, nm, vals)) + continue; + + std::vector dec((size_t) ntr, 0); + std::vector idx(tr > 0 ? tr : 1); + for (long t = 0; t < ntr; t++) { + long s = 0; + if (tr > 0) { + decode(t, tr, &mt[0], &idx[0]); + for (int a = 0; a < tr; a++) s = s * mct[a] + idx[a] * jt[a]; + } + dec[(size_t) t] = (s < (long) vals.size()) ? (int) vals[s] : 0; + } + keyvals[k] = dec; + } + + int* itrace = sf_intalloc(SF_NKEYS); + for (long t = 0; t < ntr; t++) { + for (int k = 0; k < SF_NKEYS; k++) + itrace[k] = keyvals[k].empty() ? 0 : keyvals[k][(size_t) t]; + sf_intwrite(itrace, SF_NKEYS, hdr); + } + free(itrace); + } + + exit(0); +} diff --git a/system/generic/Mmdiowrite.cc b/system/generic/Mmdiowrite.cc new file mode 100644 index 0000000000..6769188569 --- /dev/null +++ b/system/generic/Mmdiowrite.cc @@ -0,0 +1,371 @@ +/* Convert an RSF dataset to MDIO. + +Writes an RSF amplitude file (and, optionally, its SEG-Y trace headers tfile, +EBCDIC text header hfile, and 400-byte binary header bfile) to the MDIO +(Zarr) format through the mdio-cpp library. + +The output may be a local path, or a gs:// or s3:// URL (when mdio-cpp was built +with the corresponding cloud drivers). + +A header-copy option (headers= / like= pointing at another MDIO dataset) +copies the text header, binary header, and/or per-trace headers from that +dataset instead of taking them from tfile/hfile/bfile. The hdrcopy= selector +(text, binary, trace, or all) controls what is copied. +*/ + +#include +#include + +#include +#include +#include +#include + +#include "mdio2segy.hh" + +/* Sanitize an RSF axis label into a valid, unique MDIO dimension name. */ +static std::string clean_name(const char* label, int rsfaxis, + std::vector& used) +{ + std::string s; + if (label) { + for (const char* p = label; *p; p++) + s += (isalnum((unsigned char) *p) || *p == '_') ? *p : '_'; + } + if (s.empty()) { char b[16]; snprintf(b, sizeof(b), "dim%d", rsfaxis); s = b; } + + std::string base = s; + int suffix = 1; + bool clash = true; + while (clash) { + clash = false; + for (size_t i = 0; i < used.size(); i++) + if (used[i] == s) { clash = true; break; } + if (clash) { char b[16]; snprintf(b, sizeof(b), "_%d", suffix++); s = base + b; } + } + used.push_back(s); + return s; +} + +static bool write_float_buf(mdio::Dataset& ds, const std::string& name, + const float* buf, long total) +{ + auto vr = ds.variables.get(name); + if (!vr.status().ok()) return false; + auto var = vr.value(); + auto vdr = mdio::from_variable(var); + if (!vdr.status().ok()) return false; + auto vd = vdr.value(); + long n = (long) vd.num_samples(); + auto off = vd.get_flattened_offset(); + float* p = static_cast(vd.get_data_accessor().data()); + for (long i = 0; i < n && i < total; i++) p[off + i] = buf[i]; + return var.Write(vd).status().ok(); +} + +static bool write_int_var(mdio::Dataset& ds, const std::string& name, + const std::vector& col) +{ + auto vr = ds.variables.get(name); + if (!vr.status().ok()) return false; + auto var = vr.value(); + auto vdr = mdio::from_variable(var); + if (!vdr.status().ok()) return false; + auto vd = vdr.value(); + long n = (long) vd.num_samples(); + auto off = vd.get_flattened_offset(); + mdio::dtypes::int32_t* p = + static_cast(vd.get_data_accessor().data()); + for (long i = 0; i < n && i < (long) col.size(); i++) + p[off + i] = (mdio::dtypes::int32_t) col[(size_t) i]; + return var.Write(vd).status().ok(); +} + +int main(int argc, char* argv[]) +{ + sf_init(argc, argv); + + bool verb; + if (!sf_getbool("verb", &verb)) verb = false; + /* Verbosity flag */ + + sf_file in = sf_input("in"); + if (SF_FLOAT != sf_gettype(in)) sf_error("Need float input"); + + char* path = sf_getstring("mdio"); + /* output MDIO dataset (path or gs://, s3:// URL) */ + if (NULL == path) sf_error("Need mdio="); + + char* dataname = sf_getstring("data"); + /* name of the MDIO data variable (default "seismic") */ + std::string datavar = (NULL != dataname) ? dataname : "seismic"; + + /* header-copy source dataset */ + char* srcpath = sf_getstring("headers"); + if (NULL == srcpath) srcpath = sf_getstring("like"); + /* another MDIO dataset to copy text/binary/trace headers from */ + + const char* hdrcopy = sf_getstring("hdrcopy"); + /* what to copy from headers=/like=: text, binary, trace, or all */ + if (NULL == hdrcopy) hdrcopy = "all"; + std::string hc = hdrcopy; + bool cp_text = (hc.find("text") != std::string::npos) || + (hc.find("all") != std::string::npos); + bool cp_binary = (hc.find("binary") != std::string::npos) || + (hc.find("all") != std::string::npos); + bool cp_trace = (hc.find("trace") != std::string::npos) || + (hc.find("all") != std::string::npos); + + /* ---- RSF geometry ---- */ + off_t n[SF_MAX_DIM]; + int dim = sf_largefiledims(in, n); + long ns = (long) n[0]; + long total = 1; + for (int i = 0; i < dim; i++) total *= (long) n[i]; + long ntr = total / ns; + + /* MDIO axes (slowest-first, sample axis last) = reverse of RSF axes. */ + std::vector axes(dim); + std::vector used; + for (int a = 0; a < dim; a++) { + int ri = dim - a; /* RSF axis number (1-based) */ + char key[8]; + MdioAxis ax; + ax.size = (long) n[ri - 1]; + float fo = 0., fd = 1.; + snprintf(key, sizeof(key), "o%d", ri); + ax.o = sf_histfloat(in, key, &fo) ? (double) fo : 0.; + snprintf(key, sizeof(key), "d%d", ri); + ax.d = sf_histfloat(in, key, &fd) ? (double) fd : 1.; + snprintf(key, sizeof(key), "label%d", ri); + char* lab = sf_histstring(in, key); + ax.label = clean_name(lab, ri, used); + if (lab) free(lab); + snprintf(key, sizeof(key), "unit%d", ri); + char* un = sf_histstring(in, key); + if (un) { ax.unit = un; free(un); } + ax.sample = (ri == 1); + axes[a] = ax; + } + + double o1 = axes[dim - 1].o; /* sample-axis origin -> delrt */ + + /* ---- trace headers from tfile ---- */ + std::vector keynames; + std::vector > columns; + + char* tname = sf_getstring("tfile"); + /* input trace header file (from sfsegyread or sfsegyheader) */ + if (NULL != tname) { + sf_file hin = sf_input("tfile"); + int nk; + if (!sf_histint(hin, "n1", &nk)) sf_error("No n1= in tfile"); + long thtr = sf_leftsize(hin, 1); + if (thtr != ntr) + sf_warning("tfile has %ld traces, data has %ld", thtr, ntr); + long use = (thtr < ntr) ? thtr : ntr; + + segy_init(nk, hin); + int* all = sf_intalloc(nk * (int) use); + /* read only the traces we will use */ + sf_intread(all, (size_t) nk * use, hin); + + for (int k = 0; k < nk && k < SF_NKEYS; k++) { + const char* nm = segykeyword(k); + bool nonzero = false; + std::vector col((size_t) ntr, 0); + for (long t = 0; t < use; t++) { + int v = all[(size_t) t * nk + k]; + col[(size_t) t] = v; + if (v) nonzero = true; + } + if (nonzero) { keynames.push_back(nm); columns.push_back(col); } + } + free(all); + sf_fileclose(hin); + } else { + segy_init(SF_NKEYS, NULL); + } + + /* ---- open header-copy source (if any) ---- */ + mdio::Dataset* srcds = NULL; + std::string srcHeadersVar; + if (NULL != srcpath) { + auto sf = mdio::Dataset::Open(std::string(srcpath), mdio::constants::kOpen); + if (!sf.status().ok()) sf_error("Cannot open headers source \"%s\"", srcpath); + srcds = new mdio::Dataset(sf.value()); + srcHeadersVar = mdio_headers_variable(*srcds, NULL); + } + + /* If copying trace headers, replace tfile columns with source values. */ + if (NULL != srcds && cp_trace) { + std::vector > nosl; + keynames.clear(); + columns.clear(); + for (int k = 0; k < SF_NKEYS; k++) { + const char* nm = segykeyword(k); + std::vector vals; + if (!mdio_read_header_key(*srcds, std::string(srcpath), nosl, + srcHeadersVar, nm, vals)) continue; + if ((long) vals.size() != ntr) continue; + std::vector col((size_t) ntr, 0); + for (long t = 0; t < ntr; t++) col[(size_t) t] = (int) vals[(size_t) t]; + keynames.push_back(nm); + columns.push_back(col); + } + } + + /* Force delrt to reflect the sample-axis origin (mirrors sfsegywrite). */ + { + int dk = segykey("delrt"); + bool found = false; + for (size_t i = 0; i < keynames.size(); i++) + if (segykey(keynames[i].c_str()) == dk) { + for (long t = 0; t < ntr; t++) + columns[i][(size_t) t] = (int)(1000. * o1); + found = true; + break; + } + if (!found && o1 != 0.) { + std::vector col((size_t) ntr, (int)(1000. * o1)); + keynames.push_back("delrt"); + columns.push_back(col); + } + } + + /* ---- chunking strategy / sizes ---- */ + /* axes[] is in MDIO order (slowest first, sample axis last); RSF axis ri + maps to MDIO axis a = dim - ri. */ + char* chunkstr = sf_getstring("chunk"); + /* chunk size or named strategy for the data variable: an integer applied to + every axis (0 or >= axis size means a single full-length chunk), or one of + auto (default, min(size,128) per axis), full (whole array in one chunk), or + trace (chunk only the sample axis, full-length trace axes). Per-axis + overrides chunk1=,chunk2=,... (RSF axis order; axis 1 = samples) take + precedence. */ + std::string strat = (NULL != chunkstr) ? chunkstr : "auto"; + + bool strat_named = (strat == "auto" || strat == "full" || strat == "trace"); + long strat_int = 0; + if (!strat_named) strat_int = atol(strat.c_str()); + + std::vector chunk(dim); + for (int a = 0; a < dim; a++) { + long size = axes[a].size; + long c; + if (strat == "full") { + c = size; + } else if (strat == "trace") { + c = axes[a].sample ? ((size < 128) ? size : 128) : size; + } else if (!strat_named) { /* integer default for all axes */ + c = (strat_int <= 0 || strat_int >= size) ? size : strat_int; + } else { /* "auto" */ + c = (size < 128) ? size : 128; + } + chunk[a] = c; + } + + /* per-axis overrides chunk1=, chunk2=, ... (RSF axis order) */ + for (int ri = 1; ri <= dim; ri++) { + char key[16]; + int v; + snprintf(key, sizeof(key), "chunk%d", ri); + if (sf_getint(key, &v)) { + int a = dim - ri; + long size = axes[a].size; + chunk[a] = (v <= 0 || v >= size) ? size : v; + } + } + + /* ---- build schema and create dataset ---- */ + + char* dname = sf_getstring("name"); + std::string dsname = (NULL != dname) ? dname : "Madagascar MDIO"; + + nlohmann::json schema = + mdio_build_schema(dsname, axes, datavar, keynames, chunk); + + /* text / binary headers into the dataset metadata */ + char ahead[SF_EBCBYTES]; + bool have_text = false; + if (NULL != srcds && cp_text && mdio_get_text_header(*srcds, ahead)) { + have_text = true; + } else { + char* hname = sf_getstring("hfile"); + /* input SEG-Y EBCDIC text header file */ + if (NULL != hname) { + FILE* fp = fopen(hname, "r"); + if (NULL != fp) { + memset(ahead, ' ', SF_EBCBYTES); + fread(ahead, 1, SF_EBCBYTES, fp); + fclose(fp); + have_text = true; + } + } + } + if (have_text) mdio_put_text_header(schema["metadata"], ahead); + + char bhead[SF_BNYBYTES]; + bool have_bin = false; + if (NULL != srcds && cp_binary && mdio_get_binary_header(*srcds, bhead)) { + have_bin = true; + } else { + char* bname = sf_getstring("bfile"); + /* input SEG-Y binary header file */ + if (NULL != bname) { + FILE* fp = fopen(bname, "rb"); + if (NULL != fp) { + memset(bhead, 0, SF_BNYBYTES); + fread(bhead, 1, SF_BNYBYTES, fp); + fclose(fp); + have_bin = true; + } + } + } + if (have_bin) mdio_put_binary_header(schema["metadata"], bhead); + + if (verb) sf_warning("Creating MDIO \"%s\" (%ld samples x %ld traces)", + path, ns, ntr); + + auto dsFut = mdio::Dataset::from_json(schema, std::string(path), + mdio::constants::kCreate); + if (!dsFut.status().ok()) + sf_error("Failed to create MDIO \"%s\": %s", path, + dsFut.status().ToString().c_str()); + mdio::Dataset ds = dsFut.value(); + + /* ---- write dimension coordinates (uniform o + i*d) ---- */ + for (int a = 0; a < dim; a++) { + auto vr = ds.variables.get(axes[a].label); + if (!vr.status().ok()) continue; + auto var = vr.value(); + auto vdr = mdio::from_variable(var); + if (!vdr.status().ok()) continue; + auto vd = vdr.value(); + long nc = (long) vd.num_samples(); + auto off = vd.get_flattened_offset(); + float* p = static_cast(vd.get_data_accessor().data()); + for (long i = 0; i < nc; i++) p[off + i] = (float)(axes[a].o + i * axes[a].d); + var.Write(vd); + } + + /* ---- write data (RSF order matches MDIO row-major order) ---- */ + { + float* buf = sf_floatalloc(total); + sf_floatread(buf, total, in); + if (!write_float_buf(ds, datavar, buf, total)) + sf_error("Failed to write data variable \"%s\"", datavar.c_str()); + free(buf); + } + + /* ---- write trace-header variables ---- */ + for (size_t i = 0; i < keynames.size(); i++) { + if (!write_int_var(ds, keynames[i], columns[i]) && verb) + sf_warning("Could not write header variable \"%s\"", + keynames[i].c_str()); + } + + if (verb) sf_warning("Done"); + + exit(0); +} diff --git a/system/generic/SConstruct b/system/generic/SConstruct index e6f8919118..78f730c9dd 100644 --- a/system/generic/SConstruct +++ b/system/generic/SConstruct @@ -89,6 +89,59 @@ for prog in Split('gaussshape2'): env.Object('Test' + prog + '.c') env.Program(sources,PROGPREFIX='',PROGSUFFIX='.x') +###################################################################### +# MDIO (mdio-cpp) C++ PROGRAMS +###################################################################### +# Read/write the TGS MDIO (Zarr) format through a prebuilt mdio-cpp +# (https://github.com/TGSAI/mdio-cpp). Built only when mdio-cpp was detected +# at configure time (MDIO=y), otherwise a placeholder is installed. +mdio_mains = Split('mdioread mdiowrite') + +if env.get('MDIO'): + menv = env.Clone() + + # mdio-cpp needs C++17 plus the compile definitions captured at build time + # (e.g. -DMAX_NUM_SLICES); MDIO_CXXFLAGS carries both. + mdio_cxxflags = menv.get('MDIO_CXXFLAGS','') + if mdio_cxxflags: + menv.Append(CXXFLAGS=' '+mdio_cxxflags) + elif '-std=' not in menv.get('CXXFLAGS',''): + menv.Append(CXXFLAGS=' -std=c++17') + + mdio_cpppath = menv.get('MDIO_CPPPATH',[]) + mdio_libpath = menv.get('MDIO_LIBPATH',[]) + mdio_libs = menv.get('MDIO_LIBS',['mdio']) + mdio_linkflags = menv.get('MDIO_LINKFLAGS','') + if type(mdio_cpppath) is not list: mdio_cpppath = mdio_cpppath.split(',') + if type(mdio_libpath) is not list: mdio_libpath = mdio_libpath.split(',') + if type(mdio_libs) is not list: mdio_libs = mdio_libs.split(',') + + # rsfsegy provides the SEG-Y key table and ebc2asc/asc2ebc helpers. + menv.Prepend(LIBS=[dynlib+'rsfsegy', dynlib+'su']) + menv.Append(CPPPATH=mdio_cpppath, + LIBPATH=mdio_libpath, + LIBS=mdio_libs) + # tensorstore/abseil have circular static-archive dependencies; the install's + # mdio.env wraps them in -Wl,--start-group ... -Wl,--end-group via + # MDIO_LINKFLAGS. These archives must appear AFTER the program objects on the + # link line, so append them to the end of LINKCOM (LINKFLAGS would place them + # before $SOURCES and leave the objects' mdio symbols unresolved). + if mdio_linkflags: + menv['LINKCOM'] = '%s %s' % (menv['LINKCOM'], mdio_linkflags) + + helper = menv.StaticObject('mdio2segy.cc') + + for prog in mdio_mains: + obj = menv.StaticObject('M'+prog+'.cc') + menv.Depends(obj,'mdio2segy.hh') + main = menv.Program(prog,[obj,helper]) + if root: + menv.Install(bindir,main[0]) +elif root: + for prog in mdio_mains: + main = env.RSF_Place('sf'+prog,None,var='MDIO',package='mdio-cpp') + env.Install(bindir,main) + ###################################################################### # SELF-DOCUMENTATION ###################################################################### @@ -96,6 +149,7 @@ if root: main = 'sfgeneric.py' docs = [env.Doc(prog,'M' + prog) for prog in mains] + docs += [env.Doc(prog,'M' + prog + '.cc') for prog in mdio_mains] env.Depends(docs,'#/framework/rsf/doc.py') doc = env.RSF_Docmerge(main,docs,alias=docalias) diff --git a/system/generic/mdio2segy.cc b/system/generic/mdio2segy.cc new file mode 100644 index 0000000000..f61cd3a5ba --- /dev/null +++ b/system/generic/mdio2segy.cc @@ -0,0 +1,393 @@ +/* Helpers translating between the MDIO (mdio-cpp) data model and the + Madagascar SEG-Y / RSF representation. See mdio2segy.hh. */ + +#include +#include +#include +#include +#include + +#include "mdio2segy.hh" + +/* ------------------------------------------------------------------ */ +/* SEG-Y key name -> candidate MDIO field/variable names */ +/* ------------------------------------------------------------------ */ +static std::vector header_aliases(const std::string& key) +{ + std::vector a; + a.push_back(key); + if (key == "iline") { a.push_back("inline"); a.push_back("inline_number"); + a.push_back("il"); } + else if (key == "xline") { a.push_back("crossline"); + a.push_back("crossline_number"); a.push_back("xl"); } + else if (key == "cdp") { a.push_back("cdp_number"); a.push_back("ensemble"); } + else if (key == "sx") { a.push_back("source_coord_x"); a.push_back("source_x"); + a.push_back("sou_x"); } + else if (key == "sy") { a.push_back("source_coord_y"); a.push_back("source_y"); + a.push_back("sou_y"); } + else if (key == "gx") { a.push_back("group_coord_x"); a.push_back("rec_x"); } + else if (key == "gy") { a.push_back("group_coord_y"); a.push_back("rec_y"); } + else if (key == "cdpx"){ a.push_back("cdp_x"); } + else if (key == "cdpy"){ a.push_back("cdp_y"); } + return a; +} + +/* ------------------------------------------------------------------ */ +/* Read a whole variable into doubles, trying common MDIO data types. */ +/* ------------------------------------------------------------------ */ +bool mdio_read_field(mdio::Dataset& ds, const std::string& name, + std::vector& out) +{ +#define MDIO_TRY_READ(MT) \ + do { \ + auto vr = ds.variables.get(name); \ + if (vr.status().ok()) { \ + auto var = vr.value(); \ + auto fut = var.Read(); \ + if (fut.status().ok()) { \ + auto vd = fut.value(); \ + long n = (long) vd.num_samples(); \ + auto off = vd.get_flattened_offset(); \ + const MT* p = \ + static_cast(vd.get_data_accessor().data()); \ + out.resize((size_t) n); \ + for (long i = 0; i < n; i++) \ + out[(size_t) i] = (double) p[off + i]; \ + return true; \ + } \ + } \ + } while (0) + + MDIO_TRY_READ(mdio::dtypes::int32_t); + MDIO_TRY_READ(mdio::dtypes::int16_t); + MDIO_TRY_READ(mdio::dtypes::int64_t); + MDIO_TRY_READ(mdio::dtypes::uint32_t); + MDIO_TRY_READ(mdio::dtypes::uint16_t); + MDIO_TRY_READ(mdio::dtypes::float32_t); + MDIO_TRY_READ(mdio::dtypes::float64_t); + +#undef MDIO_TRY_READ + return false; +} + +bool mdio_coord(mdio::Dataset& ds, const std::string& label, + double* o, double* d) +{ + std::vector v; + if (!mdio_read_field(ds, label, v) || v.empty()) return false; + *o = v[0]; + *d = (v.size() > 1) ? (v[1] - v[0]) : 1.0; + return true; +} + +/* ------------------------------------------------------------------ */ +/* Variable resolution */ +/* ------------------------------------------------------------------ */ +static bool has_empty_label(mdio::Dataset& ds, const std::string& name) +{ + auto vr = ds.variables.at(name); + if (!vr.status().ok()) return false; + auto dom = vr.value().dimensions(); + int rank = (int) dom.rank(); + auto labels = dom.labels(); + for (int i = 0; i < rank; i++) + if (std::string(labels[i]).empty()) return true; + return false; +} + +std::string mdio_data_variable(mdio::Dataset& ds, const char* given) +{ + if (given && *given) return std::string(given); + + auto names = ds.variables.get_iterable_accessor(); + + /* Prefer conventional data-variable names. */ + const char* preferred[] = {"seismic", "amplitude", "data", NULL}; + for (int k = 0; preferred[k]; k++) { + std::string p = preferred[k]; + if (ds.variables.contains_key(p) && !has_empty_label(ds, p) && + ds.variables.get(p).status().ok()) + return p; + } + + /* Otherwise pick the float, non-structured variable of largest rank. */ + std::string best; + int bestrank = 1; + for (size_t i = 0; i < names.size(); i++) { + const std::string& n = names[i]; + auto vr = ds.variables.at(n); + if (!vr.status().ok()) continue; + int rank = (int) vr.value().dimensions().rank(); + if (rank < 2 || has_empty_label(ds, n)) continue; + if (!ds.variables.get(n).status().ok()) continue; + if (rank > bestrank) { best = n; bestrank = rank; } + } + return best; +} + +std::string mdio_headers_variable(mdio::Dataset& ds, const char* given) +{ + if (given && *given) return std::string(given); + + const char* common[] = {"headers", "trace_headers", "TraceHeaders", NULL}; + for (int k = 0; common[k]; k++) + if (ds.variables.contains_key(common[k])) return std::string(common[k]); + + /* Any structured variable carries an unlabeled byte dimension. */ + auto names = ds.variables.get_iterable_accessor(); + for (size_t i = 0; i < names.size(); i++) + if (has_empty_label(ds, names[i])) return names[i]; + + return ""; +} + +static std::string header_field_name(mdio::Dataset& ds, const char* key) +{ + std::vector cands = header_aliases(key); + for (size_t i = 0; i < cands.size(); i++) + if (ds.variables.contains_key(cands[i])) return cands[i]; + return ""; +} + +bool mdio_read_header_key(mdio::Dataset& ds, const std::string& path, + const std::vector >& slices, + const std::string& headersVar, const char* key, + std::vector& out) +{ + /* Layout 1: one variable per SEG-Y field. */ + std::string f = header_field_name(ds, key); + if (!f.empty()) return mdio_read_field(ds, f, out); + + /* Layout 2: structured headers variable, extract field with SelectField. + SelectField mutates the dataset it is called on, so we operate on a + freshly re-opened (and re-sliced) dataset per field. */ + if (headersVar.empty()) return false; + + std::vector cands = header_aliases(key); + for (size_t i = 0; i < cands.size(); i++) { + auto f2 = mdio::Dataset::Open(path, mdio::constants::kOpen); + if (!f2.status().ok()) continue; + mdio::Dataset d2 = f2.value(); + + mdio::Dataset sl = d2; + if (!slices.empty()) { + auto r = d2.isel(slices); + if (!r.status().ok()) continue; + sl = r.value(); + } + + auto sf = sl.SelectField(headersVar, cands[i]); + if (!sf.status().ok()) continue; + + if (mdio_read_field(sl, headersVar, out)) return true; + } + return false; +} + +/* ------------------------------------------------------------------ */ +/* Axis geometry */ +/* ------------------------------------------------------------------ */ +std::vector mdio_axes(mdio::Dataset& ds, const std::string& datavar) +{ + std::vector axes; + auto vr = ds.variables.at(datavar); + if (!vr.status().ok()) return axes; + + auto dom = vr.value().dimensions(); + int rank = (int) dom.rank(); + auto labels = dom.labels(); + auto shape = dom.shape(); + + for (int i = 0; i < rank; i++) { + MdioAxis a; + a.label = std::string(labels[i]); + a.size = (long) shape[i]; + a.o = 0.0; + a.d = 1.0; + a.sample = (i == rank - 1); + double o, d; + if (mdio_coord(ds, a.label, &o, &d)) { a.o = o; a.d = d; } + axes.push_back(a); + } + return axes; +} + +/* ------------------------------------------------------------------ */ +/* Text / binary SEG-Y headers in the dataset metadata */ +/* ------------------------------------------------------------------ */ +static const nlohmann::json* find_node(const nlohmann::json& meta, + const char* name) +{ + if (meta.contains("attributes") && meta["attributes"].contains(name)) + return &meta["attributes"][name]; + if (meta.contains(name)) return &meta[name]; + return NULL; +} + +bool mdio_get_text_header(mdio::Dataset& ds, char ahead[SF_EBCBYTES]) +{ + const nlohmann::json& meta = ds.getMetadata(); + const nlohmann::json* node = find_node(meta, "textHeader"); + if (NULL == node) return false; + + memset(ahead, ' ', SF_EBCBYTES); + if (node->is_string()) { + std::string s = node->get(); + memcpy(ahead, s.data(), std::min((size_t) SF_EBCBYTES, s.size())); + } else if (node->is_array()) { + /* A list of (usually 40) 80-character cards. */ + int off = 0; + for (size_t i = 0; i < node->size() && off < SF_EBCBYTES; i++) { + std::string s = (*node)[i].get(); + int n = (int) std::min((size_t)(SF_EBCBYTES - off), + std::min((size_t) 80, s.size())); + memcpy(ahead + off, s.data(), n); + off += 80; + } + } else { + return false; + } + return true; +} + +bool mdio_get_binary_header(mdio::Dataset& ds, char bhead[SF_BNYBYTES]) +{ + const nlohmann::json& meta = ds.getMetadata(); + const nlohmann::json* node = find_node(meta, "binaryHeader"); + if (NULL == node) return false; + + memset(bhead, 0, SF_BNYBYTES); + if (node->is_array()) { + for (size_t i = 0; i < node->size() && i < (size_t) SF_BNYBYTES; i++) + bhead[i] = (char) (int) (*node)[i].get(); + } else if (node->is_string()) { + std::string s = node->get(); + memcpy(bhead, s.data(), std::min((size_t) SF_BNYBYTES, s.size())); + } else { + return false; + } + return true; +} + +void mdio_put_text_header(nlohmann::json& metadata, + const char ahead[SF_EBCBYTES]) +{ + metadata["attributes"]["textHeader"] = + std::string(ahead, (size_t) SF_EBCBYTES); +} + +void mdio_put_binary_header(nlohmann::json& metadata, + const char bhead[SF_BNYBYTES]) +{ + nlohmann::json arr = nlohmann::json::array(); + for (int i = 0; i < SF_BNYBYTES; i++) + arr.push_back((int) (unsigned char) bhead[i]); + metadata["attributes"]["binaryHeader"] = arr; +} + +/* ------------------------------------------------------------------ */ +/* Schema construction for new datasets */ +/* ------------------------------------------------------------------ */ +static std::string iso_now(void) +{ + char buf[32]; + time_t t = time(NULL); + struct tm* g = gmtime(&t); + strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S.000000Z", g); + return std::string(buf); +} + +nlohmann::json mdio_build_schema(const std::string& name, + const std::vector& axes, + const std::string& datavar, + const std::vector& keys, + const std::vector& chunk) +{ + nlohmann::json schema; + schema["metadata"]["apiVersion"] = "1.0.0"; + schema["metadata"]["name"] = name; + schema["metadata"]["createdOn"] = iso_now(); + + nlohmann::json vars = nlohmann::json::array(); + + /* Dimension coordinate variables (uniform: o + i*d). */ + for (size_t i = 0; i < axes.size(); i++) { + nlohmann::json v; + v["name"] = axes[i].label; + v["dataType"] = "float32"; + nlohmann::json dim; + dim["name"] = axes[i].label; + dim["size"] = axes[i].size; + v["dimensions"] = nlohmann::json::array({dim}); + /* NB: the MDIO v1 schema models unitsV1 as a structured quantity object + (e.g. {"time":"s"}, {"length":"m"}); a bare RSF unit string fails + validation, and the SEG-Y round trip does not depend on axis units, so + we omit unitsV1 here rather than risk an invalid schema. */ + vars.push_back(v); + } + + /* Trace-grid dimension labels (all axes but the sample axis). */ + std::vector tracelabels; + std::vector traceaxis; /* source axis index of each label */ + for (size_t i = 0; i < axes.size(); i++) + if (!axes[i].sample) { + tracelabels.push_back(axes[i].label); + traceaxis.push_back(i); + } + + /* Per-trace-axis chunk sizes for the header variables: leave the fast trace + axis (the last trace label, = RSF axis 2) as a single full-length chunk and + chunk the slower trace axes with the resolved data-variable chunk sizes. */ + std::vector tracechunk(tracelabels.size()); + for (size_t k = 0; k < tracelabels.size(); k++) { + size_t a = traceaxis[k]; + long size = axes[a].size; + long c; + if (k + 1 == tracelabels.size()) /* fast trace axis -> full */ + c = size; + else + c = (a < chunk.size() && chunk[a] > 0 && chunk[a] <= size) + ? chunk[a] : size; + tracechunk[k] = c; + } + + /* Data variable. */ + { + nlohmann::json v; + v["name"] = datavar; + v["dataType"] = "float32"; + nlohmann::json dims = nlohmann::json::array(); + for (size_t i = 0; i < axes.size(); i++) dims.push_back(axes[i].label); + v["dimensions"] = dims; + if (!chunk.empty()) { + nlohmann::json cs = nlohmann::json::array(); + for (size_t i = 0; i < chunk.size(); i++) cs.push_back(chunk[i]); + v["metadata"]["chunkGrid"]["name"] = "regular"; + v["metadata"]["chunkGrid"]["configuration"]["chunkShape"] = cs; + } + vars.push_back(v); + } + + /* One int32 trace-header variable per SEG-Y key. */ + for (size_t k = 0; k < keys.size(); k++) { + nlohmann::json v; + v["name"] = keys[k]; + v["dataType"] = "int32"; + nlohmann::json dims = nlohmann::json::array(); + for (size_t i = 0; i < tracelabels.size(); i++) + dims.push_back(tracelabels[i]); + v["dimensions"] = dims; + /* Chunk only when there is a slow trace axis to chunk; a lone (fast) + trace dimension is left unchunked, matching the 2D default. */ + if (tracelabels.size() >= 2) { + nlohmann::json cs = nlohmann::json::array(); + for (size_t i = 0; i < tracechunk.size(); i++) cs.push_back(tracechunk[i]); + v["metadata"]["chunkGrid"]["name"] = "regular"; + v["metadata"]["chunkGrid"]["configuration"]["chunkShape"] = cs; + } + vars.push_back(v); + } + + schema["variables"] = vars; + return schema; +} diff --git a/system/generic/mdio2segy.hh b/system/generic/mdio2segy.hh new file mode 100644 index 0000000000..c12d933583 --- /dev/null +++ b/system/generic/mdio2segy.hh @@ -0,0 +1,97 @@ +/* Shared helpers translating between the MDIO (mdio-cpp) data model and + the Madagascar SEG-Y / RSF representation used by sfmdioread and + sfmdiowrite. */ + +#ifndef _mdio2segy_hh +#define _mdio2segy_hh + +#include +#include + +#include +#include + +extern "C" { +#include +#include +} + +/* Geometry of one MDIO dimension, kept in the data variable's native order + (slowest first, the sample axis last). */ +struct MdioAxis { + std::string label; + long size; /* full extent in the file */ + double o; /* origin (first dimension-coordinate value) */ + double d; /* sampling (coordinate spacing) */ + std::string unit; + bool sample; /* true for the fastest axis -> RSF n1 */ +}; + +/* Resolve the principal (sample-bearing) data variable. When "given" is + non-empty it is returned verbatim; otherwise the first floating-point, + non-coordinate variable is chosen. Returns "" when none is found. */ +std::string mdio_data_variable(mdio::Dataset& ds, const char* given); + +/* Resolve the per-trace headers variable. When "given" is non-empty it is + returned verbatim; otherwise a variable named headers/trace_headers, or any + structured variable (one carrying an unlabeled "byte" dimension), is used. + Returns "" when none is found. */ +std::string mdio_headers_variable(mdio::Dataset& ds, const char* given); + +/* Read the values of SEG-Y key "key" over the (already sliced) trace grid. + Supports two MDIO header layouts: + 1. one int/float variable per SEG-Y field (read directly from "ds"); and + 2. a single structured headers variable, whose field is extracted with + Dataset::SelectField after re-opening and re-slicing "path"/"slices". + Returns false when no matching field is present. */ +bool mdio_read_header_key(mdio::Dataset& ds, const std::string& path, + const std::vector >& slices, + const std::string& headersVar, const char* key, + std::vector& out); + +/* Geometry of every dimension of the data variable, in MDIO order. */ +std::vector mdio_axes(mdio::Dataset& ds, const std::string& datavar); + +/* Read a full variable (coordinate or trace-header field) into a double + buffer, trying the common integer/float MDIO data types. The flattened + slice offset is applied, so the returned values are in logical order. + Returns false when the variable is absent or unreadable. */ +bool mdio_read_field(mdio::Dataset& ds, const std::string& name, + std::vector& out); + +/* Origin/sampling of one dimension, derived from its coordinate variable. + Returns false when no usable coordinate variable exists. */ +bool mdio_coord(mdio::Dataset& ds, const std::string& label, + double* o, double* d); + +/* SEG-Y text (3200-byte EBCDIC) and binary (400-byte) headers stored in the + dataset-level metadata attributes. Each returns false when absent. */ +bool mdio_get_text_header(mdio::Dataset& ds, char ahead[SF_EBCBYTES]); +bool mdio_get_binary_header(mdio::Dataset& ds, char bhead[SF_BNYBYTES]); + +/* Embed SEG-Y text/binary headers into a schema metadata object so they are + persisted with a newly created MDIO dataset. */ +void mdio_put_text_header(nlohmann::json& metadata, + const char ahead[SF_EBCBYTES]); +void mdio_put_binary_header(nlohmann::json& metadata, + const char bhead[SF_BNYBYTES]); + +/* Resolve the MDIO variable name that stores values for SEG-Y key "key", + honoring a small alias table for common MDIO field names. Returns "" when + the dataset exposes no matching trace-header variable. */ +std::string mdio_header_field(mdio::Dataset& ds, const char* key); + +/* Build an MDIO v1 schema (nlohmann::json) for a new dataset: + - axes: dimension geometry in MDIO order (sample axis last); + - datavar: name/dtype of the float data variable; + - keys: SEG-Y key names to materialize as int32 trace-header variables + (those sharing the non-sample trace dimensions); + - chunk: chunk shape (same rank/order as axes); + - name: dataset name. */ +nlohmann::json mdio_build_schema(const std::string& name, + const std::vector& axes, + const std::string& datavar, + const std::vector& keys, + const std::vector& chunk); + +#endif From d71edabb7563cbe766ea739819ebb87c923970f2 Mon Sep 17 00:00:00 2001 From: Mark Roberts Date: Tue, 2 Jun 2026 20:50:16 +0000 Subject: [PATCH 2/3] Update to mdio for python compatiability --- system/generic/Mmdiowrite.cc | 4 +- system/generic/mdio2segy.cc | 95 ++++++++++++++++++++++++++++++++---- system/generic/mdio2segy.hh | 16 +++--- 3 files changed, 97 insertions(+), 18 deletions(-) diff --git a/system/generic/Mmdiowrite.cc b/system/generic/Mmdiowrite.cc index 6769188569..11a308c1a9 100644 --- a/system/generic/Mmdiowrite.cc +++ b/system/generic/Mmdiowrite.cc @@ -303,7 +303,7 @@ int main(int argc, char* argv[]) } } } - if (have_text) mdio_put_text_header(schema["metadata"], ahead); + if (have_text) mdio_put_text_header(schema, ahead); char bhead[SF_BNYBYTES]; bool have_bin = false; @@ -322,7 +322,7 @@ int main(int argc, char* argv[]) } } } - if (have_bin) mdio_put_binary_header(schema["metadata"], bhead); + if (have_bin) mdio_put_binary_header(schema, bhead); if (verb) sf_warning("Creating MDIO \"%s\" (%ld samples x %ld traces)", path, ns, ntr); diff --git a/system/generic/mdio2segy.cc b/system/generic/mdio2segy.cc index f61cd3a5ba..b8c2b53620 100644 --- a/system/generic/mdio2segy.cc +++ b/system/generic/mdio2segy.cc @@ -95,6 +95,21 @@ static bool has_empty_label(mdio::Dataset& ds, const std::string& name) return false; } +/* True for a 1-D dimension coordinate (rank 1 whose single dimension label + equals the variable name, e.g. "inline" over dim "inline"). Such variables + hold axis values, not per-trace headers, so trace-header lookup must skip + them: in MDIO v1 the per-trace geometry lives in the structured "headers" + variable, while the same-named dimension coordinate only spans the axis. */ +static bool is_dim_coordinate(mdio::Dataset& ds, const std::string& name) +{ + auto vr = ds.variables.at(name); + if (!vr.status().ok()) return false; + auto dom = vr.value().dimensions(); + if ((int) dom.rank() != 1) return false; + auto labels = dom.labels(); + return std::string(labels[0]) == name; +} + std::string mdio_data_variable(mdio::Dataset& ds, const char* given) { if (given && *given) return std::string(given); @@ -145,7 +160,9 @@ static std::string header_field_name(mdio::Dataset& ds, const char* key) { std::vector cands = header_aliases(key); for (size_t i = 0; i < cands.size(); i++) - if (ds.variables.contains_key(cands[i])) return cands[i]; + if (ds.variables.contains_key(cands[i]) && + !is_dim_coordinate(ds, cands[i])) + return cands[i]; return ""; } @@ -155,6 +172,7 @@ bool mdio_read_header_key(mdio::Dataset& ds, const std::string& path, std::vector& out) { /* Layout 1: one variable per SEG-Y field. */ + std::string f = header_field_name(ds, key); if (!f.empty()) return mdio_read_field(ds, f, out); @@ -213,11 +231,34 @@ std::vector mdio_axes(mdio::Dataset& ds, const std::string& datavar) } /* ------------------------------------------------------------------ */ -/* Text / binary SEG-Y headers in the dataset metadata */ +/* Text / binary SEG-Y headers in the segy_file_header variable */ /* ------------------------------------------------------------------ */ +/* MDIO v1 stores the SEG-Y file headers as attributes of a scalar + "segy_file_header" variable. mdio-cpp surfaces a variable's user attributes + through Variable::getMetadata() under metadata/attributes (the same place we + write them in mdio_put_*); we also tolerate the bare .zattrs layout the + Python mdio package uses (textHeader/binaryHeader at the top level). */ +static const char* SEGY_FILE_HEADER_VAR = "segy_file_header"; + +static bool segy_file_header_meta(mdio::Dataset& ds, nlohmann::json& out) +{ + auto vr = ds.variables.at(SEGY_FILE_HEADER_VAR); + if (!vr.status().ok()) return false; + out = vr.value().getMetadata(); + return true; +} + static const nlohmann::json* find_node(const nlohmann::json& meta, const char* name) { + /* mdio-cpp shape: { "metadata": { "attributes": { : ... } } } */ + if (meta.contains("metadata") && meta["metadata"].is_object()) { + const nlohmann::json& m = meta["metadata"]; + if (m.contains("attributes") && m["attributes"].contains(name)) + return &m["attributes"][name]; + if (m.contains(name)) return &m[name]; + } + /* bare .zattrs shape (Python mdio): { "attributes": {...} } or top level. */ if (meta.contains("attributes") && meta["attributes"].contains(name)) return &meta["attributes"][name]; if (meta.contains(name)) return &meta[name]; @@ -226,13 +267,20 @@ static const nlohmann::json* find_node(const nlohmann::json& meta, bool mdio_get_text_header(mdio::Dataset& ds, char ahead[SF_EBCBYTES]) { - const nlohmann::json& meta = ds.getMetadata(); + nlohmann::json meta; + if (!segy_file_header_meta(ds, meta)) return false; const nlohmann::json* node = find_node(meta, "textHeader"); if (NULL == node) return false; memset(ahead, ' ', SF_EBCBYTES); if (node->is_string()) { - std::string s = node->get(); + /* A single string; the Python mdio package joins the 40 cards with + newlines, so drop those to recover the flat 3200-byte text. */ + std::string raw = node->get(); + std::string s; + s.reserve(raw.size()); + for (size_t i = 0; i < raw.size(); i++) + if (raw[i] != '\n' && raw[i] != '\r') s.push_back(raw[i]); memcpy(ahead, s.data(), std::min((size_t) SF_EBCBYTES, s.size())); } else if (node->is_array()) { /* A list of (usually 40) 80-character cards. */ @@ -252,7 +300,8 @@ bool mdio_get_text_header(mdio::Dataset& ds, char ahead[SF_EBCBYTES]) bool mdio_get_binary_header(mdio::Dataset& ds, char bhead[SF_BNYBYTES]) { - const nlohmann::json& meta = ds.getMetadata(); + nlohmann::json meta; + if (!segy_file_header_meta(ds, meta)) return false; const nlohmann::json* node = find_node(meta, "binaryHeader"); if (NULL == node) return false; @@ -269,20 +318,46 @@ bool mdio_get_binary_header(mdio::Dataset& ds, char bhead[SF_BNYBYTES]) return true; } -void mdio_put_text_header(nlohmann::json& metadata, +/* Return a mutable reference to the attributes object of the segy_file_header + variable in a new-dataset schema, creating the variable (a 1-element scalar + int32 over its own dimension) on first use. */ +static nlohmann::json& segy_file_header_attributes(nlohmann::json& schema) +{ + if (!schema.contains("variables") || !schema["variables"].is_array()) + schema["variables"] = nlohmann::json::array(); + + nlohmann::json& vars = schema["variables"]; + for (size_t i = 0; i < vars.size(); i++) + if (vars[i].contains("name") && vars[i]["name"] == SEGY_FILE_HEADER_VAR) + return vars[i]["metadata"]["attributes"]; + + nlohmann::json v; + v["name"] = SEGY_FILE_HEADER_VAR; + v["dataType"] = "int32"; + nlohmann::json dim; + dim["name"] = SEGY_FILE_HEADER_VAR; + dim["size"] = 1; + v["dimensions"] = nlohmann::json::array({dim}); + v["metadata"]["attributes"] = nlohmann::json::object(); + vars.push_back(v); + return vars[vars.size() - 1]["metadata"]["attributes"]; +} + +void mdio_put_text_header(nlohmann::json& schema, const char ahead[SF_EBCBYTES]) { - metadata["attributes"]["textHeader"] = - std::string(ahead, (size_t) SF_EBCBYTES); + nlohmann::json& attrs = segy_file_header_attributes(schema); + attrs["textHeader"] = std::string(ahead, (size_t) SF_EBCBYTES); } -void mdio_put_binary_header(nlohmann::json& metadata, +void mdio_put_binary_header(nlohmann::json& schema, const char bhead[SF_BNYBYTES]) { nlohmann::json arr = nlohmann::json::array(); for (int i = 0; i < SF_BNYBYTES; i++) arr.push_back((int) (unsigned char) bhead[i]); - metadata["attributes"]["binaryHeader"] = arr; + nlohmann::json& attrs = segy_file_header_attributes(schema); + attrs["binaryHeader"] = arr; } /* ------------------------------------------------------------------ */ diff --git a/system/generic/mdio2segy.hh b/system/generic/mdio2segy.hh index c12d933583..1673f14473 100644 --- a/system/generic/mdio2segy.hh +++ b/system/generic/mdio2segy.hh @@ -64,16 +64,20 @@ bool mdio_read_field(mdio::Dataset& ds, const std::string& name, bool mdio_coord(mdio::Dataset& ds, const std::string& label, double* o, double* d); -/* SEG-Y text (3200-byte EBCDIC) and binary (400-byte) headers stored in the - dataset-level metadata attributes. Each returns false when absent. */ +/* SEG-Y text (3200-byte) and binary (400-byte) headers. In the MDIO v1 layout + they live in the metadata (.zattrs "attributes") of the scalar + "segy_file_header" variable: textHeader (string) and binaryHeader (byte + array). Each returns false when absent. */ bool mdio_get_text_header(mdio::Dataset& ds, char ahead[SF_EBCBYTES]); bool mdio_get_binary_header(mdio::Dataset& ds, char bhead[SF_BNYBYTES]); -/* Embed SEG-Y text/binary headers into a schema metadata object so they are - persisted with a newly created MDIO dataset. */ -void mdio_put_text_header(nlohmann::json& metadata, +/* Embed SEG-Y text/binary headers into a new-dataset schema (the whole schema, + i.e. the object with "variables"). They are attached as attributes of the + v1 "segy_file_header" variable (created on first use), matching where the + Python mdio package stores them. */ +void mdio_put_text_header(nlohmann::json& schema, const char ahead[SF_EBCBYTES]); -void mdio_put_binary_header(nlohmann::json& metadata, +void mdio_put_binary_header(nlohmann::json& schema, const char bhead[SF_BNYBYTES]); /* Resolve the MDIO variable name that stores values for SEG-Y key "key", From b45ee78f8e89b6127aebec98f0fdb461019339da Mon Sep 17 00:00:00 2001 From: Mark Roberts Date: Fri, 5 Jun 2026 19:12:15 +0000 Subject: [PATCH 3/3] Fix linting issues. --- framework/configure.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/framework/configure.py b/framework/configure.py index 7f40c4ee25..227950d244 100644 --- a/framework/configure.py +++ b/framework/configure.py @@ -1923,11 +1923,11 @@ def mdio(context): linkflags = context.env.get('MDIO_LINKFLAGS', os.environ.get('MDIO_LINKFLAGS')) cxxflags = context.env.get('MDIO_CXXFLAGS', os.environ.get('MDIO_CXXFLAGS')) - if cpppath and type(cpppath) is not list: + if cpppath and not isinstance(cpppath, list): cpppath = cpppath.split(',') - if libpath and type(libpath) is not list: + if libpath and not isinstance(libpath, list): libpath = libpath.split(',') - if libs and type(libs) is not list: + if libs and not isinstance(libs, list): libs = libs.split(',') if not cpppath: