diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index 23b61a1c..646e6aa9 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -701,69 +701,71 @@ private void flushZoneMaps() throws IOException { if (chunks.isEmpty()) { continue; } - int nZones = chunks.size(); - boolean[] allValid = new boolean[nZones]; - java.util.Arrays.fill(allValid, true); - - // NULL_COUNT is computable for every column type; MIN/MAX only for fixed-width - // primitives whose chunks all carry stats. Field/bit order follows - // ZonedStatsSchema: MAX(3), MIN(4), NULL_COUNT(6); each stat field is nullable. - DType colDtype = schema.fieldTypes().get(schema.fieldNames().indexOf(colName)); - boolean hasMinMax = colDtype instanceof DType.Primitive - && chunks.stream().allMatch(ChunkRef::hasStats); - - List names = new java.util.ArrayList<>(); - List types = new java.util.ArrayList<>(); - List fields = new java.util.ArrayList<>(); - if (hasMinMax) { - PType ptype = ((DType.Primitive) colDtype).ptype(); - DType nullablePrim = new DType.Primitive(ptype, true); - boolean[] notTruncated = new boolean[nZones]; - names.add("max"); - types.add(nullablePrim); - fields.add(new NullableData(statColumn(ptype, chunks, true), allValid.clone())); - names.add("max_is_truncated"); - types.add(new DType.Bool(false)); - fields.add(notTruncated); - names.add("min"); - types.add(nullablePrim); - fields.add(new NullableData(statColumn(ptype, chunks, false), allValid.clone())); - names.add("min_is_truncated"); - types.add(new DType.Bool(false)); - fields.add(notTruncated.clone()); - } - long[] nullCounts = new long[nZones]; - for (int i = 0; i < nZones; i++) { + DType minMaxDtype = zoneMinMaxDtype(columnDtype(colName)); + boolean hasMinMax = minMaxDtype != null && chunks.stream().allMatch(ChunkRef::hasStats); + long[] nullCounts = new long[chunks.size()]; + for (int i = 0; i < chunks.size(); i++) { nullCounts[i] = chunks.get(i).nullCount(); } - names.add("null_count"); - types.add(new DType.Primitive(PType.U64, true)); - fields.add(new NullableData(nullCounts, allValid.clone())); - - DType.Struct statsDtype = new DType.Struct(List.copyOf(names), List.copyOf(types), false); - int zonesSegIdx = writeSegment(statsDtype, new StructData(fields), new StructEncodingEncoder()); - zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize(), hasMinMax)); + emitZoneMap(colName, hasMinMax ? minMaxDtype : null, + chunks.stream().map(ChunkRef::statsMin).toList(), + chunks.stream().map(ChunkRef::statsMax).toList(), + nullCounts); } - // Dict-encoded columns live in a separate path (one zone per code chunk); they carry - // NULL_COUNT only for now (no dict-level MIN/MAX yet). Matches Rust, which zone-maps dict - // columns (vortex.stats wrapping vortex.dict). + // Dict-encoded columns (one zone per code chunk). MIN/MAX come from each chunk's logical + // values (computed at dict-build time); NULL_COUNT always. Matches Rust, whose zone-map + // stats are computed on the logical column dtype, independent of the dict encoding. for (Map.Entry e : dictColRefs.entrySet()) { - // A dict column always has at least one code chunk, so null counts are non-empty. - long[] nullCounts = e.getValue().chunkNullCounts().stream().mapToLong(Long::longValue).toArray(); - writeNullCountZoneMap(e.getKey(), nullCounts); + DictColRef ref = e.getValue(); + DType minMaxDtype = zoneMinMaxDtype(columnDtype(e.getKey())); + boolean hasMinMax = minMaxDtype != null + && ref.chunkStatsMin().stream().allMatch(java.util.Objects::nonNull) + && ref.chunkStatsMax().stream().allMatch(java.util.Objects::nonNull); + long[] nullCounts = ref.chunkNullCounts().stream().mapToLong(Long::longValue).toArray(); + emitZoneMap(e.getKey(), hasMinMax ? minMaxDtype : null, + ref.chunkStatsMin(), ref.chunkStatsMax(), nullCounts); } } - /// Emits a NULL_COUNT-only `vortex.stats` zone-map (one zone per chunk) for `colName`. - private void writeNullCountZoneMap(String colName, long[] nullCounts) throws IOException { + private DType columnDtype(String colName) { + return schema.fieldTypes().get(schema.fieldNames().indexOf(colName)); + } + + /// Writes one `vortex.stats` zone-map for `colName`: one zone per chunk, with NULL_COUNT always + /// and MAX/MIN (plus always-false `_is_truncated` flags) when `minMaxDtype` is non-null. + /// `minBytes`/`maxBytes` hold each zone's serialised min/max scalar — read only when + /// `minMaxDtype` is set. Field/bit order follows ZonedStatsSchema: MAX(3), MIN(4), NULL_COUNT(6). + private void emitZoneMap(String colName, DType minMaxDtype, + List minBytes, List maxBytes, long[] nullCounts) throws IOException { int nZones = nullCounts.length; boolean[] allValid = new boolean[nZones]; java.util.Arrays.fill(allValid, true); - DType.Struct statsDtype = new DType.Struct( - List.of("null_count"), List.of(new DType.Primitive(PType.U64, true)), false); - StructData sd = new StructData(List.of(new NullableData(nullCounts, allValid))); - int zonesSegIdx = writeSegment(statsDtype, sd, new StructEncodingEncoder()); - zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize(), false)); + + List names = new java.util.ArrayList<>(); + List types = new java.util.ArrayList<>(); + List fields = new java.util.ArrayList<>(); + if (minMaxDtype != null) { + boolean[] notTruncated = new boolean[nZones]; + names.add("max"); + types.add(minMaxDtype); + fields.add(new NullableData(zoneStatValues(minMaxDtype, maxBytes), allValid.clone())); + names.add("max_is_truncated"); + types.add(new DType.Bool(false)); + fields.add(notTruncated); + names.add("min"); + types.add(minMaxDtype); + fields.add(new NullableData(zoneStatValues(minMaxDtype, minBytes), allValid.clone())); + names.add("min_is_truncated"); + types.add(new DType.Bool(false)); + fields.add(notTruncated.clone()); + } + names.add("null_count"); + types.add(new DType.Primitive(PType.U64, true)); + fields.add(new NullableData(nullCounts, allValid.clone())); + + DType.Struct statsDtype = new DType.Struct(List.copyOf(names), List.copyOf(types), false); + int zonesSegIdx = writeSegment(statsDtype, new StructData(fields), new StructEncodingEncoder()); + zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize(), minMaxDtype != null)); } /// Wraps a column's data layout in a `vortex.stats` (zoned) layout when a zone-map was @@ -804,50 +806,84 @@ private static long countNulls(boolean[] validity) { return nulls; } - /// Builds the per-zone min (or max) values array in the storage shape the primitive encoder - /// expects, decoding each chunk's serialised [ScalarValue] stat. - private static Object statColumn(PType ptype, List chunks, boolean max) throws IOException { - int n = chunks.size(); + /// The (nullable) dtype a zone-map stores per-zone min/max in for `dtype`, or `null` when the + /// column has no recordable min/max. Primitives store the primitive; extension columns unwrap + /// to their storage primitive (`ExtEncoding` propagates the storage min/max scalars unchanged); + /// Utf8 stores the full string value. Matches [ZonedStatsSchema#statDtype]. Binary is excluded: + /// `vortex.varbin` records its min/max as string scalars, not `bytes`. + private static DType zoneMinMaxDtype(DType dtype) { + return switch (dtype) { + case DType.Primitive p -> p.withNullable(true); + case DType.Extension ext when ext.storageDType() instanceof DType.Primitive p -> p.withNullable(true); + case DType.Utf8 u -> u.withNullable(true); + default -> null; + }; + } + + /// Builds the per-zone min (or max) values array for the resolved min/max `dtype`, decoding each + /// zone's serialised [ScalarValue] stat into the array shape its encoder expects. + private static Object zoneStatValues(DType minMaxDtype, List statBytes) throws IOException { + return switch (minMaxDtype) { + case DType.Primitive p -> statColumn(p.ptype(), statBytes); + case DType.Utf8 ignored -> statStringColumn(statBytes); + default -> throw new IllegalStateException("no zone stat values for " + minMaxDtype); + }; + } + + /// Builds the per-zone string array by decoding each zone's serialised string [ScalarValue] + /// stat. Used for Utf8 columns whose `vortex.varbin` encoder records full string min/max scalars. + private static String[] statStringColumn(List statBytes) throws IOException { + String[] out = new String[statBytes.size()]; + for (int i = 0; i < out.length; i++) { + out[i] = decodeScalar(statBytes.get(i)).string_value(); + } + return out; + } + + /// Builds the per-zone values array in the storage shape the primitive encoder expects, decoding + /// each zone's serialised [ScalarValue] stat. + private static Object statColumn(PType ptype, List statBytes) throws IOException { + int n = statBytes.size(); return switch (ptype) { case I8, U8 -> { byte[] a = new byte[n]; for (int i = 0; i < n; i++) { - a[i] = (byte) scalarLong(chunks.get(i), max); + a[i] = (byte) scalarLong(statBytes.get(i)); } yield a; } case I16, U16 -> { short[] a = new short[n]; for (int i = 0; i < n; i++) { - a[i] = (short) scalarLong(chunks.get(i), max); + a[i] = (short) scalarLong(statBytes.get(i)); } yield a; } case I32, U32 -> { int[] a = new int[n]; for (int i = 0; i < n; i++) { - a[i] = (int) scalarLong(chunks.get(i), max); + a[i] = (int) scalarLong(statBytes.get(i)); } yield a; } case I64, U64 -> { long[] a = new long[n]; for (int i = 0; i < n; i++) { - a[i] = scalarLong(chunks.get(i), max); + a[i] = scalarLong(statBytes.get(i)); } yield a; } case F32 -> { float[] a = new float[n]; for (int i = 0; i < n; i++) { - a[i] = (float) scalarDouble(chunks.get(i), max); + a[i] = (float) scalarDouble(statBytes.get(i)); } yield a; } case F64 -> { double[] a = new double[n]; for (int i = 0; i < n; i++) { - a[i] = scalarDouble(chunks.get(i), max); + a[i] = scalarDouble(statBytes.get(i)); } yield a; } @@ -855,22 +891,22 @@ private static Object statColumn(PType ptype, List chunks, boolean max // F16 min/max are serialised as f32 scalars; re-pack to float16 storage. short[] a = new short[n]; for (int i = 0; i < n; i++) { - a[i] = Float.floatToFloat16((float) scalarDouble(chunks.get(i), max)); + a[i] = Float.floatToFloat16((float) scalarDouble(statBytes.get(i))); } yield a; } }; } - private static long scalarLong(ChunkRef cr, boolean max) throws IOException { + private static long scalarLong(byte[] bytes) throws IOException { // Integer columns serialise min/max as int64 (signed) or uint64 (unsigned). - ScalarValue sv = decodeScalar(max ? cr.statsMax() : cr.statsMin()); + ScalarValue sv = decodeScalar(bytes); return sv.int64_value() != null ? sv.int64_value() : sv.uint64_value(); } - private static double scalarDouble(ChunkRef cr, boolean max) throws IOException { + private static double scalarDouble(byte[] bytes) throws IOException { // Float columns serialise min/max as f64 (F64) or f32 (F32). - ScalarValue sv = decodeScalar(max ? cr.statsMax() : cr.statsMin()); + ScalarValue sv = decodeScalar(bytes); return sv.f64_value() != null ? sv.f64_value() : sv.f32_value(); } @@ -1061,16 +1097,25 @@ private void writeGlobalDictColumn(String colName, DType.Primitive dtype, List codesSegIdxes = new ArrayList<>(); List chunkRowCounts = new ArrayList<>(); List chunkNullCounts = new ArrayList<>(); + List chunkStatsMin = new ArrayList<>(); + List chunkStatsMax = new ArrayList<>(); for (Object chunk : chunks) { int len = primitiveArrayLen(chunk, ptype); Object codesArr = buildCodesArray(chunk, ptype, valueMap, codePType, len); codesSegIdxes.add(writeSegment(codesDtype, codesArr)); chunkRowCounts.add((long) len); chunkNullCounts.add(chunk instanceof NullableData nd ? countNulls(nd.validity()) : 0L); + // Per-zone min/max over the chunk's logical values (matches the flat primitive path: + // computed on nd.values(), placeholders included). Lets the dict zone-map prune like a + // plain primitive column. + Object values = chunk instanceof NullableData nd ? nd.values() : chunk; + byte[][] mm = PrimitiveEncodingEncoder.minMaxStats(ptype, values); + chunkStatsMin.add(mm != null ? mm[0] : null); + chunkStatsMax.add(mm != null ? mm[1] : null); } - dictColRefs.put(colName, - new DictColRef(valuesSegIdx, dictSize, codesSegIdxes, chunkRowCounts, chunkNullCounts)); + dictColRefs.put(colName, new DictColRef(valuesSegIdx, dictSize, codesSegIdxes, + chunkRowCounts, chunkNullCounts, chunkStatsMin, chunkStatsMax)); } private void writeGlobalDictUtf8Column(String colName, DType.Utf8 dtype, List chunks) @@ -1107,15 +1152,22 @@ private void writeGlobalDictUtf8Column(String colName, DType.Utf8 dtype, List codesSegIdxes = new ArrayList<>(); List chunkRowCounts = new ArrayList<>(); List chunkNullCounts = new ArrayList<>(); + List chunkStatsMin = new ArrayList<>(); + List chunkStatsMax = new ArrayList<>(); for (Object chunk : chunks) { String[] strs = (String[]) chunk; Object codesArr = buildUtf8CodesArray(strs, valueMap, codePType); codesSegIdxes.add(writeSegment(codesDtype, codesArr)); chunkRowCounts.add((long) strs.length); chunkNullCounts.add(0L); // global-dict Utf8 columns are dense (non-nullable) + // Per-zone string min/max over the chunk's values (matches the flat varbin path), so the + // dict zone-map prunes like a plain Utf8 column. + byte[][] mm = VarBinEncodingEncoder.minMaxStats(strs); + chunkStatsMin.add(mm != null ? mm[0] : null); + chunkStatsMax.add(mm != null ? mm[1] : null); } - dictColRefs.put(colName, - new DictColRef(valuesSegIdx, dictSize, codesSegIdxes, chunkRowCounts, chunkNullCounts)); + dictColRefs.put(colName, new DictColRef(valuesSegIdx, dictSize, codesSegIdxes, + chunkRowCounts, chunkNullCounts, chunkStatsMin, chunkStatsMax)); } private static Object buildUtf8CodesArray(String[] strs, Map valueMap, PType codePType) { @@ -1320,7 +1372,8 @@ private record ZoneMapRef(int zonesSegIdx, long nZones, long zoneLen, boolean ha } private record DictColRef(int valuesSegIdx, long valuesLen, List codesSegIdxes, - List chunkRowCounts, List chunkNullCounts) { + List chunkRowCounts, List chunkNullCounts, + List chunkStatsMin, List chunkStatsMax) { long totalRows() { return chunkRowCounts.stream().mapToLong(Long::longValue).sum(); } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java index abf2c47c..58b29ea1 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java @@ -32,7 +32,7 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { MemorySegment seg = encodePrimitive(ptype, data, ctx.arena()); byte[] min = null; byte[] max = null; - byte[][] stats = computeStats(ptype, data); + byte[][] stats = minMaxStats(ptype, data); if (stats != null) { min = stats[0]; max = stats[1]; @@ -86,7 +86,15 @@ private static MemorySegment encodePrimitive(PType ptype, Object data, Arena are }; } - private static byte[][] computeStats(PType ptype, Object data) { + /// Computes the serialised min/max [io.github.dfa1.vortex.proto.ScalarValue] pair for a raw + /// primitive array, in the same signed/unsigned/float shape the per-segment stats use. Returns + /// `null` for an empty array. Shared so the dictionary zone-map path computes per-chunk min/max + /// identically to the flat path. + /// + /// @param ptype the primitive type of `data` + /// @param data the raw primitive array (e.g. `long[]`, `int[]`, `String`-free) + /// @return a two-element `{min, max}` array of encoded scalars, or `null` if `data` is empty + public static byte[][] minMaxStats(PType ptype, Object data) { return switch (ptype) { case I8 -> { byte[] arr = (byte[]) data; diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java index b6d7b026..fe1c31ce 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VarBinEncodingEncoder.java @@ -56,6 +56,24 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { byte[] metaBytes = new VarBinMetadata(io.github.dfa1.vortex.proto.PType.fromValue(PType.I64.ordinal())).encode(); + byte[][] stats = minMaxStats(strings); + byte[] statsMin = stats != null ? stats[0] : null; + byte[] statsMax = stats != null ? stats[1] : null; + + EncodeNode offsetsNode = EncodeNode.leaf(EncodingId.VORTEX_PRIMITIVE, 1); + EncodeNode root = new EncodeNode(EncodingId.VORTEX_VARBIN, ByteBuffer.wrap(metaBytes), + new EncodeNode[]{offsetsNode}, new int[]{0}); + return new EncodeResult(root, List.of(bytesBuf, offsetsBuf), statsMin, statsMax); + } + + /// Computes the serialised min/max string [ScalarValue] pair for a string array, skipping + /// `null` entries (lexicographic by [String#compareTo]). Returns `null` when every entry is + /// `null`. Shared so the dictionary zone-map path computes per-chunk string min/max identically + /// to the flat path. + /// + /// @param strings the string values (may contain `null`) + /// @return a two-element `{min, max}` array of encoded scalars, or `null` if all entries are `null` + public static byte[][] minMaxStats(String[] strings) { String minStr = null; String maxStr = null; for (String s : strings) { @@ -69,12 +87,9 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { maxStr = s; } } - byte[] statsMin = minStr != null ? ScalarValue.ofStringValue(minStr).encode() : null; - byte[] statsMax = maxStr != null ? ScalarValue.ofStringValue(maxStr).encode() : null; - - EncodeNode offsetsNode = EncodeNode.leaf(EncodingId.VORTEX_PRIMITIVE, 1); - EncodeNode root = new EncodeNode(EncodingId.VORTEX_VARBIN, ByteBuffer.wrap(metaBytes), - new EncodeNode[]{offsetsNode}, new int[]{0}); - return new EncodeResult(root, List.of(bytesBuf, offsetsBuf), statsMin, statsMax); + if (minStr == null) { + return null; + } + return new byte[][]{ScalarValue.ofStringValue(minStr).encode(), ScalarValue.ofStringValue(maxStr).encode()}; } } diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java index 0d59b55f..01f1f293 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java @@ -8,6 +8,7 @@ import io.github.dfa1.vortex.reader.array.LongArray; import io.github.dfa1.vortex.reader.array.MaskedArray; import io.github.dfa1.vortex.reader.array.StructArray; +import io.github.dfa1.vortex.reader.array.VarBinArray; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; @@ -219,47 +220,85 @@ void chunkWithoutStats_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOExc } @Test - void nonPrimitiveColumn_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOException { - // Given a nullable Utf8 column (no min/max stats yet) across two zones of two rows: - // zone 0 = ["a", null], zone 1 = [null, null] + void utf8Column_emitsStringMinMaxZoneMap(@TempDir Path tmp) throws IOException { + // Given a Utf8 column across two zones of two rows: zone 0 = ["apple", "banana"], + // zone 1 = ["cherry", "date"]. vortex.varbin records full string min/max per chunk, so + // the zone-map carries MAX+MIN+NULL_COUNT (string min/max), not null_count alone. DType.Struct schema = new DType.Struct( - List.of("s"), List.of(new DType.Utf8(true)), false); + List.of("s"), List.of(new DType.Utf8(false)), false); WriteOptions opts = new WriteOptions(2, true, 0.90, 0, false, false); Path file = tmp.resolve("utf8.vtx"); try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); var sut = VortexWriter.create(ch, schema, opts)) { - sut.writeChunk(Map.of("s", new io.github.dfa1.vortex.writer.encode.NullableData( - new String[]{"a", ""}, new boolean[]{true, false}))); - sut.writeChunk(Map.of("s", new io.github.dfa1.vortex.writer.encode.NullableData( - new String[]{"", ""}, new boolean[]{false, false}))); + sut.writeChunk(Map.of("s", new String[]{"apple", "banana"})); + sut.writeChunk(Map.of("s", new String[]{"cherry", "date"})); } - // When the NULL_COUNT-only stats table is decoded + // When / Then the column is zoned with MAX+MIN+NULL_COUNT and string min/max decode per zone try (VortexReader reader = VortexReader.open(file)) { Layout column = reader.layout().children().get(0); assertThat(column.isZoned()).isTrue(); ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); - assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x40); // NULL_COUNT only + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x58); // MAX+MIN+NULL_COUNT Layout zonesFlat = column.children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); - DType.Struct statsDtype = new DType.Struct( - List.of("null_count"), List.of(new DType.Primitive(PType.U64, true)), false); try (Arena arena = Arena.ofConfined()) { - // A single-field stats table decodes to the bare (masked) field, not a StructArray. - MaskedArray stats = (MaskedArray) reader.decodeFlatSegment(spec, statsDtype, 2, arena); - LongArray nullCount = (LongArray) stats.inner(); - assertThat(nullCount.getLong(0)).isEqualTo(1); - assertThat(nullCount.getLong(1)).isEqualTo(2); + StructArray stats = + (StructArray) reader.decodeFlatSegment(spec, utf8StatsTableDtype(), 2, arena); + VarBinArray max = (VarBinArray) ((MaskedArray) stats.field("max")).inner(); + VarBinArray min = (VarBinArray) ((MaskedArray) stats.field("min")).inner(); + assertThat(min.getString(0)).isEqualTo("apple"); + assertThat(max.getString(0)).isEqualTo("banana"); + assertThat(min.getString(1)).isEqualTo("cherry"); + assertThat(max.getString(1)).isEqualTo("date"); + } + } + } + + @Test + void extensionColumn_emitsStorageMinMaxZoneMap(@TempDir Path tmp) throws IOException { + // Given an extension column over I64 storage written as a raw long[] (no spec auto-route), + // two zones of two rows: zone 0 = [10, 11], zone 1 = [20, 21]. ExtEncoding propagates the + // storage primitive's min/max, so the zone-map carries MAX+MIN+NULL_COUNT — same as I64. + DType ext = new DType.Extension( + "test.ext", new DType.Primitive(PType.I64, false), null, false); + DType.Struct schema = new DType.Struct(List.of("t"), List.of(ext), false); + WriteOptions opts = new WriteOptions(2, true, 0.90, 0, false, false); + Path file = tmp.resolve("ext.vtx"); + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, opts)) { + sut.writeChunk(Map.of("t", new long[]{10, 11})); + sut.writeChunk(Map.of("t", new long[]{20, 21})); + } + + // When / Then the column is zoned with MAX+MIN+NULL_COUNT and the stats decode per zone + try (VortexReader reader = VortexReader.open(file)) { + Layout column = reader.layout().children().get(0); + assertThat(column.isZoned()).isTrue(); + ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x58); // MAX+MIN+NULL_COUNT + + Layout zonesFlat = column.children().get(1); + SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); + try (Arena arena = Arena.ofConfined()) { + StructArray stats = (StructArray) reader.decodeFlatSegment(spec, statsTableDtype(), 2, arena); + LongArray max = (LongArray) ((MaskedArray) stats.field("max")).inner(); + LongArray min = (LongArray) ((MaskedArray) stats.field("min")).inner(); + assertThat(min.getLong(0)).isEqualTo(10); + assertThat(max.getLong(0)).isEqualTo(11); + assertThat(min.getLong(1)).isEqualTo(20); + assertThat(max.getLong(1)).isEqualTo(21); } } } @Test - void dictColumn_emitsNullCountZoneMapWrappingDict(@TempDir Path tmp) throws IOException { + void dictColumn_emitsStringMinMaxZoneMapWrappingDict(@TempDir Path tmp) throws IOException { // Given a low-cardinality Utf8 column across 2 chunks of 6 with heavy repeats (so the - // global-dict candidate test, distinct*2 < rows, fires) → vortex.dict; with zone maps the - // column is vortex.stats wrapping the dict, carrying NULL_COUNT only. + // global-dict candidate test, distinct*2 < rows, fires) → vortex.dict. Zone-map stats are + // computed on the logical Utf8 values, independent of the dict encoding: zone 0 = a..b, + // zone 1 = a..c → MAX+MIN+NULL_COUNT, with the column wrapped as vortex.stats over the dict. DType.Struct schema = new DType.Struct( List.of("s"), List.of(new DType.Utf8(false)), false); WriteOptions opts = new WriteOptions(6, true, 0.90, 0, true, false); @@ -270,13 +309,62 @@ void dictColumn_emitsNullCountZoneMapWrappingDict(@TempDir Path tmp) throws IOEx sut.writeChunk(Map.of("s", new String[]{"c", "c", "c", "a", "a", "a"})); } - // When / Then — zoned over a dict data child, NULL_COUNT-only bitset + // When / Then — zoned over a dict data child, MAX+MIN+NULL_COUNT, string min/max per zone try (VortexReader reader = VortexReader.open(file)) { Layout column = reader.layout().children().get(0); assertThat(column.isZoned()).isTrue(); assertThat(column.children().get(0).isDict()).isTrue(); ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); - assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x40); + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x58); // MAX+MIN+NULL_COUNT + + Layout zonesFlat = column.children().get(1); + SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); + try (Arena arena = Arena.ofConfined()) { + StructArray stats = + (StructArray) reader.decodeFlatSegment(spec, utf8StatsTableDtype(), 2, arena); + VarBinArray max = (VarBinArray) ((MaskedArray) stats.field("max")).inner(); + VarBinArray min = (VarBinArray) ((MaskedArray) stats.field("min")).inner(); + assertThat(min.getString(0)).isEqualTo("a"); + assertThat(max.getString(0)).isEqualTo("b"); + assertThat(min.getString(1)).isEqualTo("a"); + assertThat(max.getString(1)).isEqualTo("c"); + } + } + } + + @Test + void primitiveDictColumn_emitsNumericMinMaxZoneMapWrappingDict(@TempDir Path tmp) throws IOException { + // Given a low-cardinality I64 column across 2 chunks of 6 with heavy repeats → vortex.dict. + // Zone-map min/max are computed on the logical I64 values: zone 0 = 1..2, zone 1 = 1..3. + DType.Struct schema = new DType.Struct( + List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false); + WriteOptions opts = new WriteOptions(6, true, 0.90, 0, true, false); + Path file = tmp.resolve("primdict.vtx"); + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, opts)) { + sut.writeChunk(Map.of("v", new long[]{1, 1, 1, 2, 2, 2})); + sut.writeChunk(Map.of("v", new long[]{3, 3, 3, 1, 1, 1})); + } + + // When / Then — zoned over a dict child with MAX+MIN+NULL_COUNT, numeric min/max per zone + try (VortexReader reader = VortexReader.open(file)) { + Layout column = reader.layout().children().get(0); + assertThat(column.isZoned()).isTrue(); + assertThat(column.children().get(0).isDict()).isTrue(); + ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x58); + + Layout zonesFlat = column.children().get(1); + SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); + try (Arena arena = Arena.ofConfined()) { + StructArray stats = (StructArray) reader.decodeFlatSegment(spec, statsTableDtype(), 2, arena); + LongArray max = (LongArray) ((MaskedArray) stats.field("max")).inner(); + LongArray min = (LongArray) ((MaskedArray) stats.field("min")).inner(); + assertThat(min.getLong(0)).isEqualTo(1); + assertThat(max.getLong(0)).isEqualTo(2); + assertThat(min.getLong(1)).isEqualTo(1); + assertThat(max.getLong(1)).isEqualTo(3); + } } } @@ -302,4 +390,14 @@ private static DType.Struct statsTableDtype() { new DType.Primitive(PType.U64, true)), false); } + + /// Same as [#statsTableDtype()] but with Utf8 (string) MAX/MIN columns. + private static DType.Struct utf8StatsTableDtype() { + DType nullableUtf8 = new DType.Utf8(true); + return new DType.Struct( + List.of("max", "max_is_truncated", "min", "min_is_truncated", "null_count"), + List.of(nullableUtf8, new DType.Bool(false), nullableUtf8, new DType.Bool(false), + new DType.Primitive(PType.U64, true)), + false); + } }