From 14c3b973aca4f0495970b1e21f01d8c71700e44b Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sun, 21 Jun 2026 21:09:33 +0200 Subject: [PATCH] feat(writer): emit SUM zone-map stat for numeric primitive columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probed Rust real-file fixtures (tpch_lineitem): numeric primitive and decimal columns carry [MAX, MIN, SUM, NULL_COUNT]; Utf8/extension/date carry [MAX, MIN, NULL_COUNT] (no SUM, even when extension storage is numeric). Closes the SUM half of the Rust-parity stats increment (ADR 0013 §6 aggregate push-down). The writer now computes a per-chunk SUM over each column's logical values (PrimitiveEncodingEncoder.sumStat: signed -> i64, unsigned -> u64, float -> f64; checked i64/u64 overflow -> null zone) and emits it as the SUM(5) field in the zone-map stats table for plain numeric primitives only — flat and dict paths alike, carried on ChunkRef / DictColRef. Validity placeholders are zero (sum-neutral), so nullable columns sum correctly without excluding nulls. Decimal SUM is deferred with decimal min/max. Co-Authored-By: Claude Opus 4.8 --- .../dfa1/vortex/writer/VortexWriter.java | 126 +++++++++++---- .../encode/PrimitiveEncodingEncoder.java | 145 ++++++++++++++++++ .../dfa1/vortex/writer/WriterZoneMapTest.java | 45 ++++-- 3 files changed, 279 insertions(+), 37 deletions(-) diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index 646e6aa9..dd571c24 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -81,9 +81,10 @@ public final class VortexWriter implements Closeable { private static final int LAYOUT_DICT = 3; private static final int LAYOUT_ZONED = 4; - // Stat ordinals in the Rust `Stat` enum (see ZonedStatsSchema). Emitted: MAX, MIN, NULL_COUNT. + // Stat ordinals in the Rust `Stat` enum (see ZonedStatsSchema). Emitted: MAX, MIN, SUM, NULL_COUNT. private static final int STAT_MAX = 3; private static final int STAT_MIN = 4; + private static final int STAT_SUM = 5; private static final int STAT_NULL_COUNT = 6; // Columns with global cardinality below this threshold are dict-encoded across all chunks. @@ -122,6 +123,7 @@ public final class VortexWriter implements Closeable { // Stats (ScalarValue bytes) of the most recently written segment, captured for ChunkRef. private byte[] lastStatsMin; private byte[] lastStatsMax; + private byte[] lastStatsSum; // Null count of the most recently written segment's input data (0 for dense arrays). private long lastNullCount; @@ -469,7 +471,7 @@ public void writeChunk(Map columns) throws IOException { } else { long rowCount = arrayLength(data); int segIdx = writeSegment(colDtype, data); - colChunks.get(colName).add(new ChunkRef(segIdx, rowCount, lastStatsMin, lastStatsMax, lastNullCount)); + colChunks.get(colName).add(new ChunkRef(segIdx, rowCount, lastStatsMin, lastStatsMax, lastStatsSum, lastNullCount)); } } firstChunkSeen = true; @@ -572,6 +574,7 @@ private int writeSegment(DType dtype, Object data, EncodingEncoder encodingOverr segs.add(new SegRef(offset, bytesWritten - offset)); lastStatsMin = result.statsMin(); lastStatsMax = result.statsMax(); + lastStatsSum = columnSum(dtype, data); lastNullCount = segNullCount; return segIdx; } @@ -701,8 +704,10 @@ private void flushZoneMaps() throws IOException { if (chunks.isEmpty()) { continue; } - DType minMaxDtype = zoneMinMaxDtype(columnDtype(colName)); + DType colDtype = columnDtype(colName); + DType minMaxDtype = zoneMinMaxDtype(colDtype); boolean hasMinMax = minMaxDtype != null && chunks.stream().allMatch(ChunkRef::hasStats); + DType sumDtype = zoneSumDtype(colDtype); long[] nullCounts = new long[chunks.size()]; for (int i = 0; i < chunks.size(); i++) { nullCounts[i] = chunks.get(i).nullCount(); @@ -710,20 +715,23 @@ private void flushZoneMaps() throws IOException { emitZoneMap(colName, hasMinMax ? minMaxDtype : null, chunks.stream().map(ChunkRef::statsMin).toList(), chunks.stream().map(ChunkRef::statsMax).toList(), + sumDtype, chunks.stream().map(ChunkRef::statsSum).toList(), nullCounts); } - // Dict-encoded columns (one zone per code chunk). MIN/MAX come from each chunk's logical + // Dict-encoded columns (one zone per code chunk). MIN/MAX/SUM come from each chunk's logical // values (computed at dict-build time); NULL_COUNT always. Matches Rust, whose zone-map // stats are computed on the logical column dtype, independent of the dict encoding. for (Map.Entry e : dictColRefs.entrySet()) { DictColRef ref = e.getValue(); - DType minMaxDtype = zoneMinMaxDtype(columnDtype(e.getKey())); + DType colDtype = columnDtype(e.getKey()); + DType minMaxDtype = zoneMinMaxDtype(colDtype); boolean hasMinMax = minMaxDtype != null && ref.chunkStatsMin().stream().allMatch(java.util.Objects::nonNull) && ref.chunkStatsMax().stream().allMatch(java.util.Objects::nonNull); long[] nullCounts = ref.chunkNullCounts().stream().mapToLong(Long::longValue).toArray(); emitZoneMap(e.getKey(), hasMinMax ? minMaxDtype : null, - ref.chunkStatsMin(), ref.chunkStatsMax(), nullCounts); + ref.chunkStatsMin(), ref.chunkStatsMax(), + zoneSumDtype(colDtype), ref.chunkStatsSum(), nullCounts); } } @@ -731,12 +739,14 @@ private DType columnDtype(String colName) { return schema.fieldTypes().get(schema.fieldNames().indexOf(colName)); } - /// Writes one `vortex.stats` zone-map for `colName`: one zone per chunk, with NULL_COUNT always - /// and MAX/MIN (plus always-false `_is_truncated` flags) when `minMaxDtype` is non-null. - /// `minBytes`/`maxBytes` hold each zone's serialised min/max scalar — read only when - /// `minMaxDtype` is set. Field/bit order follows ZonedStatsSchema: MAX(3), MIN(4), NULL_COUNT(6). - private void emitZoneMap(String colName, DType minMaxDtype, - List minBytes, List maxBytes, long[] nullCounts) throws IOException { + /// Writes one `vortex.stats` zone-map for `colName`: one zone per chunk, with NULL_COUNT always, + /// MAX/MIN (plus always-false `_is_truncated` flags) when `minMaxDtype` is non-null, and SUM when + /// `sumDtype` is non-null. `minBytes`/`maxBytes`/`sumBytes` hold each zone's serialised scalar — + /// read only when the matching dtype is set; a `null` `sumBytes` entry marks an overflowed zone + /// (recorded as a null sum). Field/bit order follows ZonedStatsSchema: MAX(3), MIN(4), SUM(5), + /// NULL_COUNT(6). + private void emitZoneMap(String colName, DType minMaxDtype, List minBytes, List maxBytes, + DType sumDtype, List sumBytes, long[] nullCounts) throws IOException { int nZones = nullCounts.length; boolean[] allValid = new boolean[nZones]; java.util.Arrays.fill(allValid, true); @@ -759,13 +769,21 @@ private void emitZoneMap(String colName, DType minMaxDtype, types.add(new DType.Bool(false)); fields.add(notTruncated.clone()); } + if (sumDtype != null) { + boolean[] sumValid = new boolean[nZones]; + Object sumArr = sumColumn(sumDtype, sumBytes, sumValid); + names.add("sum"); + types.add(sumDtype); + fields.add(new NullableData(sumArr, sumValid)); + } names.add("null_count"); types.add(new DType.Primitive(PType.U64, true)); fields.add(new NullableData(nullCounts, allValid.clone())); DType.Struct statsDtype = new DType.Struct(List.copyOf(names), List.copyOf(types), false); int zonesSegIdx = writeSegment(statsDtype, new StructData(fields), new StructEncodingEncoder()); - zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize(), minMaxDtype != null)); + zoneMaps.put(colName, + new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize(), minMaxDtype != null, sumDtype != null)); } /// Wraps a column's data layout in a `vortex.stats` (zoned) layout when a zone-map was @@ -778,20 +796,23 @@ private int wrapZoneMap(FlatBufferBuilder fbb, String colName, int dataLayout, l int zonesSegV = Layout.createSegmentsVector(fbb, new long[]{zm.zonesSegIdx()}); int zonesFlat = Layout.createLayout(fbb, LAYOUT_FLAT, zm.nZones(), 0, 0, zonesSegV); int childV = Layout.createChildrenVector(fbb, new int[]{dataLayout, zonesFlat}); - int metaV = Layout.createMetadataVector(fbb, zonedMetadataBytes(zm.zoneLen(), zm.hasMinMax())); + int metaV = Layout.createMetadataVector(fbb, zonedMetadataBytes(zm.zoneLen(), zm.hasMinMax(), zm.hasSum())); return Layout.createLayout(fbb, LAYOUT_ZONED, colRows, metaV, childV, 0); } /// `vortex.stats` metadata: `u32` zone length (LE) + a 1-byte stat bitset (LSB-first) with the - /// NULL_COUNT bit always set and the MAX/MIN bits set when present, matching + /// NULL_COUNT bit always set and the MAX/MIN and SUM bits set when present, matching /// [io.github.dfa1.vortex.inspect] `ZonedStatsSchema`. - private static byte[] zonedMetadataBytes(long zoneLen, boolean hasMinMax) { + private static byte[] zonedMetadataBytes(long zoneLen, boolean hasMinMax, boolean hasSum) { byte[] meta = new byte[5]; ByteBuffer.wrap(meta).order(ByteOrder.LITTLE_ENDIAN).putInt((int) zoneLen); int bits = 1 << STAT_NULL_COUNT; if (hasMinMax) { bits |= (1 << STAT_MAX) | (1 << STAT_MIN); } + if (hasSum) { + bits |= (1 << STAT_SUM); + } meta[4] = (byte) bits; return meta; } @@ -820,6 +841,32 @@ private static DType zoneMinMaxDtype(DType dtype) { }; } + /// The (nullable) dtype a zone-map stores SUM in for `dtype`, or `null` when the column has no + /// recordable sum. Only plain numeric primitives are summed — signed → `i64`, unsigned → `u64`, + /// float → `f64` — matching Rust, which emits SUM for primitives and decimals but not for + /// Utf8/extension/date columns even when their storage is numeric. + private static DType zoneSumDtype(DType dtype) { + if (!(dtype instanceof DType.Primitive p)) { + return null; + } + return switch (p.ptype()) { + case U8, U16, U32, U64 -> new DType.Primitive(PType.U64, true); + case I8, I16, I32, I64 -> new DType.Primitive(PType.I64, true); + case F16, F32, F64 -> new DType.Primitive(PType.F64, true); + }; + } + + /// The serialised per-chunk SUM scalar for `data` of logical type `dtype`, or `null` when the + /// column is not summable (non-primitive) or the sum overflowed. Validity placeholders are zero + /// and therefore sum-neutral, so a nullable carrier sums correctly via its values. + private static byte[] columnSum(DType dtype, Object data) { + if (!(dtype instanceof DType.Primitive p)) { + return null; + } + Object values = data instanceof NullableData nd ? nd.values() : data; + return PrimitiveEncodingEncoder.sumStat(p.ptype(), values); + } + /// Builds the per-zone min (or max) values array for the resolved min/max `dtype`, decoding each /// zone's serialised [ScalarValue] stat into the array shape its encoder expects. private static Object zoneStatValues(DType minMaxDtype, List statBytes) throws IOException { @@ -830,6 +877,28 @@ private static Object zoneStatValues(DType minMaxDtype, List statBytes) }; } + /// Builds the per-zone SUM array for `sumDtype` (i64/u64 → `long[]`, f64 → `double[]`), decoding + /// each zone's serialised scalar. Zones whose sum overflowed carry a `null` entry in `sumBytes`; + /// `valid[i]` is set accordingly so the stat field reports them as null. + private static Object sumColumn(DType sumDtype, List sumBytes, boolean[] valid) throws IOException { + PType ptype = ((DType.Primitive) sumDtype).ptype(); + int n = sumBytes.size(); + if (ptype == PType.F64) { + double[] a = new double[n]; + for (int i = 0; i < n; i++) { + valid[i] = sumBytes.get(i) != null; + a[i] = valid[i] ? scalarDouble(sumBytes.get(i)) : 0.0; + } + return a; + } + long[] a = new long[n]; + for (int i = 0; i < n; i++) { + valid[i] = sumBytes.get(i) != null; + a[i] = valid[i] ? scalarLong(sumBytes.get(i)) : 0L; + } + return a; + } + /// Builds the per-zone string array by decoding each zone's serialised string [ScalarValue] /// stat. Used for Utf8 columns whose `vortex.varbin` encoder records full string min/max scalars. private static String[] statStringColumn(List statBytes) throws IOException { @@ -1078,7 +1147,7 @@ private void writeGlobalDictColumn(String colName, DType.Primitive dtype, List chunkNullCounts = new ArrayList<>(); List chunkStatsMin = new ArrayList<>(); List chunkStatsMax = new ArrayList<>(); + List chunkStatsSum = new ArrayList<>(); for (Object chunk : chunks) { int len = primitiveArrayLen(chunk, ptype); Object codesArr = buildCodesArray(chunk, ptype, valueMap, codePType, len); codesSegIdxes.add(writeSegment(codesDtype, codesArr)); chunkRowCounts.add((long) len); chunkNullCounts.add(chunk instanceof NullableData nd ? countNulls(nd.validity()) : 0L); - // Per-zone min/max over the chunk's logical values (matches the flat primitive path: - // computed on nd.values(), placeholders included). Lets the dict zone-map prune like a - // plain primitive column. + // Per-zone min/max + sum over the chunk's logical values (matches the flat primitive + // path: computed on nd.values(), placeholders included). Lets the dict zone-map prune + // and aggregate like a plain primitive column. Object values = chunk instanceof NullableData nd ? nd.values() : chunk; byte[][] mm = PrimitiveEncodingEncoder.minMaxStats(ptype, values); chunkStatsMin.add(mm != null ? mm[0] : null); chunkStatsMax.add(mm != null ? mm[1] : null); + chunkStatsSum.add(PrimitiveEncodingEncoder.sumStat(ptype, values)); } dictColRefs.put(colName, new DictColRef(valuesSegIdx, dictSize, codesSegIdxes, - chunkRowCounts, chunkNullCounts, chunkStatsMin, chunkStatsMax)); + chunkRowCounts, chunkNullCounts, chunkStatsMin, chunkStatsMax, chunkStatsSum)); } private void writeGlobalDictUtf8Column(String colName, DType.Utf8 dtype, List chunks) @@ -1134,7 +1205,7 @@ private void writeGlobalDictUtf8Column(String colName, DType.Utf8 dtype, List noSum = java.util.Collections.nCopies(codesSegIdxes.size(), null); dictColRefs.put(colName, new DictColRef(valuesSegIdx, dictSize, codesSegIdxes, - chunkRowCounts, chunkNullCounts, chunkStatsMin, chunkStatsMax)); + chunkRowCounts, chunkNullCounts, chunkStatsMin, chunkStatsMax, noSum)); } private static Object buildUtf8CodesArray(String[] strs, Map valueMap, PType codePType) { @@ -1359,7 +1432,8 @@ private static Object buildCodesArray(Object data, PType ptype, Map codesSegIdxes, List chunkRowCounts, List chunkNullCounts, - List chunkStatsMin, List chunkStatsMax) { + List chunkStatsMin, List chunkStatsMax, List chunkStatsSum) { long totalRows() { return chunkRowCounts.stream().mapToLong(Long::longValue).sum(); } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java index 58b29ea1..febd87cb 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/PrimitiveEncodingEncoder.java @@ -290,6 +290,151 @@ public static byte[][] minMaxStats(PType ptype, Object data) { }; } + /// Computes the serialised SUM [io.github.dfa1.vortex.proto.ScalarValue] for a raw primitive + /// array, in the widened shape Rust uses for zone-map sums: signed ints → `i64`, unsigned ints + /// → `u64`, floats → `f64`. Returns `null` on integer overflow (Rust drops the zone's sum) and + /// for an empty array. Floats never overflow to `null` (they saturate to infinity). + /// + /// Nulls need not be excluded by the caller: validity placeholders are zero, which is + /// sum-neutral — matching the per-segment min/max convention. + /// + /// @param ptype the primitive type of `data` + /// @param data the raw primitive array + /// @return the encoded sum scalar, or `null` on overflow or empty input + public static byte[] sumStat(PType ptype, Object data) { + return switch (ptype) { + case I8 -> { + byte[] a = (byte[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (byte v : a) { + s += v; + } + yield scalarI64(s); + } + case I16 -> { + short[] a = (short[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (short v : a) { + s += v; + } + yield scalarI64(s); + } + case I32 -> { + int[] a = (int[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (int v : a) { + s += v; + } + yield scalarI64(s); + } + case I64 -> { + long[] a = (long[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (long v : a) { + try { + s = Math.addExact(s, v); + } catch (ArithmeticException overflow) { + yield null; + } + } + yield scalarI64(s); + } + case U8 -> { + byte[] a = (byte[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (byte v : a) { + s += Byte.toUnsignedLong(v); + } + yield scalarU64(s); + } + case U16 -> { + short[] a = (short[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (short v : a) { + s += Short.toUnsignedLong(v); + } + yield scalarU64(s); + } + case U32 -> { + int[] a = (int[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (int v : a) { + s += Integer.toUnsignedLong(v); + } + yield scalarU64(s); + } + case U64 -> { + long[] a = (long[]) data; + if (a.length == 0) { + yield null; + } + long s = 0; + for (long v : a) { + long next = s + v; + if (Long.compareUnsigned(next, s) < 0) { + yield null; + } + s = next; + } + yield scalarU64(s); + } + case F32 -> { + float[] a = (float[]) data; + if (a.length == 0) { + yield null; + } + double s = 0; + for (float v : a) { + s += v; + } + yield scalarF64(s); + } + case F64 -> { + double[] a = (double[]) data; + if (a.length == 0) { + yield null; + } + double s = 0; + for (double v : a) { + s += v; + } + yield scalarF64(s); + } + case F16 -> { + short[] a = (short[]) data; + if (a.length == 0) { + yield null; + } + double s = 0; + for (short v : a) { + s += Float.float16ToFloat(v); + } + yield scalarF64(s); + } + }; + } + private static byte[] scalarI64(long v) { return ScalarValue.ofInt64Value(v).encode(); } diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java index 01f1f293..fd9a41a6 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java @@ -63,7 +63,7 @@ void enableZoneMaps_wrapsColumnInZonedLayoutWithMetadata(@TempDir Path tmp) thro assertThat(column.children()).hasSize(2); ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); assertThat(meta.getInt(meta.position())).isEqualTo(4); // zone_len - assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x58); // bits 3(MAX)+4(MIN)+6(NULL_COUNT) + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x78); // bits 3(MAX)+4(MIN)+5(SUM)+6(NULL_COUNT) } } @@ -103,12 +103,13 @@ void zoneMaps_statsPayloadDecodesPerZoneMinMax(@TempDir Path tmp) throws IOExcep Layout zonesFlat = reader.layout().children().get(0).children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, statsTableDtype(), 3, arena); + StructArray stats = (StructArray) reader.decodeFlatSegment(spec, numericStatsTableDtype(), 3, arena); LongArray max = (LongArray) ((MaskedArray) stats.field("max")).inner(); LongArray min = (LongArray) ((MaskedArray) stats.field("min")).inner(); + LongArray sum = (LongArray) ((MaskedArray) stats.field("sum")).inner(); LongArray nullCount = (LongArray) ((MaskedArray) stats.field("null_count")).inner(); - // Then min/max per zone match the source data; the column is non-nullable so + // Then min/max/sum per zone match the source data; the column is non-nullable so // every zone reports zero nulls assertThat(min.getLong(0)).isZero(); assertThat(max.getLong(0)).isEqualTo(3); @@ -116,6 +117,9 @@ void zoneMaps_statsPayloadDecodesPerZoneMinMax(@TempDir Path tmp) throws IOExcep assertThat(max.getLong(1)).isEqualTo(7); assertThat(min.getLong(2)).isEqualTo(8); assertThat(max.getLong(2)).isEqualTo(11); + assertThat(sum.getLong(0)).isEqualTo(0 + 1 + 2 + 3); + assertThat(sum.getLong(1)).isEqualTo(4 + 5 + 6 + 7); + assertThat(sum.getLong(2)).isEqualTo(8 + 9 + 10 + 11); assertThat(nullCount.getLong(0)).isZero(); assertThat(nullCount.getLong(1)).isZero(); assertThat(nullCount.getLong(2)).isZero(); @@ -144,12 +148,16 @@ void zoneMaps_nullableColumn_recordsPerZoneNullCount(@TempDir Path tmp) throws I Layout zonesFlat = reader.layout().children().get(0).children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, statsTableDtype(), 2, arena); + StructArray stats = (StructArray) reader.decodeFlatSegment(spec, numericStatsTableDtype(), 2, arena); LongArray nullCount = (LongArray) ((MaskedArray) stats.field("null_count")).inner(); + LongArray sum = (LongArray) ((MaskedArray) stats.field("sum")).inner(); - // Then each zone's null count is recorded (1 and 2) + // Then each zone's null count is recorded (1 and 2). Sum skips nulls (zero + // placeholders are sum-neutral): zone 0 = 10, zone 1 = 0 assertThat(nullCount.getLong(0)).isEqualTo(1); assertThat(nullCount.getLong(1)).isEqualTo(2); + assertThat(sum.getLong(0)).isEqualTo(10); + assertThat(sum.getLong(1)).isZero(); } } } @@ -199,7 +207,8 @@ void noChunks_emitsNoZoneMap(@TempDir Path tmp) throws IOException { @Test void chunkWithoutStats_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOException { // Given a column with one normal chunk and one empty chunk (no min/max stats): MIN/MAX is - // dropped, but NULL_COUNT is still emitted — the zone-map carries the NULL_COUNT bit only. + // dropped (it requires every chunk to carry stats), but NULL_COUNT and SUM are still emitted + // — SUM is independent (the empty zone's sum is simply null). DType.Struct schema = new DType.Struct( List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false); WriteOptions opts = new WriteOptions(2, true, 0.90, 0, false, false); @@ -210,12 +219,12 @@ void chunkWithoutStats_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOExc sut.writeChunk(Map.of("v", new long[]{})); } - // When / Then — zoned with the NULL_COUNT-only bitset (bit 6 = 0x40) + // When / Then — zoned with the SUM+NULL_COUNT bitset (bits 5+6 = 0x60), no MIN/MAX try (VortexReader reader = VortexReader.open(file)) { Layout column = reader.layout().children().get(0); assertThat(column.isZoned()).isTrue(); ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); - assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x40); + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x60); } } @@ -352,18 +361,21 @@ void primitiveDictColumn_emitsNumericMinMaxZoneMapWrappingDict(@TempDir Path tmp assertThat(column.isZoned()).isTrue(); assertThat(column.children().get(0).isDict()).isTrue(); ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); - assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x58); + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x78); Layout zonesFlat = column.children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, statsTableDtype(), 2, arena); + StructArray stats = (StructArray) reader.decodeFlatSegment(spec, numericStatsTableDtype(), 2, arena); LongArray max = (LongArray) ((MaskedArray) stats.field("max")).inner(); LongArray min = (LongArray) ((MaskedArray) stats.field("min")).inner(); + LongArray sum = (LongArray) ((MaskedArray) stats.field("sum")).inner(); assertThat(min.getLong(0)).isEqualTo(1); assertThat(max.getLong(0)).isEqualTo(2); assertThat(min.getLong(1)).isEqualTo(1); assertThat(max.getLong(1)).isEqualTo(3); + assertThat(sum.getLong(0)).isEqualTo(1 + 1 + 1 + 2 + 2 + 2); + assertThat(sum.getLong(1)).isEqualTo(3 + 3 + 3 + 1 + 1 + 1); } } } @@ -381,7 +393,8 @@ private static Object sample(PType ptype, int base) { }; } - /// Reconstructs the stats-table dtype the writer emits: MAX, MIN, NULL_COUNT for an I64 column. + /// Stats-table dtype for a column with MAX, MIN, NULL_COUNT but no SUM (e.g. an extension + /// column over I64 storage — Rust does not sum date/extension columns). private static DType.Struct statsTableDtype() { DType nullableI64 = new DType.Primitive(PType.I64, true); return new DType.Struct( @@ -391,6 +404,16 @@ private static DType.Struct statsTableDtype() { false); } + /// Stats-table dtype for a numeric I64 column: MAX, MIN, SUM (nullable I64), NULL_COUNT. + private static DType.Struct numericStatsTableDtype() { + DType nullableI64 = new DType.Primitive(PType.I64, true); + return new DType.Struct( + List.of("max", "max_is_truncated", "min", "min_is_truncated", "sum", "null_count"), + List.of(nullableI64, new DType.Bool(false), nullableI64, new DType.Bool(false), + nullableI64, new DType.Primitive(PType.U64, true)), + false); + } + /// Same as [#statsTableDtype()] but with Utf8 (string) MAX/MIN columns. private static DType.Struct utf8StatsTableDtype() { DType nullableUtf8 = new DType.Utf8(true);