From d927a94d73d07af3e6a0eeb802c812a37189f099 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sun, 21 Jun 2026 15:52:06 +0200 Subject: [PATCH] feat(writer): add NULL_COUNT to vortex.stats zone-maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends zone-map emission with the per-zone null count (Stat ordinal 6, nullable U64), so the bitset is now MAX|MIN|NULL_COUNT (0x58) — matching what Rust fixtures carry ([MIN,MAX,NULL_COUNT]). Per-chunk null counts are captured at write time from NullableData validity (0 for dense arrays). Round-trip test covers nullable (per-zone counts 1,2) and non-nullable (all zero) columns; Rust JNI interop reads the null_count-bearing files. Co-Authored-By: Claude Opus 4.8 --- .../dfa1/vortex/writer/VortexWriter.java | 43 +++++++++---- .../dfa1/vortex/writer/WriterZoneMapTest.java | 60 +++++++++++++++---- 2 files changed, 82 insertions(+), 21 deletions(-) diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index ce72f45c..97751477 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -81,9 +81,10 @@ public final class VortexWriter implements Closeable { private static final int LAYOUT_DICT = 3; private static final int LAYOUT_ZONED = 4; - // Stat ordinals in the Rust `Stat` enum (see ZonedStatsSchema). v1 emits MAX + MIN only. + // Stat ordinals in the Rust `Stat` enum (see ZonedStatsSchema). Emitted: MAX, MIN, NULL_COUNT. private static final int STAT_MAX = 3; private static final int STAT_MIN = 4; + private static final int STAT_NULL_COUNT = 6; // Columns with global cardinality below this threshold are dict-encoded across all chunks. // Kept low: global dict hurts high-cardinality F64 columns (ALP codes beat U16 dict codes). @@ -121,6 +122,8 @@ public final class VortexWriter implements Closeable { // Stats (ScalarValue bytes) of the most recently written segment, captured for ChunkRef. private byte[] lastStatsMin; private byte[] lastStatsMax; + // Null count of the most recently written segment's input data (0 for dense arrays). + private long lastNullCount; private VortexWriter( WritableByteChannel channel, DType.Struct schema, WriteOptions options, List encodings @@ -466,7 +469,7 @@ public void writeChunk(Map columns) throws IOException { } else { long rowCount = arrayLength(data); int segIdx = writeSegment(colDtype, data); - colChunks.get(colName).add(new ChunkRef(segIdx, rowCount, lastStatsMin, lastStatsMax)); + colChunks.get(colName).add(new ChunkRef(segIdx, rowCount, lastStatsMin, lastStatsMax, lastNullCount)); } } firstChunkSeen = true; @@ -568,6 +571,7 @@ private int writeSegment(DType dtype, Object data, EncodingEncoder encodingOverr segs.add(new SegRef(offset, bytesWritten - offset)); lastStatsMin = result.statsMin(); lastStatsMax = result.statsMax(); + lastNullCount = data instanceof NullableData nd ? countNulls(nd.validity()) : 0L; return segIdx; } } @@ -699,17 +703,24 @@ private void flushZoneMaps() throws IOException { java.util.Arrays.fill(allValid, true); boolean[] notTruncated = new boolean[nZones]; DType nullablePrim = new DType.Primitive(prim.ptype(), true); - // Field order mirrors ZonedStatsSchema.statsTableDtype for present stats MAX(3), MIN(4): - // [max, max_is_truncated, min, min_is_truncated]. + long[] nullCounts = new long[nZones]; + for (int i = 0; i < nZones; i++) { + nullCounts[i] = chunks.get(i).nullCount(); + } + // Field order mirrors ZonedStatsSchema.statsTableDtype for present stats MAX(3), MIN(4), + // NULL_COUNT(6): [max, max_is_truncated, min, min_is_truncated, null_count]. Every stat + // field is nullable in the reconstructed dtype, so null_count is a nullable U64. DType.Struct statsDtype = new DType.Struct( - List.of("max", "max_is_truncated", "min", "min_is_truncated"), - List.of(nullablePrim, new DType.Bool(false), nullablePrim, new DType.Bool(false)), + List.of("max", "max_is_truncated", "min", "min_is_truncated", "null_count"), + List.of(nullablePrim, new DType.Bool(false), nullablePrim, new DType.Bool(false), + new DType.Primitive(PType.U64, true)), false); StructData sd = new StructData(List.of( new NullableData(statColumn(prim.ptype(), chunks, true), allValid), notTruncated, new NullableData(statColumn(prim.ptype(), chunks, false), allValid.clone()), - notTruncated.clone())); + notTruncated.clone(), + new NullableData(nullCounts, allValid.clone()))); int zonesSegIdx = writeSegment(statsDtype, sd, new StructEncodingEncoder()); zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize())); } @@ -734,10 +745,20 @@ private int wrapZoneMap(FlatBufferBuilder fbb, String colName, int dataLayout, l private static byte[] zonedMetadataBytes(long zoneLen) { byte[] meta = new byte[5]; ByteBuffer.wrap(meta).order(ByteOrder.LITTLE_ENDIAN).putInt((int) zoneLen); - meta[4] = (byte) ((1 << STAT_MAX) | (1 << STAT_MIN)); + meta[4] = (byte) ((1 << STAT_MAX) | (1 << STAT_MIN) | (1 << STAT_NULL_COUNT)); return meta; } + private static long countNulls(boolean[] validity) { + long nulls = 0; + for (boolean valid : validity) { + if (!valid) { + nulls++; + } + } + return nulls; + } + private static boolean isZoneMappable(PType ptype) { return switch (ptype) { case I8, I16, I32, I64, U8, U16, U32, U64, F32, F64 -> true; @@ -989,7 +1010,7 @@ private void writeGlobalDictColumn(String colName, DType.Primitive dtype, List