# Patch summary (reviewer commentary; this text sits ahead of the first `diff --git` header and is not part of any hunk).
#
# Zone maps: emit NULL_COUNT for every column, MIN/MAX only when available.
#
# * VortexWriter.flushZoneMaps no longer skips a column that is non-primitive or that has a chunk without stats.
#   It always appends a nullable U64 `null_count` field, and puts `max`, `max_is_truncated`, `min`,
#   `min_is_truncated` in front of it only when the column dtype is a DType.Primitive and every chunk
#   reports hasStats().
# * ZoneMapRef gains a `hasMinMax` component. zonedMetadataBytes(zoneLen, hasMinMax) always sets the
#   NULL_COUNT bit and sets the MAX/MIN bits only when hasMinMax is true.
# * InspectorRender.formatStatsArray: when the array (after unwrapping a MaskedArray) is not a StructArray
#   and the stats dtype has exactly one field, it returns one `name=value` row per element; for any other
#   field count it still throws IllegalStateException("stats array is not a struct: ...").
# * InspectorRenderTest: nonStructStatsArrayThrows becomes multiFieldNonStructStatsArrayThrows (two-field
#   min/max dtype); new singleFieldStatsArrayRendersBareField expects "null_count=0", "null_count=2".
# * WriterZoneMapTest: chunkWithoutStats_skipsZoneMap becomes chunkWithoutStats_emitsNullCountOnlyZoneMap and
#   asserts the column is zoned with metadata byte 4 equal to 0x40; new
#   nonPrimitiveColumn_emitsNullCountOnlyZoneMap checks null counts 1 and 2 for a nullable Utf8 column.
#
# NOTE(review): this copy looks flattened - hunk lines run together and some declarations read as raw types
#   (`List rows`, `List names`), so generic type arguments may have been lost. Regenerate the patch from git
#   before applying - TODO confirm.
# NOTE(review): `(int) arr.length()` and `(int) zoneLen` narrow without a range check - presumably zone
#   counts and zone lengths fit in an int; confirm against callers.
diff --git a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/InspectorRender.java b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/InspectorRender.java index 23835f4d..efe66ff6 100644 --- a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/InspectorRender.java +++ b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/InspectorRender.java @@ -73,6 +73,18 @@ static String formatValue(Array array, int i, DType declared) { static List formatStatsArray(Array arr, DType.Struct statsDtype) { Array unwrapped = arr instanceof MaskedArray m ? m.inner() : arr; if (!(unwrapped instanceof StructArray sa)) { + // A single-field stats table (e.g. NULL_COUNT only) is decoded to the bare field, not + // a StructArray (the struct decoder collapses one-field structs). Render that one stat. + if (statsDtype.fieldTypes().size() == 1) { + String name = statsDtype.fieldNames().getFirst(); + DType fdtype = statsDtype.fieldTypes().getFirst(); + int rowCount = (int) arr.length(); + List rows = new ArrayList<>(rowCount); + for (int row = 0; row < rowCount; row++) { + rows.add(name + "=" + formatStatsCell(arr, row, fdtype)); + } + return rows; + } throw new IllegalStateException( "stats array is not a struct: " + arr.getClass().getSimpleName()); } diff --git a/cli/src/test/java/io/github/dfa1/vortex/cli/tui/InspectorRenderTest.java b/cli/src/test/java/io/github/dfa1/vortex/cli/tui/InspectorRenderTest.java index 69b2374e..1a1067a4 100644 --- a/cli/src/test/java/io/github/dfa1/vortex/cli/tui/InspectorRenderTest.java +++ b/cli/src/test/java/io/github/dfa1/vortex/cli/tui/InspectorRenderTest.java @@ -184,10 +184,10 @@ void maskedInvalidStatsCellRendersNull() { } @Test - void nonStructStatsArrayThrows() { + void multiFieldNonStructStatsArrayThrows() { try (Arena arena = Arena.ofConfined()) { - // Given — stats payload that is not a struct - DType.Struct statsDtype = new DType.Struct(List.of("min"), List.of(I64), false); + // Given — a multi-field stats schema but a non-struct payload + DType.Struct 
statsDtype = new DType.Struct(List.of("min", "max"), List.of(I64, I64), false); Array notAStruct = ArrayFixtures.longs(arena, 1L); // When / Then @@ -196,6 +196,23 @@ void nonStructStatsArrayThrows() { .hasMessageContaining("not a struct"); } } + + @Test + void singleFieldStatsArrayRendersBareField() { + try (Arena arena = Arena.ofConfined()) { + // Given — a single-field (NULL_COUNT-only) stats table decodes to the bare field, + // not a StructArray; the renderer must still render it + DType.Struct statsDtype = new DType.Struct( + List.of("null_count"), List.of(new DType.Primitive(PType.U64, true)), false); + Array oneStat = ArrayFixtures.longs(arena, 0L, 2L); + + // When + List rows = InspectorRender.formatStatsArray(oneStat, statsDtype); + + // Then one row per zone, each showing the stat + assertThat(rows).containsExactly("null_count=0", "null_count=2"); + } + } } @Nested diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index 031a4e5b..c9d8a92e 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -691,38 +691,48 @@ private void flushZoneMaps() throws IOException { if (chunks.isEmpty()) { continue; } - DType colDtype = schema.fieldTypes().get(schema.fieldNames().indexOf(colName)); - if (!(colDtype instanceof DType.Primitive prim)) { - continue; - } - if (!chunks.stream().allMatch(ChunkRef::hasStats)) { - continue; - } int nZones = chunks.size(); boolean[] allValid = new boolean[nZones]; java.util.Arrays.fill(allValid, true); - boolean[] notTruncated = new boolean[nZones]; - DType nullablePrim = new DType.Primitive(prim.ptype(), true); + + // NULL_COUNT is computable for every column type; MIN/MAX only for fixed-width + // primitives whose chunks all carry stats. 
Field/bit order follows + // ZonedStatsSchema: MAX(3), MIN(4), NULL_COUNT(6); each stat field is nullable. + DType colDtype = schema.fieldTypes().get(schema.fieldNames().indexOf(colName)); + boolean hasMinMax = colDtype instanceof DType.Primitive + && chunks.stream().allMatch(ChunkRef::hasStats); + + List names = new java.util.ArrayList<>(); + List types = new java.util.ArrayList<>(); + List fields = new java.util.ArrayList<>(); + if (hasMinMax) { + PType ptype = ((DType.Primitive) colDtype).ptype(); + DType nullablePrim = new DType.Primitive(ptype, true); + boolean[] notTruncated = new boolean[nZones]; + names.add("max"); + types.add(nullablePrim); + fields.add(new NullableData(statColumn(ptype, chunks, true), allValid.clone())); + names.add("max_is_truncated"); + types.add(new DType.Bool(false)); + fields.add(notTruncated); + names.add("min"); + types.add(nullablePrim); + fields.add(new NullableData(statColumn(ptype, chunks, false), allValid.clone())); + names.add("min_is_truncated"); + types.add(new DType.Bool(false)); + fields.add(notTruncated.clone()); + } long[] nullCounts = new long[nZones]; for (int i = 0; i < nZones; i++) { nullCounts[i] = chunks.get(i).nullCount(); } - // Field order mirrors ZonedStatsSchema.statsTableDtype for present stats MAX(3), MIN(4), - // NULL_COUNT(6): [max, max_is_truncated, min, min_is_truncated, null_count]. Every stat - // field is nullable in the reconstructed dtype, so null_count is a nullable U64. 
- DType.Struct statsDtype = new DType.Struct( - List.of("max", "max_is_truncated", "min", "min_is_truncated", "null_count"), - List.of(nullablePrim, new DType.Bool(false), nullablePrim, new DType.Bool(false), - new DType.Primitive(PType.U64, true)), - false); - StructData sd = new StructData(List.of( - new NullableData(statColumn(prim.ptype(), chunks, true), allValid), - notTruncated, - new NullableData(statColumn(prim.ptype(), chunks, false), allValid.clone()), - notTruncated.clone(), - new NullableData(nullCounts, allValid.clone()))); - int zonesSegIdx = writeSegment(statsDtype, sd, new StructEncodingEncoder()); - zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize())); + names.add("null_count"); + types.add(new DType.Primitive(PType.U64, true)); + fields.add(new NullableData(nullCounts, allValid.clone())); + + DType.Struct statsDtype = new DType.Struct(List.copyOf(names), List.copyOf(types), false); + int zonesSegIdx = writeSegment(statsDtype, new StructData(fields), new StructEncodingEncoder()); + zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize(), hasMinMax)); } } @@ -736,16 +746,21 @@ private int wrapZoneMap(FlatBufferBuilder fbb, String colName, int dataLayout, l int zonesSegV = Layout.createSegmentsVector(fbb, new long[]{zm.zonesSegIdx()}); int zonesFlat = Layout.createLayout(fbb, LAYOUT_FLAT, zm.nZones(), 0, 0, zonesSegV); int childV = Layout.createChildrenVector(fbb, new int[]{dataLayout, zonesFlat}); - int metaV = Layout.createMetadataVector(fbb, zonedMetadataBytes(zm.zoneLen())); + int metaV = Layout.createMetadataVector(fbb, zonedMetadataBytes(zm.zoneLen(), zm.hasMinMax())); return Layout.createLayout(fbb, LAYOUT_ZONED, colRows, metaV, childV, 0); } - /// `vortex.stats` metadata: `u32` zone length (LE) + a 1-byte stat bitset with the MAX and - /// MIN bits set (LSB-first), matching [io.github.dfa1.vortex.inspect] `ZonedStatsSchema`. 
- private static byte[] zonedMetadataBytes(long zoneLen) { + /// `vortex.stats` metadata: `u32` zone length (LE) + a 1-byte stat bitset (LSB-first) with the + /// NULL_COUNT bit always set and the MAX/MIN bits set when present, matching + /// [io.github.dfa1.vortex.inspect] `ZonedStatsSchema`. + private static byte[] zonedMetadataBytes(long zoneLen, boolean hasMinMax) { byte[] meta = new byte[5]; ByteBuffer.wrap(meta).order(ByteOrder.LITTLE_ENDIAN).putInt((int) zoneLen); - meta[4] = (byte) ((1 << STAT_MAX) | (1 << STAT_MIN) | (1 << STAT_NULL_COUNT)); + int bits = 1 << STAT_NULL_COUNT; + if (hasMinMax) { + bits |= (1 << STAT_MAX) | (1 << STAT_MIN); + } + meta[4] = (byte) bits; return meta; } @@ -1262,8 +1277,9 @@ boolean hasStats() { } /// Per-column zone-map: the flat segment holding the per-zone stats table, the zone - /// count (one zone per chunk), and the logical rows per zone. - private record ZoneMapRef(int zonesSegIdx, long nZones, long zoneLen) { + /// count (one zone per chunk), the logical rows per zone, and whether the table carries + /// MIN/MAX (else NULL_COUNT only). + private record ZoneMapRef(int zonesSegIdx, long nZones, long zoneLen, boolean hasMinMax) { } private record DictColRef(int valuesSegIdx, long valuesLen, List codesSegIdxes, diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java index d0a09d88..01a52533 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java @@ -196,9 +196,9 @@ void noChunks_emitsNoZoneMap(@TempDir Path tmp) throws IOException { } @Test - void chunkWithoutStats_skipsZoneMap(@TempDir Path tmp) throws IOException { - // Given a column with one normal chunk and one empty chunk (no min/max stats): not every - // chunk carries stats, so flushZoneMaps skips the column (the all-stats guard). 
+ void chunkWithoutStats_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOException { + // Given a column with one normal chunk and one empty chunk (no min/max stats): MIN/MAX is + // dropped, but NULL_COUNT is still emitted — the zone-map carries the NULL_COUNT bit only. DType.Struct schema = new DType.Struct( List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false); WriteOptions opts = new WriteOptions(2, true, 0.90, 0, false, false); @@ -209,9 +209,49 @@ void chunkWithoutStats_skipsZoneMap(@TempDir Path tmp) throws IOException { sut.writeChunk(Map.of("v", new long[]{})); } - // When / Then + // When / Then — zoned with the NULL_COUNT-only bitset (bit 6 = 0x40) try (VortexReader reader = VortexReader.open(file)) { - assertThat(reader.layout().children().get(0).isZoned()).isFalse(); + Layout column = reader.layout().children().get(0); + assertThat(column.isZoned()).isTrue(); + ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x40); + } + } + + @Test + void nonPrimitiveColumn_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOException { + // Given a nullable Utf8 column (no min/max stats yet) across two zones of two rows: + // zone 0 = ["a", null], zone 1 = [null, null] + DType.Struct schema = new DType.Struct( + List.of("s"), List.of(new DType.Utf8(true)), false); + WriteOptions opts = new WriteOptions(2, true, 0.90, 0, false, false); + Path file = tmp.resolve("utf8.vtx"); + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, opts)) { + sut.writeChunk(Map.of("s", new io.github.dfa1.vortex.writer.encode.NullableData( + new String[]{"a", ""}, new boolean[]{true, false}))); + sut.writeChunk(Map.of("s", new io.github.dfa1.vortex.writer.encode.NullableData( + new String[]{"", ""}, new boolean[]{false, false}))); + } + + // When the NULL_COUNT-only stats table is decoded + try 
(VortexReader reader = VortexReader.open(file)) { + Layout column = reader.layout().children().get(0); + assertThat(column.isZoned()).isTrue(); + ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN); + assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x40); // NULL_COUNT only + + Layout zonesFlat = column.children().get(1); + SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); + DType.Struct statsDtype = new DType.Struct( + List.of("null_count"), List.of(new DType.Primitive(PType.U64, true)), false); + try (Arena arena = Arena.ofConfined()) { + // A single-field stats table decodes to the bare (masked) field, not a StructArray. + MaskedArray stats = (MaskedArray) reader.decodeFlatSegment(spec, statsDtype, 2, arena); + LongArray nullCount = (LongArray) stats.inner(); + assertThat(nullCount.getLong(0)).isEqualTo(1); + assertThat(nullCount.getLong(1)).isEqualTo(2); + } } }