Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,18 @@ static String formatValue(Array array, int i, DType declared) {
static List<String> formatStatsArray(Array arr, DType.Struct statsDtype) {
Array unwrapped = arr instanceof MaskedArray m ? m.inner() : arr;
if (!(unwrapped instanceof StructArray sa)) {
// A single-field stats table (e.g. NULL_COUNT only) is decoded to the bare field, not
// a StructArray (the struct decoder collapses one-field structs). Render that one stat.
if (statsDtype.fieldTypes().size() == 1) {
String name = statsDtype.fieldNames().getFirst();
DType fdtype = statsDtype.fieldTypes().getFirst();
int rowCount = (int) arr.length();
List<String> rows = new ArrayList<>(rowCount);
for (int row = 0; row < rowCount; row++) {
rows.add(name + "=" + formatStatsCell(arr, row, fdtype));
}
return rows;
}
throw new IllegalStateException(
"stats array is not a struct: " + arr.getClass().getSimpleName());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,10 @@ void maskedInvalidStatsCellRendersNull() {
}

@Test
void nonStructStatsArrayThrows() {
void multiFieldNonStructStatsArrayThrows() {
try (Arena arena = Arena.ofConfined()) {
// Given — stats payload that is not a struct
DType.Struct statsDtype = new DType.Struct(List.of("min"), List.of(I64), false);
// Given — a multi-field stats schema but a non-struct payload
DType.Struct statsDtype = new DType.Struct(List.of("min", "max"), List.of(I64, I64), false);
Array notAStruct = ArrayFixtures.longs(arena, 1L);

// When / Then
Expand All @@ -196,6 +196,23 @@ void nonStructStatsArrayThrows() {
.hasMessageContaining("not a struct");
}
}

@Test
void singleFieldStatsArrayRendersBareField() {
    // Given — a stats schema with exactly one field (NULL_COUNT only). Such a table is
    // handed to the renderer as the bare field array rather than wrapped in a StructArray.
    DType nullableU64 = new DType.Primitive(PType.U64, true);
    DType.Struct nullCountOnly =
            new DType.Struct(List.of("null_count"), List.of(nullableU64), false);

    try (Arena scope = Arena.ofConfined()) {
        // Two zones, carrying null counts 0 and 2 respectively.
        Array bareField = ArrayFixtures.longs(scope, 0L, 2L);

        // When
        List<String> rendered = InspectorRender.formatStatsArray(bareField, nullCountOnly);

        // Then — one "name=value" row per zone, in zone order
        assertThat(rendered).containsExactly("null_count=0", "null_count=2");
    }
}
}

@Nested
Expand Down
80 changes: 48 additions & 32 deletions writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -691,38 +691,48 @@ private void flushZoneMaps() throws IOException {
if (chunks.isEmpty()) {
continue;
}
DType colDtype = schema.fieldTypes().get(schema.fieldNames().indexOf(colName));
if (!(colDtype instanceof DType.Primitive prim)) {
continue;
}
if (!chunks.stream().allMatch(ChunkRef::hasStats)) {
continue;
}
int nZones = chunks.size();
boolean[] allValid = new boolean[nZones];
java.util.Arrays.fill(allValid, true);
boolean[] notTruncated = new boolean[nZones];
DType nullablePrim = new DType.Primitive(prim.ptype(), true);

// NULL_COUNT is computable for every column type; MIN/MAX only for fixed-width
// primitives whose chunks all carry stats. Field/bit order follows
// ZonedStatsSchema: MAX(3), MIN(4), NULL_COUNT(6); each stat field is nullable.
DType colDtype = schema.fieldTypes().get(schema.fieldNames().indexOf(colName));
boolean hasMinMax = colDtype instanceof DType.Primitive
&& chunks.stream().allMatch(ChunkRef::hasStats);

List<String> names = new java.util.ArrayList<>();
List<DType> types = new java.util.ArrayList<>();
List<Object> fields = new java.util.ArrayList<>();
if (hasMinMax) {
PType ptype = ((DType.Primitive) colDtype).ptype();
DType nullablePrim = new DType.Primitive(ptype, true);
boolean[] notTruncated = new boolean[nZones];
names.add("max");
types.add(nullablePrim);
fields.add(new NullableData(statColumn(ptype, chunks, true), allValid.clone()));
names.add("max_is_truncated");
types.add(new DType.Bool(false));
fields.add(notTruncated);
names.add("min");
types.add(nullablePrim);
fields.add(new NullableData(statColumn(ptype, chunks, false), allValid.clone()));
names.add("min_is_truncated");
types.add(new DType.Bool(false));
fields.add(notTruncated.clone());
}
long[] nullCounts = new long[nZones];
for (int i = 0; i < nZones; i++) {
nullCounts[i] = chunks.get(i).nullCount();
}
// Field order mirrors ZonedStatsSchema.statsTableDtype for present stats MAX(3), MIN(4),
// NULL_COUNT(6): [max, max_is_truncated, min, min_is_truncated, null_count]. Every stat
// field is nullable in the reconstructed dtype, so null_count is a nullable U64.
DType.Struct statsDtype = new DType.Struct(
List.of("max", "max_is_truncated", "min", "min_is_truncated", "null_count"),
List.of(nullablePrim, new DType.Bool(false), nullablePrim, new DType.Bool(false),
new DType.Primitive(PType.U64, true)),
false);
StructData sd = new StructData(List.of(
new NullableData(statColumn(prim.ptype(), chunks, true), allValid),
notTruncated,
new NullableData(statColumn(prim.ptype(), chunks, false), allValid.clone()),
notTruncated.clone(),
new NullableData(nullCounts, allValid.clone())));
int zonesSegIdx = writeSegment(statsDtype, sd, new StructEncodingEncoder());
zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize()));
names.add("null_count");
types.add(new DType.Primitive(PType.U64, true));
fields.add(new NullableData(nullCounts, allValid.clone()));

DType.Struct statsDtype = new DType.Struct(List.copyOf(names), List.copyOf(types), false);
int zonesSegIdx = writeSegment(statsDtype, new StructData(fields), new StructEncodingEncoder());
zoneMaps.put(colName, new ZoneMapRef(zonesSegIdx, nZones, options.chunkSize(), hasMinMax));
}
}

Expand All @@ -736,16 +746,21 @@ private int wrapZoneMap(FlatBufferBuilder fbb, String colName, int dataLayout, l
int zonesSegV = Layout.createSegmentsVector(fbb, new long[]{zm.zonesSegIdx()});
int zonesFlat = Layout.createLayout(fbb, LAYOUT_FLAT, zm.nZones(), 0, 0, zonesSegV);
int childV = Layout.createChildrenVector(fbb, new int[]{dataLayout, zonesFlat});
int metaV = Layout.createMetadataVector(fbb, zonedMetadataBytes(zm.zoneLen()));
int metaV = Layout.createMetadataVector(fbb, zonedMetadataBytes(zm.zoneLen(), zm.hasMinMax()));
return Layout.createLayout(fbb, LAYOUT_ZONED, colRows, metaV, childV, 0);
}

/// `vortex.stats` metadata: `u32` zone length (LE) + a 1-byte stat bitset with the MAX and
/// MIN bits set (LSB-first), matching [io.github.dfa1.vortex.inspect] `ZonedStatsSchema`.
private static byte[] zonedMetadataBytes(long zoneLen) {
/// `vortex.stats` metadata: `u32` zone length (LE) + a 1-byte stat bitset (LSB-first) with the
/// NULL_COUNT bit always set and the MAX/MIN bits set when present, matching
/// [io.github.dfa1.vortex.inspect] `ZonedStatsSchema`.
private static byte[] zonedMetadataBytes(long zoneLen, boolean hasMinMax) {
byte[] meta = new byte[5];
ByteBuffer.wrap(meta).order(ByteOrder.LITTLE_ENDIAN).putInt((int) zoneLen);
meta[4] = (byte) ((1 << STAT_MAX) | (1 << STAT_MIN) | (1 << STAT_NULL_COUNT));
int bits = 1 << STAT_NULL_COUNT;
if (hasMinMax) {
bits |= (1 << STAT_MAX) | (1 << STAT_MIN);
}
meta[4] = (byte) bits;
return meta;
}

Expand Down Expand Up @@ -1262,8 +1277,9 @@ boolean hasStats() {
}

/// Per-column zone-map: the flat segment holding the per-zone stats table, the zone
/// count (one zone per chunk), and the logical rows per zone.
private record ZoneMapRef(int zonesSegIdx, long nZones, long zoneLen) {
/// count (one zone per chunk), the logical rows per zone, and whether the table carries
/// MIN/MAX (else NULL_COUNT only).
private record ZoneMapRef(int zonesSegIdx, long nZones, long zoneLen, boolean hasMinMax) {
}

private record DictColRef(int valuesSegIdx, long valuesLen, List<Integer> codesSegIdxes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ void noChunks_emitsNoZoneMap(@TempDir Path tmp) throws IOException {
}

@Test
void chunkWithoutStats_skipsZoneMap(@TempDir Path tmp) throws IOException {
// Given a column with one normal chunk and one empty chunk (no min/max stats): not every
// chunk carries stats, so flushZoneMaps skips the column (the all-stats guard).
void chunkWithoutStats_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOException {
// Given a column with one normal chunk and one empty chunk (no min/max stats): MIN/MAX is
// dropped, but NULL_COUNT is still emitted — the zone-map carries the NULL_COUNT bit only.
DType.Struct schema = new DType.Struct(
List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false);
WriteOptions opts = new WriteOptions(2, true, 0.90, 0, false, false);
Expand All @@ -209,9 +209,49 @@ void chunkWithoutStats_skipsZoneMap(@TempDir Path tmp) throws IOException {
sut.writeChunk(Map.of("v", new long[]{}));
}

// When / Then
// When / Then — zoned with the NULL_COUNT-only bitset (bit 6 = 0x40)
try (VortexReader reader = VortexReader.open(file)) {
assertThat(reader.layout().children().get(0).isZoned()).isFalse();
Layout column = reader.layout().children().get(0);
assertThat(column.isZoned()).isTrue();
ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN);
assertThat(meta.get(meta.position() + 4)).isEqualTo((byte) 0x40);
}
}

@Test
void nonPrimitiveColumn_emitsNullCountOnlyZoneMap(@TempDir Path tmp) throws IOException {
    // Given — a nullable Utf8 column (no min/max stats for this type) written as two
    // zones of two rows each: zone 0 = ["a", null], zone 1 = [null, null].
    DType.Struct schema = new DType.Struct(
            List.of("s"), List.of(new DType.Utf8(true)), false);
    WriteOptions opts = new WriteOptions(2, true, 0.90, 0, false, false);
    Path file = tmp.resolve("utf8.vtx");

    var zone0 = new io.github.dfa1.vortex.writer.encode.NullableData(
            new String[]{"a", ""}, new boolean[]{true, false});
    var zone1 = new io.github.dfa1.vortex.writer.encode.NullableData(
            new String[]{"", ""}, new boolean[]{false, false});

    try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
         var sut = VortexWriter.create(ch, schema, opts)) {
        sut.writeChunk(Map.of("s", zone0));
        sut.writeChunk(Map.of("s", zone1));
    }

    // When — the file is reopened and the column's zone-map is inspected
    try (VortexReader reader = VortexReader.open(file)) {
        Layout column = reader.layout().children().get(0);

        // Then — the column is zoned and its stat bitset (the byte after the u32 zone
        // length) has only bit 6 set: 0x40 = NULL_COUNT only.
        assertThat(column.isZoned()).isTrue();
        ByteBuffer meta = column.metadata().duplicate().order(ByteOrder.LITTLE_ENDIAN);
        int bitsetOffset = meta.position() + 4;
        assertThat(meta.get(bitsetOffset)).isEqualTo((byte) 0x40);

        // The second child of the zoned layout is the flat layout holding the stats table.
        Layout zonesFlat = column.children().get(1);
        var statsSegmentIdx = zonesFlat.segments().getFirst();
        SegmentSpec spec = reader.footer().segmentSpecs().get(statsSegmentIdx);
        DType.Struct statsDtype = new DType.Struct(
                List.of("null_count"), List.of(new DType.Primitive(PType.U64, true)), false);

        try (Arena arena = Arena.ofConfined()) {
            // A one-field stats table comes back as the bare (masked) field, not a StructArray.
            MaskedArray decoded = (MaskedArray) reader.decodeFlatSegment(spec, statsDtype, 2, arena);
            LongArray perZoneNulls = (LongArray) decoded.inner();

            // Then — zone 0 has one null, zone 1 has two
            assertThat(perZoneNulls.getLong(0)).isEqualTo(1);
            assertThat(perZoneNulls.getLong(1)).isEqualTo(2);
        }
    }
}

Expand Down