From 71bf687a2a3dc8e8b4a5611c1160e3c49212f886 Mon Sep 17 00:00:00 2001 From: Damodar Lohani Date: Thu, 21 May 2026 05:00:12 +0000 Subject: [PATCH 1/5] feat(quoting): add quoteLiteral() for atomic identifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QuotesIdentifiers::quote() splits identifiers on '.' to support qualified references like 'schema.table.column'. That assumption breaks for column names that legitimately contain a literal dot — the canonical example is ClickHouse's nested-array convention where 'meta.key Array(String)' is a top-level column whose name happens to include a dot. quoteLiteral() wraps + escapes a single identifier without splitting, so callers that know the identifier is atomic (column-emission contexts) can preserve the dot. --- src/Query/QuotesIdentifiers.php | 24 ++++++++++++++ src/Query/Schema.php | 2 ++ src/Query/Schema/MongoDB.php | 5 +++ .../Fixture/QuotesIdentifiersHarness.php | 1 + tests/Query/QuotesIdentifiersTest.php | 32 +++++++++++++++++++ 5 files changed, 64 insertions(+) diff --git a/src/Query/QuotesIdentifiers.php b/src/Query/QuotesIdentifiers.php index 6a4b7c6..608411d 100644 --- a/src/Query/QuotesIdentifiers.php +++ b/src/Query/QuotesIdentifiers.php @@ -41,4 +41,28 @@ protected function quote(string $identifier): string return \implode('.', $wrapped); } + + /** + * Quote a single identifier without treating dots as qualifier separators. + * + * Use when the identifier is known to be atomic — e.g. a column name in a + * CREATE TABLE definition where the dot is a literal part of the name + * rather than a `schema.table.column` separator. The canonical case is + * ClickHouse's nested-array convention (`meta.key Array(String)`) where + * `meta.key` is a single top-level column whose name contains a dot. + */ + protected function quoteLiteral(string $identifier): string + { + if ($identifier === '*') { + return '*'; + } + + if (\preg_match('/[\x00-\x1f\x7f]/', $identifier) === 1) { + throw new ValidationException('Identifier contains control character'); + } + + return $this->wrapChar + . \str_replace($this->wrapChar, $this->wrapChar . $this->wrapChar, $identifier) + . $this->wrapChar; + } } diff --git a/src/Query/Schema.php b/src/Query/Schema.php index 0604e1c..e6f9c37 100644 --- a/src/Query/Schema.php +++ b/src/Query/Schema.php @@ -26,6 +26,8 @@ public function setExecutor(Closure $executor): static abstract protected function quote(string $identifier): string; + abstract protected function quoteLiteral(string $identifier): string; + abstract protected function compileColumnType(Column $column): string; abstract protected function compileAutoIncrement(): string; diff --git a/src/Query/Schema/MongoDB.php b/src/Query/Schema/MongoDB.php index ee42be3..d3a6a9c 100644 --- a/src/Query/Schema/MongoDB.php +++ b/src/Query/Schema/MongoDB.php @@ -24,6 +24,11 @@ protected function quote(string $identifier): string return $identifier; } + protected function quoteLiteral(string $identifier): string + { + return $identifier; + } + protected function compileColumnType(Column $column): string { if ($column->userTypeName !== null) { diff --git a/tests/Query/Fixture/QuotesIdentifiersHarness.php b/tests/Query/Fixture/QuotesIdentifiersHarness.php index fd62356..a7d667c 100644 --- a/tests/Query/Fixture/QuotesIdentifiersHarness.php +++ b/tests/Query/Fixture/QuotesIdentifiersHarness.php @@ -11,5 +11,6 @@ final class QuotesIdentifiersHarness { use QuotesIdentifiers { quote as public; + quoteLiteral as public; } } diff --git a/tests/Query/QuotesIdentifiersTest.php b/tests/Query/QuotesIdentifiersTest.php index 36df425..b242a0d 100644 --- a/tests/Query/QuotesIdentifiersTest.php +++ b/tests/Query/QuotesIdentifiersTest.php @@ -4,6 +4,7 @@ use PHPUnit\Framework\TestCase; use Tests\Query\Fixture\QuotesIdentifiersHarness; +use Utopia\Query\Exception\ValidationException; final class QuotesIdentifiersTest extends TestCase { @@ -53,4 +54,35 @@ public function testStarOnlyAllowedBareInFinalSegment(): void { $this->assertSame('`a`.`b`.*', $this->wrapper->quote('a.b.*')); } + + public function testQuoteLiteralPreservesDot(): void + { + $this->assertSame('`meta.key`', $this->wrapper->quoteLiteral('meta.key')); + } + + public function testQuoteLiteralWrapsPlainIdentifier(): void + { + $this->assertSame('`plain_name`', $this->wrapper->quoteLiteral('plain_name')); + } + + public function testQuoteLiteralDoublesWrapChar(): void + { + $this->assertSame('```weird```', $this->wrapper->quoteLiteral('`weird`')); + } + + public function testQuoteLiteralPreservesBareStar(): void + { + $this->assertSame('*', $this->wrapper->quoteLiteral('*')); + } + + public function testQuoteLiteralTreatsTrailingStarAsLiteral(): void + { + $this->assertSame('`users.*`', $this->wrapper->quoteLiteral('users.*')); + } + + public function testQuoteLiteralRejectsControlCharacter(): void + { + $this->expectException(ValidationException::class); + $this->wrapper->quoteLiteral("contains\x00null"); + } } From d036e0b3de84e0990012ed5e87bbae0d11bcc8ce Mon Sep 17 00:00:00 2001 From: Damodar Lohani Date: Thu, 21 May 2026 05:00:25 +0000 Subject: [PATCH 2/5] fix(schema): use quoteLiteral() when emitting column names in CREATE/ALTER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Column names appearing in CREATE TABLE column lists, ALTER TABLE ADD/DROP/RENAME COLUMN, PRIMARY KEY/UNIQUE/FOREIGN KEY column lists, index column lists, and the ClickHouse ORDER BY / engine column-arg positions are atomic by definition — no schema/table qualifier is allowed in those positions. Use quoteLiteral() there so identifiers that contain a literal '.' (e.g. ClickHouse 'meta.key Array(String)') emit as a single backtick-wrapped token instead of being split into two segments. Qualifier-friendly call sites (table names, FROM clauses, FK refTable, view/procedure/trigger/sequence names, etc.) keep using quote() so 'schema.table' style references continue to work. Also covers the ClickHouse engine column-arg positions (ReplacingMergeTree, SummingMergeTree, CollapsingMergeTree) and COMMENT ON COLUMN, which all take atomic column names. --- src/Query/Schema.php | 26 +++++++++++++------------- src/Query/Schema/ClickHouse.php | 24 ++++++++++++------------ src/Query/Schema/MySQL.php | 4 ++-- src/Query/Schema/PostgreSQL.php | 20 ++++++++++---------- src/Query/Schema/SQLite.php | 2 +- src/Query/Schema/Trait/ForeignKeys.php | 4 ++-- tests/Query/Schema/ClickHouseTest.php | 16 ++++++++++++++++ 7 files changed, 56 insertions(+), 40 deletions(-) diff --git a/src/Query/Schema.php b/src/Query/Schema.php index e6f9c37..bf590f9 100644 --- a/src/Query/Schema.php +++ b/src/Query/Schema.php @@ -53,7 +53,7 @@ public function compileCreate(Table $table, bool $ifNotExists = false): Statemen $columnDefs[] = $def; if ($column->isPrimary) { - $primaryKeys[] = $this->quote($column->name); + $primaryKeys[] = $this->quoteLiteral($column->name); } if ($column->isUnique) { $uniqueColumns[] = $column->name; @@ -74,13 +74,13 @@ public function compileCreate(Table $table, bool $ifNotExists = false): Statemen $columnDefs[] = 'PRIMARY KEY (' . \implode(', ', $primaryKeys) . ')'; } elseif (! empty($table->compositePrimaryKey)) { $columnDefs[] = 'PRIMARY KEY (' - . \implode(', ', \array_map(fn (string $c): string => $this->quote($c), $table->compositePrimaryKey)) + . \implode(', ', \array_map(fn (string $c): string => $this->quoteLiteral($c), $table->compositePrimaryKey)) . ')'; } // Inline UNIQUE constraints for columns marked unique foreach ($uniqueColumns as $col) { - $columnDefs[] = 'UNIQUE (' . $this->quote($col) . ')'; + $columnDefs[] = 'UNIQUE (' . $this->quoteLiteral($col) . ')'; } // Table-level CHECK constraints @@ -107,9 +107,9 @@ public function compileCreate(Table $table, bool $ifNotExists = false): Statemen // Foreign keys foreach ($table->foreignKeys as $fk) { - $def = 'FOREIGN KEY (' . $this->quote($fk->column) . ')' + $def = 'FOREIGN KEY (' . $this->quoteLiteral($fk->column) . ')' . ' REFERENCES ' . $this->quote($fk->refTable) - . ' (' . $this->quote($fk->refColumn) . ')'; + . ' (' . $this->quoteLiteral($fk->refColumn) . ')'; if ($fk->onDelete !== null) { $def .= ' ON DELETE ' . $fk->onDelete->toSql(); } @@ -140,18 +140,18 @@ public function compileAlter(Table $table): Statement $keyword = $column->isModify ? 'MODIFY COLUMN' : 'ADD COLUMN'; $def = $keyword . ' ' . $this->compileColumnDefinition($column); if ($column->after !== null) { - $def .= ' AFTER ' . $this->quote($column->after); + $def .= ' AFTER ' . $this->quoteLiteral($column->after); } $alterations[] = $def; } foreach ($table->renameColumns as $rename) { - $alterations[] = 'RENAME COLUMN ' . $this->quote($rename->from) - . ' TO ' . $this->quote($rename->to); + $alterations[] = 'RENAME COLUMN ' . $this->quoteLiteral($rename->from) + . ' TO ' . $this->quoteLiteral($rename->to); } foreach ($table->dropColumns as $col) { - $alterations[] = 'DROP COLUMN ' . $this->quote($col); + $alterations[] = 'DROP COLUMN ' . $this->quoteLiteral($col); } foreach ($table->indexes as $index) { @@ -170,9 +170,9 @@ public function compileAlter(Table $table): Statement } foreach ($table->foreignKeys as $fk) { - $def = 'ADD FOREIGN KEY (' . $this->quote($fk->column) . ')' + $def = 'ADD FOREIGN KEY (' . $this->quoteLiteral($fk->column) . ')' . ' REFERENCES ' . $this->quote($fk->refTable) - . ' (' . $this->quote($fk->refColumn) . ')'; + . ' (' . $this->quoteLiteral($fk->refColumn) . ')'; if ($fk->onDelete !== null) { $def .= ' ON DELETE ' . $fk->onDelete->toSql(); } @@ -269,7 +269,7 @@ public function dropIndex(string $table, string $name): Statement protected function compileColumnDefinition(Column $column): string { $parts = [ - $this->quote($column->name), + $this->quoteLiteral($column->name), $this->compileColumnType($column), ]; @@ -371,7 +371,7 @@ protected function compileIndexColumns(Schema\Index $index): string $parts = []; foreach ($index->columns as $col) { - $part = $this->quote($col); + $part = $this->quoteLiteral($col); if (isset($index->collations[$col])) { $collation = $index->collations[$col]; diff --git a/src/Query/Schema/ClickHouse.php b/src/Query/Schema/ClickHouse.php index adacd7e..013f283 100644 --- a/src/Query/Schema/ClickHouse.php +++ b/src/Query/Schema/ClickHouse.php @@ -144,7 +144,7 @@ protected function compileColumnDefinition(Column $column): string } $parts = [ - $this->quote($column->name), + $this->quoteLiteral($column->name), $this->compileColumnType($column), ]; @@ -190,12 +190,12 @@ public function compileAlter(Table $table): Statement } foreach ($table->renameColumns as $rename) { - $alterations[] = 'RENAME COLUMN ' . $this->quote($rename->from) - . ' TO ' . $this->quote($rename->to); + $alterations[] = 'RENAME COLUMN ' . $this->quoteLiteral($rename->from) + . ' TO ' . $this->quoteLiteral($rename->to); } foreach ($table->dropColumns as $col) { - $alterations[] = 'DROP COLUMN ' . $this->quote($col); + $alterations[] = 'DROP COLUMN ' . $this->quoteLiteral($col); } foreach ($table->dropIndexes as $name) { @@ -246,7 +246,7 @@ public function compileCreate(Table $table, bool $ifNotExists = false): Statemen $columnDefs[] = $def; if ($column->isPrimary) { - $primaryKeys[] = $this->quote($column->name); + $primaryKeys[] = $this->quoteLiteral($column->name); } } @@ -255,7 +255,7 @@ public function compileCreate(Table $table, bool $ifNotExists = false): Statemen } if (empty($primaryKeys) && ! empty($table->compositePrimaryKey)) { - $primaryKeys = \array_map(fn (string $c): string => $this->quote($c), $table->compositePrimaryKey); + $primaryKeys = \array_map(fn (string $c): string => $this->quoteLiteral($c), $table->compositePrimaryKey); } foreach ($table->rawColumnDefs as $rawDef) { @@ -294,7 +294,7 @@ public function compileCreate(Table $table, bool $ifNotExists = false): Statemen $sql .= ' ORDER BY ' . $table->orderByRaw; } else { $orderBy = ! empty($table->orderBy) - ? \array_map(fn (string $c): string => $this->quote($c), $table->orderBy) + ? \array_map(fn (string $c): string => $this->quoteLiteral($c), $table->orderBy) : $primaryKeys; $sql .= ! empty($orderBy) @@ -337,7 +337,7 @@ public function compileCreate(Table $table, bool $ifNotExists = false): Statemen */ private function compileSkipIndex(Index $index): string { - $cols = \array_map(fn (string $c): string => $this->quote($c), $index->columns); + $cols = \array_map(fn (string $c): string => $this->quoteLiteral($c), $index->columns); $expr = \count($cols) === 1 ? $cols[0] : '(' . \implode(', ', $cols) . ')'; if ($index->algorithm === null) { @@ -383,16 +383,16 @@ private function compileEngine(Engine $engine, array $args): string Engine::AggregatingMergeTree => $engine->value . '()', Engine::ReplacingMergeTree => $engine->value . '(' - . (isset($args[0]) ? $this->quote($args[0]) : '') + . (isset($args[0]) ? $this->quoteLiteral($args[0]) : '') . ')', Engine::SummingMergeTree => $engine->value . '(' . (empty($args) ? '' - : \implode(', ', \array_map(fn (string $c): string => $this->quote($c), $args))) + : \implode(', ', \array_map(fn (string $c): string => $this->quoteLiteral($c), $args))) . ')', - Engine::CollapsingMergeTree => $engine->value . '(' . $this->quote($args[0]) . ')', + Engine::CollapsingMergeTree => $engine->value . '(' . $this->quoteLiteral($args[0]) . ')', Engine::ReplicatedMergeTree => $engine->value . "('" . \str_replace("'", "''", $args[0]) . "'" @@ -470,7 +470,7 @@ public function commentOnTable(string $table, string $comment): Statement public function commentOnColumn(string $table, string $column, string $comment): Statement { return new Statement( - 'ALTER TABLE ' . $this->quote($table) . ' COMMENT COLUMN ' . $this->quote($column) . " '" . str_replace(['\\', "'"], ['\\\\', "''"], $comment) . "'", + 'ALTER TABLE ' . $this->quote($table) . ' COMMENT COLUMN ' . $this->quoteLiteral($column) . " '" . str_replace(['\\', "'"], ['\\\\', "''"], $comment) . "'", [], executor: $this->executor, ); diff --git a/src/Query/Schema/MySQL.php b/src/Query/Schema/MySQL.php index 0a4eb6a..b5747de 100644 --- a/src/Query/Schema/MySQL.php +++ b/src/Query/Schema/MySQL.php @@ -101,7 +101,7 @@ public function changeColumn(string $table, string $oldName, string $newName, st { return new Statement( 'ALTER TABLE ' . $this->quote($table) - . ' CHANGE COLUMN ' . $this->quote($oldName) . ' ' . $this->quote($newName) . ' ' . $type, + . ' CHANGE COLUMN ' . $this->quoteLiteral($oldName) . ' ' . $this->quoteLiteral($newName) . ' ' . $type, [], executor: $this->executor, ); @@ -114,7 +114,7 @@ public function modifyColumn(string $table, string $name, string $type): Stateme { return new Statement( 'ALTER TABLE ' . $this->quote($table) - . ' MODIFY ' . $this->quote($name) . ' ' . $type, + . ' MODIFY ' . $this->quoteLiteral($name) . ' ' . $type, [], executor: $this->executor, ); diff --git a/src/Query/Schema/PostgreSQL.php b/src/Query/Schema/PostgreSQL.php index 01b4e6a..616886f 100644 --- a/src/Query/Schema/PostgreSQL.php +++ b/src/Query/Schema/PostgreSQL.php @@ -99,7 +99,7 @@ protected function compileUnsigned(): string protected function compileColumnDefinition(Column $column): string { $parts = [ - $this->quote($column->name), + $this->quoteLiteral($column->name), $this->compileColumnType($column), ]; @@ -149,7 +149,7 @@ protected function compileColumnDefinition(Column $column): string // PostgreSQL enum emulation via CHECK constraint if ($column->type === ColumnType::Enum && ! empty($column->enumValues)) { $values = \array_map(fn (string $v): string => "'" . \str_replace(['\\', "'"], ['\\\\', "''"], $v) . "'", $column->enumValues); - $parts[] = 'CHECK (' . $this->quote($column->name) . ' IN (' . \implode(', ', $values) . '))'; + $parts[] = 'CHECK (' . $this->quoteLiteral($column->name) . ' IN (' . \implode(', ', $values) . '))'; } if ($column->checkExpression !== null) { @@ -352,7 +352,7 @@ public function compileAlter(Table $table): Statement foreach ($table->columns as $column) { $keyword = $column->isModify ? 'ALTER COLUMN' : 'ADD COLUMN'; if ($column->isModify) { - $def = $keyword . ' ' . $this->quote($column->name) + $def = $keyword . ' ' . $this->quoteLiteral($column->name) . ' TYPE ' . $this->compileColumnType($column); } else { $def = $keyword . ' ' . $this->compileColumnDefinition($column); @@ -361,18 +361,18 @@ public function compileAlter(Table $table): Statement } foreach ($table->renameColumns as $rename) { - $alterations[] = 'RENAME COLUMN ' . $this->quote($rename->from) - . ' TO ' . $this->quote($rename->to); + $alterations[] = 'RENAME COLUMN ' . $this->quoteLiteral($rename->from) + . ' TO ' . $this->quoteLiteral($rename->to); } foreach ($table->dropColumns as $col) { - $alterations[] = 'DROP COLUMN ' . $this->quote($col); + $alterations[] = 'DROP COLUMN ' . $this->quoteLiteral($col); } foreach ($table->foreignKeys as $fk) { - $def = 'ADD FOREIGN KEY (' . $this->quote($fk->column) . ')' + $def = 'ADD FOREIGN KEY (' . $this->quoteLiteral($fk->column) . ')' . ' REFERENCES ' . $this->quote($fk->refTable) - . ' (' . $this->quote($fk->refColumn) . ')'; + . ' (' . $this->quoteLiteral($fk->refColumn) . ')'; if ($fk->onDelete !== null) { $def .= ' ON DELETE ' . $fk->onDelete->toSql(); } @@ -500,7 +500,7 @@ public function alterColumnType(string $table, string $column, string $type, str } $sql = 'ALTER TABLE ' . $this->quote($table) - . ' ALTER COLUMN ' . $this->quote($column) + . ' ALTER COLUMN ' . $this->quoteLiteral($column) . ' TYPE ' . $type; if ($using !== '') { @@ -589,7 +589,7 @@ public function commentOnTable(string $table, string $comment): Statement public function commentOnColumn(string $table, string $column, string $comment): Statement { return new Statement( - 'COMMENT ON COLUMN ' . $this->quote($table) . '.' . $this->quote($column) . " IS '" . str_replace(['\\', "'"], ['\\\\', "''"], $comment) . "'", + 'COMMENT ON COLUMN ' . $this->quote($table) . '.' . $this->quoteLiteral($column) . " IS '" . str_replace(['\\', "'"], ['\\\\', "''"], $comment) . "'", [], executor: $this->executor, ); diff --git a/src/Query/Schema/SQLite.php b/src/Query/Schema/SQLite.php index 99ede13..8ed59b3 100644 --- a/src/Query/Schema/SQLite.php +++ b/src/Query/Schema/SQLite.php @@ -62,7 +62,7 @@ protected function compileColumnDefinition(Column $column): string } $parts = [ - $this->quote($column->name), + $this->quoteLiteral($column->name), $this->compileColumnType($column), 'PRIMARY KEY', $this->compileAutoIncrement(), diff --git a/src/Query/Schema/Trait/ForeignKeys.php b/src/Query/Schema/Trait/ForeignKeys.php index 8e60ee2..6fe9cae 100644 --- a/src/Query/Schema/Trait/ForeignKeys.php +++ b/src/Query/Schema/Trait/ForeignKeys.php @@ -18,9 +18,9 @@ public function addForeignKey( ): Statement { $sql = 'ALTER TABLE ' . $this->quote($table) . ' ADD CONSTRAINT ' . $this->quote($name) - . ' FOREIGN KEY (' . $this->quote($column) . ')' + . ' FOREIGN KEY (' . $this->quoteLiteral($column) . ')' . ' REFERENCES ' . $this->quote($refTable) - . ' (' . $this->quote($refColumn) . ')'; + . ' (' . $this->quoteLiteral($refColumn) . ')'; if ($onDelete !== null) { $sql .= ' ON DELETE ' . $onDelete->toSql(); diff --git a/tests/Query/Schema/ClickHouseTest.php b/tests/Query/Schema/ClickHouseTest.php index 1c9803b..da3e548 100644 --- a/tests/Query/Schema/ClickHouseTest.php +++ b/tests/Query/Schema/ClickHouseTest.php @@ -1311,6 +1311,22 @@ public function testCreateTableArrayColumn(): void ); } + public function testCreateTableArrayWithDottedColumnName(): void + { + $schema = new Schema(); + $result = $schema->table('events') + ->bigInteger('id')->primary() + ->array('meta.key', ColumnType::String) + ->array('meta.value', ColumnType::String) + ->create(); + $this->assertBindingCount($result); + + $this->assertSame( + 'CREATE TABLE `events` (`id` Int64, `meta.key` Array(String), `meta.value` Array(String)) ENGINE = MergeTree() ORDER BY (`id`)', + $result->query, + ); + } + public function testCreateTableArrayUnsignedInteger(): void { $schema = new Schema(); From 19e02cf487495380ec17b229854c48e14f6fc643 Mon Sep 17 00:00:00 2001 From: Damodar Lohani Date: Thu, 21 May 2026 05:35:31 +0000 Subject: [PATCH 3/5] feat(clickhouse): add bulk-insert builder for FORMAT envelope payloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Routes ClickHouse's canonical `INSERT INTO FORMAT ` bulk- ingest path through the typed builder. `Builder\ClickHouse::bulkInsert()` takes a `Format` enum and an iterable of associative rows and returns a `FormattedInsertStatement` whose `->query` is the envelope and whose new `->body` field is the serialized payload — callers ship both to the ClickHouse HTTP interface without hand-assembling either side. The `Format` enum supports `JSONEachRow` (encoded with JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE, one row per line, no trailing newline) and `TabSeparated` (escapes `\\`, `\t`, `\n`, `\r`; emits `\N` for null; booleans as 0/1). Empty iterables yield an empty body, which ClickHouse accepts as a zero-row ingest. Rows are materialized eagerly so the statement remains a plain readonly value object. `FormattedInsertStatement` gains an optional `?string $body` property (default null) that preserves back-compat for the existing `insertFormat()` + `insert()` envelope-only path. Callers who only want the envelope (e.g. when streaming the payload from elsewhere) keep using that path; callers who want a single typed call switch to `bulkInsert()`. --- README.md | 41 +++ src/Query/Builder/ClickHouse.php | 67 ++++ src/Query/Builder/ClickHouse/Format.php | 117 +++++++ .../ClickHouse/FormattedInsertStatement.php | 3 + .../Feature/ClickHouse/BulkInsertTest.php | 308 ++++++++++++++++++ 5 files changed, 536 insertions(+) create mode 100644 src/Query/Builder/ClickHouse/Format.php create mode 100644 tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php diff --git a/README.md b/README.md index 6e82d7a..9a7b497 100644 --- a/README.md +++ b/README.md @@ -1520,6 +1520,47 @@ $result->namedBindings; Unregistered columns fall through to value-based inference: `int → Int64`, `float → Float64`, `bool → UInt8`, `null → Nullable(String)`, `DateTimeInterface → DateTime64(3)`, everything else → `String`. Register types via `withParamType($column, $type)` or `withParamTypes($map)` whenever the inference rule doesn't match the column's ClickHouse declaration. The positional `$bindings` array is still exposed on the resulting `Statement` for callers that prefer it. +**Bulk insert** — emit the canonical `INSERT INTO
FORMAT ` envelope together with the serialized row payload in a single typed call. The returned `FormattedInsertStatement` exposes `->query` (the envelope) and `->body` (the format-specific payload) so the caller can ship both to ClickHouse's HTTP interface without hand-assembling either side: + +```php +use Utopia\Query\Builder\ClickHouse as Builder; +use Utopia\Query\Builder\ClickHouse\Format; + +$statement = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [ + ['id' => 1, 'event' => 'login', 'time' => '2024-01-01 00:00:00'], + ['id' => 2, 'event' => 'logout', 'time' => '2024-01-01 00:00:05'], + ]); + +// $statement->query +// INSERT INTO `events` (`id`, `event`, `time`) FORMAT JSONEachRow +// +// $statement->body +// {"id":1,"event":"login","time":"2024-01-01 00:00:00"} +// {"id":2,"event":"logout","time":"2024-01-01 00:00:05"} +``` + +Ship the result over the HTTP interface by passing `$statement->query` as the `?query=` parameter and `$statement->body` as the POST body. Columns are derived from the first row's keys; pass an explicit third argument to pin the order or fill missing keys with `null`: + +```php +$statement = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, $rows, ['id', 'event', 'time']); +``` + +The `Format` enum currently supports `Format::JSONEachRow` and `Format::TabSeparated`. JSONEachRow rows are encoded with `JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE` (slashes and non-ASCII are preserved verbatim); TabSeparated escapes `\\`, `\t`, `\n`, `\r` and emits `\N` for `null`. An empty row iterable produces an empty body, which ClickHouse accepts as a zero-row ingest. The iterable is consumed eagerly — pass a generator if you want to defer row construction, but the serialized body is materialized in full before the statement is returned. + +For envelopes only (no body — e.g. when streaming the payload from elsewhere), the lower-level `insertFormat()` setter remains available and pairs with `insert()` as before: + +```php +$statement = (new Builder()) + ->into('events') + ->insertFormat('JSONEachRow', ['id', 'event', 'time']) + ->insert(); +// $statement->body is null; assemble the payload separately. +``` + **UPDATE** — compiles to `ALTER TABLE ... UPDATE` with mandatory WHERE: ```php diff --git a/src/Query/Builder/ClickHouse.php b/src/Query/Builder/ClickHouse.php index 06fbc1d..223bfc7 100644 --- a/src/Query/Builder/ClickHouse.php +++ b/src/Query/Builder/ClickHouse.php @@ -3,6 +3,7 @@ namespace Utopia\Query\Builder; use Utopia\Query\Builder as BaseBuilder; +use Utopia\Query\Builder\ClickHouse\Format; use Utopia\Query\Builder\ClickHouse\FormattedInsertStatement; use Utopia\Query\Builder\Feature\BitwiseAggregates; use Utopia\Query\Builder\Feature\ClickHouse\ApproximateAggregates; @@ -163,6 +164,72 @@ public function insertFormat(string $format, array $columns = []): static return $this; } + /** + * Build a single statement that carries both the `INSERT INTO
+ * FORMAT ` envelope and the serialized row payload for a + * ClickHouse bulk ingest. Returns a `FormattedInsertStatement` whose + * `->query` is the envelope and whose `->body` is the formatted + * payload to send as the HTTP request body. + * + * The target table must be set via `into()` first. Columns are derived + * from the keys of the first row when `$columns` is omitted; pass + * `$columns` explicitly to pin the order when row shapes vary or when + * an empty iterable is passed. An empty iterable produces an empty + * body — ClickHouse accepts this as a zero-row ingest. + * + * @param iterable> $rows + * @param list $columns Optional explicit column ordering. + */ + public function bulkInsert(Format $format, iterable $rows, array $columns = []): FormattedInsertStatement + { + $this->bindings = []; + $this->validateTable(); + + $materialized = []; + foreach ($rows as $row) { + /** @phpstan-ignore function.alreadyNarrowedType */ + if (!\is_array($row)) { + throw new ValidationException('bulkInsert() rows must be associative arrays.'); + } + $materialized[] = $row; + } + + if (empty($columns) && !empty($materialized)) { + $columns = \array_keys($materialized[0]); + } + + foreach ($columns as $col) { + if ($col === '') { + throw new ValidationException('Column names for bulkInsert() must be non-empty strings.'); + } + } + + $wrappedColumns = empty($columns) + ? '' + : ' (' . \implode(', ', \array_map( + fn (string $col): string => $this->resolveAndWrap($col), + $columns + )) . ')'; + + $sql = 'INSERT INTO ' . $this->quote($this->table) + . $wrappedColumns + . ' FORMAT ' . $format->value; + + $body = $format->serialize($materialized, empty($columns) ? null : $columns); + + $this->insertFormat = $format->value; + $this->insertFormatColumns = $columns; + + return new FormattedInsertStatement( + $sql, + [], + $columns, + $format->value, + $body, + executor: $this->executor, + ); + } + /** * @param array $settings */ diff --git a/src/Query/Builder/ClickHouse/Format.php b/src/Query/Builder/ClickHouse/Format.php new file mode 100644 index 0000000..5b0bc68 --- /dev/null +++ b/src/Query/Builder/ClickHouse/Format.php @@ -0,0 +1,117 @@ + FORMAT ` envelope. Each case + * knows how to serialize a row iterable into the request body that + * ClickHouse expects for that format. + */ +enum Format: string +{ + case JSONEachRow = 'JSONEachRow'; + case TabSeparated = 'TabSeparated'; + + /** + * Serialize an iterable of associative rows into the body payload for + * this format. Columns are derived from the first row; subsequent rows + * use the same column ordering. An empty iterable yields an empty + * string — ClickHouse accepts an empty body as a zero-row insert. + * + * @param iterable> $rows + * @param list|null $columns Optional explicit column ordering. When null, derived from the first row. + */ + public function serialize(iterable $rows, ?array $columns = null): string + { + return match ($this) { + self::JSONEachRow => $this->serializeJsonEachRow($rows, $columns), + self::TabSeparated => $this->serializeTabSeparated($rows, $columns), + }; + } + + /** + * @param iterable> $rows + * @param list|null $columns + */ + private function serializeJsonEachRow(iterable $rows, ?array $columns): string + { + $lines = []; + foreach ($rows as $row) { + if ($columns !== null) { + $ordered = []; + foreach ($columns as $col) { + $ordered[$col] = $row[$col] ?? null; + } + $row = $ordered; + } + + $lines[] = \json_encode( + (object) $row, + JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE, + ); + } + + return \implode("\n", $lines); + } + + /** + * @param iterable> $rows + * @param list|null $columns + */ + private function serializeTabSeparated(iterable $rows, ?array $columns): string + { + $lines = []; + foreach ($rows as $row) { + $values = []; + + if ($columns === null) { + foreach ($row as $value) { + $values[] = $this->escapeTabSeparatedValue($value); + } + } else { + foreach ($columns as $col) { + $values[] = $this->escapeTabSeparatedValue($row[$col] ?? null); + } + } + + $lines[] = \implode("\t", $values); + } + + return \implode("\n", $lines); + } + + private function escapeTabSeparatedValue(mixed $value): string + { + if ($value === null) { + return '\\N'; + } + + if (\is_bool($value)) { + return $value ? '1' : '0'; + } + + if (\is_int($value) || \is_float($value)) { + return (string) $value; + } + + if (! \is_string($value)) { + if (\is_object($value) && \method_exists($value, '__toString')) { + $value = (string) $value; + } else { + throw new ValidationException('TabSeparated values must be scalar, null, or stringable. Received: ' . \get_debug_type($value)); + } + } + + return \strtr($value, [ + '\\' => '\\\\', + "\t" => '\\t', + "\n" => '\\n', + "\r" => '\\r', + ]); + } +} diff --git a/src/Query/Builder/ClickHouse/FormattedInsertStatement.php b/src/Query/Builder/ClickHouse/FormattedInsertStatement.php index 7b3c079..caa30cf 100644 --- a/src/Query/Builder/ClickHouse/FormattedInsertStatement.php +++ b/src/Query/Builder/ClickHouse/FormattedInsertStatement.php @@ -12,6 +12,7 @@ * @param list $bindings * @param list $columns * @param string $format + * @param ?string $body Serialized payload to ship as the HTTP request body alongside `$query`. Null when only the envelope query was produced (the caller assembles the body separately). * @param bool $readOnly * @param (Closure(Statement): (array|int))|null $executor */ @@ -20,6 +21,7 @@ public function __construct( array $bindings, public array $columns, public string $format, + public ?string $body = null, bool $readOnly = false, ?Closure $executor = null, ) { @@ -34,6 +36,7 @@ public function withExecutor(Closure $executor): self $this->bindings, $this->columns, $this->format, + $this->body, $this->readOnly, $executor, ); diff --git a/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php b/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php new file mode 100644 index 0000000..5c48394 --- /dev/null +++ b/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php @@ -0,0 +1,308 @@ +into('events') + ->bulkInsert(Format::JSONEachRow, [ + ['id' => 1, 'event' => 'login', 'time' => '2024-01-01 00:00:00'], + ]); + + $this->assertInstanceOf(FormattedInsertStatement::class, $result); + $this->assertSame( + 'INSERT INTO `events` (`id`, `event`, `time`) FORMAT JSONEachRow', + $result->query + ); + $this->assertSame([], $result->bindings); + $this->assertSame(['id', 'event', 'time'], $result->columns); + $this->assertSame('JSONEachRow', $result->format); + $this->assertSame( + '{"id":1,"event":"login","time":"2024-01-01 00:00:00"}', + $result->body, + ); + } + + public function testBulkInsertMultipleRowsJsonEachRow(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [ + ['id' => 1, 'event' => 'login'], + ['id' => 2, 'event' => 'logout'], + ['id' => 3, 'event' => 'view'], + ]); + + $this->assertSame( + 'INSERT INTO `events` (`id`, `event`) FORMAT JSONEachRow', + $result->query + ); + $this->assertSame( + '{"id":1,"event":"login"}' . "\n" + . '{"id":2,"event":"logout"}' . "\n" + . '{"id":3,"event":"view"}', + $result->body, + ); + $this->assertStringEndsNotWith("\n", (string) $result->body); + } + + public function testBulkInsertEmptyIterableEmitsEmptyBody(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [], ['id', 'event']); + + $this->assertSame( + 'INSERT INTO `events` (`id`, `event`) FORMAT JSONEachRow', + $result->query + ); + $this->assertSame('', $result->body); + $this->assertSame(['id', 'event'], $result->columns); + } + + public function testBulkInsertEmptyIterableWithoutColumnsOmitsColumnList(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, []); + + $this->assertSame( + 'INSERT INTO `events` FORMAT JSONEachRow', + $result->query + ); + $this->assertSame('', $result->body); + $this->assertSame([], $result->columns); + } + + public function testBulkInsertJsonEachRowEscapesSpecialCharacters(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [ + ['id' => 1, 'note' => "tab\there\nnewline\"quote\\back"], + ]); + + $this->assertSame( + '{"id":1,"note":"tab\\there\\nnewline\\"quote\\\\back"}', + $result->body, + ); + } + + public function testBulkInsertJsonEachRowPreservesUnicodeAndSlashes(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [ + ['path' => '/api/v1/events', 'label' => 'café — 日本'], + ]); + + $this->assertSame( + '{"path":"/api/v1/events","label":"café — 日本"}', + $result->body, + ); + } + + public function testBulkInsertJsonEachRowSerializesNull(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [ + ['id' => 1, 'note' => null], + ]); + + $this->assertSame('{"id":1,"note":null}', $result->body); + } + + public function testBulkInsertExplicitColumnsPinOrderAndFillMissingKeysWithNull(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert( + Format::JSONEachRow, + [ + ['id' => 1, 'event' => 'login'], + ['event' => 'view', 'id' => 2], + ['id' => 3], + ], + ['id', 'event'], + ); + + $this->assertSame(['id', 'event'], $result->columns); + $this->assertSame( + '{"id":1,"event":"login"}' . "\n" + . '{"id":2,"event":"view"}' . "\n" + . '{"id":3,"event":null}', + $result->body, + ); + } + + public function testBulkInsertPreservesLiteralDotInTableName(): void + { + $result = (new Builder()) + ->into('my.namespace') + ->bulkInsert(Format::JSONEachRow, [ + ['id' => 1], + ]); + + $this->assertSame( + 'INSERT INTO `my`.`namespace` (`id`) FORMAT JSONEachRow', + $result->query + ); + } + + public function testBulkInsertTabSeparated(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::TabSeparated, [ + ['id' => 1, 'event' => 'login'], + ['id' => 2, 'event' => 'logout'], + ]); + + $this->assertSame( + 'INSERT INTO `events` (`id`, `event`) FORMAT TabSeparated', + $result->query + ); + $this->assertSame( + "1\tlogin\n2\tlogout", + $result->body, + ); + $this->assertSame('TabSeparated', $result->format); + } + + public function testBulkInsertTabSeparatedEscapesControlCharacters(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::TabSeparated, [ + ['id' => 1, 'note' => "a\tb\nc\\d"], + ]); + + $this->assertSame( + "1\ta\\tb\\nc\\\\d", + $result->body, + ); + } + + public function testBulkInsertTabSeparatedRendersNullAsBackslashN(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::TabSeparated, [ + ['id' => 1, 'note' => null], + ]); + + $this->assertSame("1\t\\N", $result->body); + } + + public function testBulkInsertTabSeparatedRendersBooleansAsZeroOne(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::TabSeparated, [ + ['id' => 1, 'active' => true, 'archived' => false], + ]); + + $this->assertSame("1\t1\t0", $result->body); + } + + public function testBulkInsertTabSeparatedRejectsArrayValues(): void + { + $this->expectException(ValidationException::class); + + (new Builder()) + ->into('events') + ->bulkInsert(Format::TabSeparated, [ + ['id' => 1, 'tags' => ['a', 'b']], + ]); + } + + public function testBulkInsertAcceptsGenerator(): void + { + $generator = (function (): iterable { + yield ['id' => 1, 'event' => 'login']; + yield ['id' => 2, 'event' => 'logout']; + })(); + + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, $generator); + + $this->assertSame( + 'INSERT INTO `events` (`id`, `event`) FORMAT JSONEachRow', + $result->query + ); + $this->assertSame( + '{"id":1,"event":"login"}' . "\n" . '{"id":2,"event":"logout"}', + $result->body, + ); + } + + public function testBulkInsertRejectsNonAssociativeRow(): void + { + $this->expectException(ValidationException::class); + + $generator = (function (): iterable { + yield 'not-a-row'; + })(); + + (new Builder()) + ->into('events') + /** @phpstan-ignore argument.type */ + ->bulkInsert(Format::JSONEachRow, $generator); + } + + public function testBulkInsertRejectsEmptyColumnName(): void + { + $this->expectException(ValidationException::class); + + (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [['id' => 1]], ['']); + } + + public function testBulkInsertRequiresTable(): void + { + $this->expectException(ValidationException::class); + + (new Builder()) + ->bulkInsert(Format::JSONEachRow, [['id' => 1]]); + } + + public function testFormattedInsertStatementWithExecutorPreservesBody(): void + { + $result = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [['id' => 1]]); + + $executor = fn (): int => 0; + $rebound = $result->withExecutor($executor); + + $this->assertInstanceOf(FormattedInsertStatement::class, $rebound); + $this->assertSame($result->query, $rebound->query); + $this->assertSame($result->bindings, $rebound->bindings); + $this->assertSame($result->columns, $rebound->columns); + $this->assertSame($result->format, $rebound->format); + $this->assertSame($result->body, $rebound->body); + } + + public function testInsertFormatEnvelopeStillEmitsNullBodyForBackCompat(): void + { + $result = (new Builder()) + ->into('events') + ->insertFormat('JSONEachRow', ['id', 'event']) + ->insert(); + + $this->assertInstanceOf(FormattedInsertStatement::class, $result); + $this->assertNull($result->body); + } +} From b26baf9f52642d00a60763d25616e885e1a982d1 Mon Sep 17 00:00:00 2001 From: Damodar Lohani Date: Thu, 21 May 2026 06:01:27 +0000 Subject: [PATCH 4/5] refactor(clickhouse): rationalize insertFormat/bulkInsert boundary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bulkInsert() is the recommended bulk-ingest entry point; insertFormat() stays as a lower-level setter for callers that stream the body separately. Both paths now share a single private envelope emitter (compileFormatInsertEnvelope) so they cannot diverge on table quoting, column resolution, or column-name validation. Also aligns insertFormat()+insert() with bulkInsert() by accepting the no-columns envelope form (INSERT INTO t FORMAT name) — ClickHouse treats the column list as optional, and the two paths now match. --- README.md | 2 +- src/Query/Builder/ClickHouse.php | 134 +++++++++--------- .../Feature/ClickHouse/BulkInsertTest.php | 47 ++++++ .../Feature/ClickHouse/InsertFormatTest.php | 14 +- 4 files changed, 123 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 9a7b497..3a45d89 100644 --- a/README.md +++ b/README.md @@ -1551,7 +1551,7 @@ $statement = (new Builder()) The `Format` enum currently supports `Format::JSONEachRow` and `Format::TabSeparated`. JSONEachRow rows are encoded with `JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE` (slashes and non-ASCII are preserved verbatim); TabSeparated escapes `\\`, `\t`, `\n`, `\r` and emits `\N` for `null`. An empty row iterable produces an empty body, which ClickHouse accepts as a zero-row ingest. The iterable is consumed eagerly — pass a generator if you want to defer row construction, but the serialized body is materialized in full before the statement is returned. -For envelopes only (no body — e.g. when streaming the payload from elsewhere), the lower-level `insertFormat()` setter remains available and pairs with `insert()` as before: +`bulkInsert()` is the recommended entry point for FORMAT-based ingest — it covers the full envelope + body contract in one typed call. The lower-level `insertFormat()` setter pairs with `insert()` for the envelope-only path (returns `body = null`) and is retained for callers that stream the payload separately. Both paths share the same envelope emitter, so the resulting `query` is identical for equivalent inputs: ```php $statement = (new Builder()) diff --git a/src/Query/Builder/ClickHouse.php b/src/Query/Builder/ClickHouse.php index 223bfc7..40d1ee4 100644 --- a/src/Query/Builder/ClickHouse.php +++ b/src/Query/Builder/ClickHouse.php @@ -141,35 +141,12 @@ public function hint(string $hint): static } /** - * Declare a ClickHouse FORMAT pragma for the next INSERT. - * - * When a format is set, `insert()` emits - * `INSERT INTO \`t\` (\`col1\`, \`col2\`) FORMAT ` with no VALUES. - * The row payload must be streamed into the HTTP body by the caller. - * Column names are derived from the most recent `set()` call (values are - * ignored). Pass `$columns` to declare them explicitly when no `set()` - * call has been made. - * - * @param list $columns - */ - public function insertFormat(string $format, array $columns = []): static - { - if (!\preg_match('/^[A-Za-z][A-Za-z0-9_]*$/', $format)) { - throw new ValidationException('Invalid ClickHouse INSERT format: ' . $format); - } - - $this->insertFormat = $format; - $this->insertFormatColumns = $columns; - - return $this; - } - - /** - * Build a single statement that carries both the `INSERT INTO
- * FORMAT ` envelope and the serialized row payload for a - * ClickHouse bulk ingest. Returns a `FormattedInsertStatement` whose - * `->query` is the envelope and whose `->body` is the formatted - * payload to send as the HTTP request body. + * Recommended bulk-ingest entry point. Emits the canonical `INSERT INTO + *
[()] FORMAT ` envelope alongside the serialized row + * payload in a single typed call. The returned `FormattedInsertStatement` + * exposes `->query` (envelope, no bindings) and `->body` (format-specific + * payload) so the caller can ship both to ClickHouse's HTTP interface + * without hand-assembling either side. * * The target table must be set via `into()` first. Columns are derived * from the keys of the first row when `$columns` is omitted; pass @@ -182,9 +159,6 @@ public function insertFormat(string $format, array $columns = []): static */ public function bulkInsert(Format $format, iterable $rows, array $columns = []): FormattedInsertStatement { - $this->bindings = []; - $this->validateTable(); - $materialized = []; foreach ($rows as $row) { /** @phpstan-ignore function.alreadyNarrowedType */ @@ -198,22 +172,7 @@ public function bulkInsert(Format $format, iterable $rows, array $columns = []): $columns = \array_keys($materialized[0]); } - foreach ($columns as $col) { - if ($col === '') { - throw new ValidationException('Column names for bulkInsert() must be non-empty strings.'); - } - } - - $wrappedColumns = empty($columns) - ? '' - : ' (' . \implode(', ', \array_map( - fn (string $col): string => $this->resolveAndWrap($col), - $columns - )) . ')'; - - $sql = 'INSERT INTO ' . $this->quote($this->table) - . $wrappedColumns - . ' FORMAT ' . $format->value; + $sql = $this->compileFormatInsertEnvelope($format->value, $columns); $body = $format->serialize($materialized, empty($columns) ? null : $columns); @@ -230,6 +189,63 @@ public function bulkInsert(Format $format, iterable $rows, array $columns = []): ); } + /** + * Lower-level setter for the FORMAT envelope. Use `bulkInsert()` for the + * typed entry point; this method is retained for callers that need to + * stream the body payload separately (e.g. piping a pre-serialized stream + * straight into the HTTP request) — the subsequent `insert()` call emits + * the envelope only, with `body = null`. + * + * Column names are derived from the most recent `set()` call (values are + * ignored). Pass `$columns` to declare them explicitly when no `set()` + * call has been made. + * + * @param list $columns + */ + public function insertFormat(string $format, array $columns = []): static + { + if (!\preg_match('/^[A-Za-z][A-Za-z0-9_]*$/', $format)) { + throw new ValidationException('Invalid ClickHouse INSERT format: ' . $format); + } + + $this->insertFormat = $format; + $this->insertFormatColumns = $columns; + + return $this; + } + + /** + * Build the shared `INSERT INTO
[()] FORMAT ` + * envelope. Validates the table, validates column names, quotes the + * table identifier, and wraps each column via `resolveAndWrap()`. + * Resets bindings so callers don't accumulate stale values from prior + * builder operations. + * + * @param list $columns + */ + private function compileFormatInsertEnvelope(string $format, array $columns): string + { + $this->bindings = []; + $this->validateTable(); + + foreach ($columns as $col) { + if ($col === '') { + throw new ValidationException('Column names for FORMAT INSERT must be non-empty strings.'); + } + } + + $wrappedColumns = empty($columns) + ? '' + : ' (' . \implode(', ', \array_map( + fn (string $col): string => $this->resolveAndWrap($col), + $columns + )) . ')'; + + return 'INSERT INTO ' . $this->quote($this->table) + . $wrappedColumns + . ' FORMAT ' . $format; + } + /** * @param array $settings */ @@ -685,31 +701,11 @@ public function insert(): Statement return $this->applyNamedTypedBindings(parent::insert()); } - $this->bindings = []; - $this->validateTable(); - $columns = !empty($this->insertFormatColumns) ? $this->insertFormatColumns : (!empty($this->rows) ? \array_keys($this->rows[0]) : []); - if (empty($columns)) { - throw new ValidationException('No columns specified for FORMAT INSERT. Pass columns to insertFormat() or call set() before insert().'); - } - - foreach ($columns as $col) { - if ($col === '') { - throw new ValidationException('Column names for FORMAT INSERT must be non-empty strings.'); - } - } - - $wrappedColumns = \array_map( - fn (string $col): string => $this->resolveAndWrap($col), - $columns - ); - - $sql = 'INSERT INTO ' . $this->quote($this->table) - . ' (' . \implode(', ', $wrappedColumns) . ')' - . ' FORMAT ' . $format; + $sql = $this->compileFormatInsertEnvelope($format, $columns); return new FormattedInsertStatement( $sql, diff --git a/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php b/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php index 5c48394..97f009b 100644 --- a/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php +++ b/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php @@ -305,4 +305,51 @@ public function testInsertFormatEnvelopeStillEmitsNullBodyForBackCompat(): void $this->assertInstanceOf(FormattedInsertStatement::class, $result); $this->assertNull($result->body); } + + public function testBulkInsertAndInsertFormatEmitIdenticalEnvelopeForSameInputs(): void + { + $bulk = (new Builder()) + ->into('events') + ->bulkInsert(Format::JSONEachRow, [ + ['id' => 1, 'event' => 'login'], + ]); + + $envelope = (new Builder()) + ->into('events') + ->insertFormat('JSONEachRow', ['id', 'event']) + ->insert(); + + $this->assertInstanceOf(FormattedInsertStatement::class, $envelope); + $this->assertSame($bulk->query, $envelope->query); + $this->assertSame($bulk->columns, $envelope->columns); + $this->assertSame($bulk->format, $envelope->format); + } + + public function testInsertFormatEnvelopeQuotesTableWithLiteralDot(): void + { + $envelope = (new Builder()) + ->into('my.namespace') + ->insertFormat('JSONEachRow', ['id']) + ->insert(); + + $bulk = (new Builder()) + ->into('my.namespace') + ->bulkInsert(Format::JSONEachRow, [['id' => 1]]); + + $this->assertSame( + 'INSERT INTO `my`.`namespace` (`id`) FORMAT JSONEachRow', + $envelope->query, + ); + $this->assertSame($envelope->query, $bulk->query); + } + + public function testInsertFormatRejectsEmptyColumnNameMatchingBulkInsert(): void + { + $this->expectException(ValidationException::class); + + (new Builder()) + ->into('events') + ->insertFormat('JSONEachRow', ['id', '']) + ->insert(); + } } diff --git a/tests/Query/Builder/Feature/ClickHouse/InsertFormatTest.php b/tests/Query/Builder/Feature/ClickHouse/InsertFormatTest.php index 6e0138c..4ea0221 100644 --- a/tests/Query/Builder/Feature/ClickHouse/InsertFormatTest.php +++ b/tests/Query/Builder/Feature/ClickHouse/InsertFormatTest.php @@ -112,14 +112,20 @@ public function testInsertFormatRejectsEmptyColumnName(): void ->insert(); } - public function testInsertFormatRequiresColumns(): void + public function testInsertFormatWithoutColumnsEmitsBareEnvelope(): void { - $this->expectException(ValidationException::class); - - (new Builder()) + $result = (new Builder()) ->into('events') ->insertFormat('JSONEachRow') ->insert(); + + $this->assertInstanceOf(FormattedInsertStatement::class, $result); + $this->assertSame( + 'INSERT INTO `events` FORMAT JSONEachRow', + $result->query + ); + $this->assertSame([], $result->columns); + $this->assertNull($result->body); } public function testInsertWithoutFormatStillEmitsValues(): void From a0eb584fab2253d20b08f96adab2a38cd9233ceb Mon Sep 17 00:00:00 2001 From: Damodar Lohani Date: Thu, 21 May 2026 06:56:39 +0000 Subject: [PATCH 5/5] refactor(clickhouse): drop builder state mutation in bulkInsert(); document Format::serialize() column contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - bulkInsert() no longer assigns to the fluent insertFormat / insertFormatColumns fields after the envelope is compiled. The compileFormatInsertEnvelope() helper is already parameterised, so those assignments were stale state — reusing a builder instance for a subsequent regular insert() previously inherited the residual format envelope. - Format::serialize() PHPDoc now spells out the $columns === null contract: ordering is derived from the keys of the first row, with no cross-row consistency check. Positional formats (TabSeparated) corrupt silently on inconsistent row shapes; named formats (JSONEachRow) tolerate reordering but the explicit $columns argument acts as a projection filter. - New tests guard both invariants: builder-reuse after bulkInsert() and the explicit-columns ordering contract on Format::serialize() across rows whose keys vary. --- src/Query/Builder/ClickHouse.php | 3 -- src/Query/Builder/ClickHouse/Format.php | 23 ++++++++-- .../Feature/ClickHouse/BulkInsertTest.php | 44 +++++++++++++++++++ 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/src/Query/Builder/ClickHouse.php b/src/Query/Builder/ClickHouse.php index 40d1ee4..9717e02 100644 --- a/src/Query/Builder/ClickHouse.php +++ b/src/Query/Builder/ClickHouse.php @@ -176,9 +176,6 @@ public function bulkInsert(Format $format, iterable $rows, array $columns = []): $body = $format->serialize($materialized, empty($columns) ? null : $columns); - $this->insertFormat = $format->value; - $this->insertFormatColumns = $columns; - return new FormattedInsertStatement( $sql, [], diff --git a/src/Query/Builder/ClickHouse/Format.php b/src/Query/Builder/ClickHouse/Format.php index 5b0bc68..c96354b 100644 --- a/src/Query/Builder/ClickHouse/Format.php +++ b/src/Query/Builder/ClickHouse/Format.php @@ -19,12 +19,27 @@ enum Format: string /** * Serialize an iterable of associative rows into the body payload for - * this format. Columns are derived from the first row; subsequent rows - * use the same column ordering. An empty iterable yields an empty - * string — ClickHouse accepts an empty body as a zero-row insert. + * this format. An empty iterable yields an empty string — ClickHouse + * accepts an empty body as a zero-row insert. + * + * When `$columns` is null the column ordering is derived from the keys + * of the first row encountered. Subsequent rows are serialized against + * whatever shape they themselves carry — there is no cross-row + * consistency check. The implications differ per format: + * + * - For positional formats (e.g. {@see Format::TabSeparated}) values + * are emitted in row-key order. If later rows reorder their keys the + * columns silently misalign with the envelope's column list. Pass + * `$columns` explicitly whenever row shapes are not guaranteed + * identical, or whenever the format is positional. + * - For named formats (e.g. {@see Format::JSONEachRow}) key ordering + * does not affect correctness because each value is paired with its + * key in the wire format. `$columns` still acts as a projection + * filter: rows missing a listed column receive `null`, and row keys + * outside the list are dropped. * * @param iterable> $rows - * @param list|null $columns Optional explicit column ordering. When null, derived from the first row. + * @param list|null $columns Optional explicit column ordering. When null, derived from the keys of the first row. */ public function serialize(iterable $rows, ?array $columns = null): string { diff --git a/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php b/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php index 97f009b..c69d781 100644 --- a/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php +++ b/tests/Query/Builder/Feature/ClickHouse/BulkInsertTest.php @@ -352,4 +352,48 @@ public function testInsertFormatRejectsEmptyColumnNameMatchingBulkInsert(): void ->insertFormat('JSONEachRow', ['id', '']) ->insert(); } + + public function testBulkInsertDoesNotPersistFormatStateOnBuilder(): void + { + $builder = (new Builder()) + ->into('events'); + + $builder->bulkInsert(Format::JSONEachRow, [['id' => 1]]); + + $regular = $builder + ->into('users') + ->set(['name' => 'alice']) + ->insert(); + + $this->assertNotInstanceOf(FormattedInsertStatement::class, $regular); + $this->assertStringNotContainsString('FORMAT', $regular->query); + $this->assertSame( + 'INSERT INTO `users` (`name`) VALUES (?)', + $regular->query, + ); + $this->assertSame(['alice'], $regular->bindings); + } + + public function testFormatSerializeExplicitColumnsPinOrderingAcrossInconsistentRows(): void + { + $rows = [ + ['id' => 1, 'event' => 'login'], + ['event' => 'view', 'id' => 2], + ['id' => 3], + ]; + + $tabSeparated = Format::TabSeparated->serialize($rows, ['id', 'event']); + $this->assertSame( + "1\tlogin\n2\tview\n3\t\\N", + $tabSeparated, + ); + + $jsonEachRow = Format::JSONEachRow->serialize($rows, ['id', 'event']); + $this->assertSame( + '{"id":1,"event":"login"}' . "\n" + . '{"id":2,"event":"view"}' . "\n" + . '{"id":3,"event":null}', + $jsonEachRow, + ); + } }