From ad4c3d6bac2f1b34d020c7dbdb4884914e2c4c5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:36:20 +0300 Subject: [PATCH 01/32] Bump EdgeOps console embed version to v1.0.4. Align Dockerfile, Makefile, CI build action, and dev console build script defaults. --- .env.example | 2 +- .github/actions/set-build-env/action.yml | 2 +- Dockerfile | 4 ++-- Makefile | 4 ++-- scripts/build-console-dev.js | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.env.example b/.env.example index 8898b64e..06b463aa 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ NODE_ENV=development # EdgeOps Console static embed (npm run build:console → dev/console/build) EDGEOPS_CONSOLE_PATH=dev/console/build # must be absolute path -EDGEOPS_CONSOLE_VERSION=v1.0.3 +EDGEOPS_CONSOLE_VERSION=v1.0.4 # EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console # EDGEOPS_CONSOLE_FLAVOR=datasance diff --git a/.github/actions/set-build-env/action.yml b/.github/actions/set-build-env/action.yml index 7c831f55..f1ffc0e8 100644 --- a/.github/actions/set-build-env/action.yml +++ b/.github/actions/set-build-env/action.yml @@ -8,7 +8,7 @@ runs: shell: bash run: | VERSION="${{ env.EDGEOPS_CONSOLE_VERSION }}" - if [ -z "$VERSION" ]; then VERSION="1.0.3"; fi + if [ -z "$VERSION" ]; then VERSION="1.0.4"; fi echo "EDGEOPS_CONSOLE_VERSION=$VERSION" >> "${GITHUB_ENV}" REPO="${{ env.EDGEOPS_CONSOLE_REPO }}" diff --git a/Dockerfile b/Dockerfile index 3cfe9399..9155d0b1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM node:24-bookworm@sha256:fdddfb3e688158251943d52eba361de991548f6814007acba4917ae6b512d6be AS console-builder ARG EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console -ARG EDGEOPS_CONSOLE_VERSION=v1.0.3 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.4 ARG EDGEOPS_CONSOLE_FLAVOR=datasance RUN apt-get update \ @@ -50,7 +50,7 @@ RUN npm pack # ubi9/nodejs-24-minimal:latest — pin manifest list digest for reproducible 
multi-arch builds FROM registry.access.redhat.com/ubi9/nodejs-24-minimal@sha256:cc7648f8e1c7d628e4334328a712f30ea0820787bb92836cc93e349674c689bf -ARG EDGEOPS_CONSOLE_VERSION=v1.0.3 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.4 ARG IMAGE_REGISTRY ARG OCI_SOURCE_REPO ARG CONTROLLER_DISTRIBUTION=iofog diff --git a/Makefile b/Makefile index 49685283..ba700542 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Local Docker build — mirrors CI/release build-args (see .github/actions/set-build-env). -# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.3 +# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.4 FLAVOR ?= datasance IMAGE_NAME ?= controller @@ -25,7 +25,7 @@ else $(error FLAVOR must be "datasance" or "iofog", got "$(FLAVOR)") endif -EDGEOPS_CONSOLE_VERSION ?= v1.0.3 +EDGEOPS_CONSOLE_VERSION ?= v1.0.4 IMAGE_REF = $(IMAGE_REGISTRY)/$(IMAGE_NAME):$(DOCKER_TAG) diff --git a/scripts/build-console-dev.js b/scripts/build-console-dev.js index 118b572f..3ea5e754 100644 --- a/scripts/build-console-dev.js +++ b/scripts/build-console-dev.js @@ -9,7 +9,7 @@ const CONSOLE_DIR = path.join(DEV_DIR, 'console') const BUILD_OUT = path.join(CONSOLE_DIR, 'build') const REPO = process.env.EDGEOPS_CONSOLE_REPO || 'https://github.com/Datasance/edgeops-console' -const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.3' +const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.4' const FLAVOR = process.env.EDGEOPS_CONSOLE_FLAVOR || 'datasance' function normalizeTag (version) { From b68093313fad59aa4fed85d3ccefc1e9ffc7f702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:36:51 +0300 Subject: [PATCH 02/32] Add unified transaction runner with SQLite priority queue and DB metrics. Introduce runInTransaction, AsyncLocalStorage context reuse, busy retry, vault afterCommit helper, and remove fakeTransaction from the decorator path. 
--- src/data/managers/base-manager.js | 83 ++--- src/data/providers/database-provider.js | 4 +- src/decorators/transaction-decorator.js | 104 +++---- src/helpers/db-busy-retry.js | 26 +- src/helpers/db-dialect.js | 150 +++++++++ src/helpers/db-metrics.js | 118 +++++++ src/helpers/sequelize-transaction.js | 15 + src/helpers/sqlite-fog-warning.js | 39 +++ src/helpers/transaction-runner.js | 240 +++++++++++++++ src/helpers/vault-transaction-helper.js | 85 ++++++ .../decorators/transaction-decorator.test.js | 75 +++++ test/src/helpers/db-busy-retry.test.js | 61 +++- test/src/helpers/db-dialect.test.js | 40 +++ test/src/helpers/db-metrics.test.js | 38 +++ test/src/helpers/sqlite-fog-warning.test.js | 69 +++++ test/src/helpers/transaction-runner.test.js | 288 ++++++++++++++++++ 16 files changed, 1304 insertions(+), 131 deletions(-) create mode 100644 src/helpers/db-dialect.js create mode 100644 src/helpers/db-metrics.js create mode 100644 src/helpers/sequelize-transaction.js create mode 100644 src/helpers/sqlite-fog-warning.js create mode 100644 src/helpers/transaction-runner.js create mode 100644 src/helpers/vault-transaction-helper.js create mode 100644 test/src/decorators/transaction-decorator.test.js create mode 100644 test/src/helpers/db-dialect.test.js create mode 100644 test/src/helpers/db-metrics.test.js create mode 100644 test/src/helpers/sqlite-fog-warning.test.js create mode 100644 test/src/helpers/transaction-runner.test.js diff --git a/src/data/managers/base-manager.js b/src/data/managers/base-manager.js index f7e7f325..2f3660e2 100644 --- a/src/data/managers/base-manager.js +++ b/src/data/managers/base-manager.js @@ -1,7 +1,6 @@ const AppHelper = require('../../helpers/app-helper') const Errors = require('../../helpers/errors') -// TODO [when transactions concurrency issue fixed]: Transactions should be used always module.exports = class BaseManager { getEntity () { throw new Error('Not implemented getEntity method in manager') @@ -12,24 +11,20 @@ 
module.exports = class BaseManager { object = object || {} - const options = transaction.fakeTransaction - ? { - where: object - } - : { - where: object, - transaction - } - - return this.getEntity().findAll(options) + return this.getEntity().findAll({ + where: object, + transaction + }) } findAllWithAttributes (where, attributes, transaction) { + AppHelper.checkTransaction(transaction) + return this.getEntity().findAll({ where, - attributes - }, - { transaction }) + attributes, + transaction + }) } async findOne (object, transaction) { @@ -37,36 +32,22 @@ module.exports = class BaseManager { object = object || {} - const options = transaction.fakeTransaction - ? { - where: object - } - : { - where: object, - transaction - } - - return this.getEntity().findOne(options) + return this.getEntity().findOne({ + where: object, + transaction + }) } async create (object, transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? {} - : { transaction } - - return this.getEntity().create(object, options) + return this.getEntity().create(object, { transaction }) } async bulkCreate (arr, transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? {} - : { transaction } - - return this.getEntity().bulkCreate(arr, options) + return this.getEntity().bulkCreate(arr, { transaction }) } async delete (data, transaction) { @@ -74,16 +55,10 @@ module.exports = class BaseManager { data = data || {} - const options = transaction.fakeTransaction - ? { - where: data - } - : { - where: data, - transaction - } - - return this.getEntity().destroy(options) + return this.getEntity().destroy({ + where: data, + transaction + }) } async update (whereData, newData, transaction) { @@ -91,26 +66,16 @@ module.exports = class BaseManager { whereData = whereData || {} - const options = transaction.fakeTransaction - ? 
{ - where: whereData - } - : { - where: whereData, - transaction - } - - return this.getEntity().update(newData, options) + return this.getEntity().update(newData, { + where: whereData, + transaction + }) } async upsert (data, transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? {} - : { transaction } - - return this.getEntity().upsert(data, options) + return this.getEntity().upsert(data, { transaction }) } async updateOrCreate (whereData, data, transaction) { diff --git a/src/data/providers/database-provider.js b/src/data/providers/database-provider.js index 95754346..331f60ea 100644 --- a/src/data/providers/database-provider.js +++ b/src/data/providers/database-provider.js @@ -206,8 +206,8 @@ class DatabaseProvider { id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, migration_version VARCHAR(255) NOT NULL, seeder_version VARCHAR(255), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP ) ` break diff --git a/src/decorators/transaction-decorator.js b/src/decorators/transaction-decorator.js index 3c96a2f6..ee986d9f 100644 --- a/src/decorators/transaction-decorator.js +++ b/src/decorators/transaction-decorator.js @@ -1,85 +1,53 @@ -const cq = require('concurrent-queue') -const Transaction = require('sequelize/lib/transaction') - const { isTest } = require('../helpers/app-helper') -const { isSqliteBusyError } = require('../helpers/db-busy-retry') - -const transactionsQueue = cq() - .limit({ concurrency: 1 }) - .process((task, cb) => { - task.transaction - .apply(task.that, task.args) - .then((res) => cb(null, res)) - .catch((err) => cb(err, null)) - }) - -function transaction (f) { - const fakeTransactionObject = { fakeTransaction: true } - return function (...fArgs) { - if (isTest()) { - return f.apply(this, fArgs) - } - - if (fArgs.length > 0 && fArgs[fArgs.length - 
1] instanceof Transaction) { - fArgs[fArgs.length - 1] = fakeTransactionObject - return f.apply(this, fArgs) - } else { - fArgs.push(fakeTransactionObject) - return f.apply(this, fArgs) - } - } +const { isSequelizeTransaction } = require('../helpers/sequelize-transaction') +const { + runInTransaction, + runWithTransactionContext, + PRIORITY_INTERACTIVE, + getActiveTransactionContext +} = require('../helpers/transaction-runner') + +function hasTransactionArg (args) { + return findTransactionArg(args) != null } -function queueTransaction (resolve, reject, transaction, that, retries, ...args) { - const task = { - transaction, - that, - retries, - args - } - - transactionsQueue(task, (error, success) => { - if (error === null) { - return resolve(success) - } - - if (retries < 1 || !isSqliteBusyError(error)) { - return reject(error) +function findTransactionArg (args) { + for (let i = args.length - 1; i >= 0; i--) { + if (isSequelizeTransaction(args[i])) { + return args[i] } - - queueTransaction(resolve, reject, transaction, that, retries - 1, ...args) - }) -} - -function applyTransaction (resolve, reject, transaction, that, ...args) { - transaction.apply(that, args) - .then(resolve) - .catch((error) => { - if (!isSqliteBusyError(error)) { - return reject(error) - } - - queueTransaction(resolve, reject, transaction, this, 5, ...args) - }) + } + return null } /** * @param {Function} f - Async function that accepts (..., transaction) as last argument - * @param {{ bypassQueue?: boolean }} [options] - If bypassQueue is true, run without enqueueing (so the call does not wait behind long-running queued transactions, e.g. 
NATS reconcile) + * @param {{ priority?: string, label?: string }} [options] */ function generateTransaction (f, options = {}) { - const { bypassQueue = false } = options - const t = transaction(f) + const priority = options.priority || PRIORITY_INTERACTIVE + const label = options.label || f.name || 'generateTransaction' + return function (...args) { if (isTest()) { - return t.apply(this, args) + return f.apply(this, args) } - if (bypassQueue) { - return Promise.resolve().then(() => t.apply(this, args)) + + if (hasTransactionArg(args)) { + const tx = findTransactionArg(args) + return runWithTransactionContext(tx, priority, () => f.apply(this, args)) } - return new Promise((resolve, reject) => { - applyTransaction(resolve, reject, t, this, ...args) - }) + + const parentCtx = getActiveTransactionContext() + if (parentCtx?.transaction) { + return runWithTransactionContext(parentCtx.transaction, parentCtx.priority, () => + f.apply(this, [...args, parentCtx.transaction])) + } + + return runInTransaction( + (transaction) => f.apply(this, [...args, transaction]), + { priority, label } + ) } } diff --git a/src/helpers/db-busy-retry.js b/src/helpers/db-busy-retry.js index 7ce06dd4..71b3a2e9 100644 --- a/src/helpers/db-busy-retry.js +++ b/src/helpers/db-busy-retry.js @@ -1,4 +1,9 @@ +const config = require('../config') +const dbMetrics = require('./db-metrics') + const DEFAULT_MAX_RETRIES = 5 +const CONFIG_DEFAULT_MAX_RETRIES = 8 +const CONFIG_DEFAULT_BASE_MS = 25 function isSqliteBusyError (error) { if (!error) { @@ -12,12 +17,25 @@ function isSqliteBusyError (error) { return messages.some((message) => message && message.indexOf('SQLITE_BUSY') !== -1) } +function sleep (ms) { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +function getRetryDefaults () { + return { + maxRetries: config.get('settings.dbBusyRetryMaxAttempts', CONFIG_DEFAULT_MAX_RETRIES), + baseMs: config.get('settings.dbBusyRetryBaseMs', CONFIG_DEFAULT_BASE_MS) + } +} + /** * Retry an async DB 
operation when SQLite reports SQLITE_BUSY (same semantics as TransactionDecorator queue retries). * No-op for non-SQLITE_BUSY errors. Safe on mysql/postgres — busy errors never match. */ async function withDbBusyRetry (fn, options = {}) { - const maxRetries = options.maxRetries != null ? options.maxRetries : DEFAULT_MAX_RETRIES + const defaults = getRetryDefaults() + const maxRetries = options.maxRetries != null ? options.maxRetries : defaults.maxRetries + const baseMs = options.baseMs != null ? options.baseMs : defaults.baseMs let attempt = 0 while (true) { @@ -28,12 +46,18 @@ async function withDbBusyRetry (fn, options = {}) { throw error } attempt++ + dbMetrics.recordBusyRetry(options.label) + const delayMs = baseMs * Math.pow(2, attempt - 1) + Math.floor(Math.random() * baseMs) + await sleep(delayMs) } } } module.exports = { + CONFIG_DEFAULT_BASE_MS, + CONFIG_DEFAULT_MAX_RETRIES, DEFAULT_MAX_RETRIES, + getRetryDefaults, isSqliteBusyError, withDbBusyRetry } diff --git a/src/helpers/db-dialect.js b/src/helpers/db-dialect.js new file mode 100644 index 00000000..9ca6e1e7 --- /dev/null +++ b/src/helpers/db-dialect.js @@ -0,0 +1,150 @@ +const databaseProvider = require('../data/providers/database-factory') + +function getDbDialect () { + return databaseProvider.sequelize.getDialect() +} + +function supportsSkipLocked () { + const dialect = getDbDialect() + return dialect === 'mysql' || dialect === 'postgres' +} + +function quoteTableName (tableName, dialect = getDbDialect()) { + return dialect === 'postgres' ? `"${tableName}"` : `\`${tableName}\`` +} + +/** + * Claim the next eligible reconcile task row. + * sqlite: find + conditional update (serialized by global write queue). + * mysql/postgres: SELECT … FOR UPDATE SKIP LOCKED, then leader update in same tx. 
+ */ +function buildLeaderOrStaleCondition (staleThreshold, deleteStaleThreshold) { + const { Op } = require('sequelize') + const conditions = [ + { leaderUuid: null }, + { claimedAt: { [Op.lt]: staleThreshold } } + ] + if (deleteStaleThreshold) { + conditions.push({ + [Op.and]: [ + { reason: 'delete' }, + { claimedAt: { [Op.lt]: deleteStaleThreshold } } + ] + }) + } + return { [Op.or]: conditions } +} + +function buildLeaderOrStaleSql (deleteStaleThreshold) { + if (!deleteStaleThreshold) { + return '(leader_uuid IS NULL OR claimed_at < :staleThreshold)' + } + return '(leader_uuid IS NULL OR claimed_at < :staleThreshold OR (reason = \'delete\' AND claimed_at < :deleteStaleThreshold))' +} + +async function claimNextReconcileTask ({ + Entity, + controllerUuid, + staleThreshold, + deleteStaleThreshold = null, + now, + activeStatuses, + includeNextAttemptFilter = true, + selectSql, + reloadTask +}) { + const { runInTransaction, PRIORITY_BACKGROUND } = require('./transaction-runner') + const { Op } = require('sequelize') + const sequelize = databaseProvider.sequelize + + const leaderOrStale = buildLeaderOrStaleCondition(staleThreshold, deleteStaleThreshold) + + const baseWhere = { + status: { [Op.in]: activeStatuses }, + [Op.and]: [leaderOrStale] + } + + if (includeNextAttemptFilter) { + baseWhere[Op.or] = [ + { nextAttemptAt: null }, + { nextAttemptAt: { [Op.lte]: now } } + ] + } + + return runInTransaction(async (transaction) => { + let task + + if (supportsSkipLocked()) { + const dialect = getDbDialect() + const tableName = Entity.getTableName() + const quotedTable = quoteTableName(tableName, dialect) + + const nextAttemptClause = includeNextAttemptFilter + ? 
'AND (next_attempt_at IS NULL OR next_attempt_at <= :now)' + : '' + + const leaderOrStaleSql = buildLeaderOrStaleSql(deleteStaleThreshold) + const rows = await sequelize.query( + `${selectSql.replace(':table', quotedTable)} + WHERE status IN (:activeStatuses) + ${nextAttemptClause} + AND ${leaderOrStaleSql} + ORDER BY id ASC + LIMIT 1 + FOR UPDATE SKIP LOCKED`, + { + replacements: { + activeStatuses, + now, + staleThreshold, + deleteStaleThreshold + }, + type: sequelize.QueryTypes.SELECT, + transaction + } + ) + + if (!rows.length) { + return null + } + task = Entity.build(rows[0], { isNewRecord: false }) + } else { + task = await Entity.findOne({ + where: baseWhere, + order: [['id', 'ASC']], + limit: 1, + transaction + }) + if (!task) { + return null + } + } + + const [affected] = await Entity.update( + { leaderUuid: controllerUuid, claimedAt: new Date(), status: 'in_progress' }, + { + where: { + id: task.id, + ...buildLeaderOrStaleCondition(staleThreshold, deleteStaleThreshold) + }, + transaction + } + ) + if (affected === 0) { + return null + } + + if (reloadTask) { + return reloadTask(task.id, transaction) + } + + return Entity.findOne({ where: { id: task.id }, transaction }) + }, { priority: PRIORITY_BACKGROUND, label: 'reconcileTask.claim' }) +} + +module.exports = { + getDbDialect, + supportsSkipLocked, + quoteTableName, + claimNextReconcileTask +} diff --git a/src/helpers/db-metrics.js b/src/helpers/db-metrics.js new file mode 100644 index 00000000..847e2ad6 --- /dev/null +++ b/src/helpers/db-metrics.js @@ -0,0 +1,118 @@ +const { metrics } = require('@opentelemetry/api') + +const METER_NAME = 'iofog-controller-db' +const METER_VERSION = '1.0.0' + +let meter = null +let transactionDuration = null +let writeQueueWaitMs = null +let busyRetries = null +let connectionInvalidated = null +let sqliteFogCountWarning = null +let writeQueueDepthInteractive = null + +function getMeter () { + if (!meter) { + meter = metrics.getMeter(METER_NAME, METER_VERSION) + } + 
return meter +} + +function isConnectionInvalidatedError (error) { + if (!error) { + return false + } + const messages = [ + error.message, + error.parent && error.parent.message, + error.original && error.original.message + ] + return messages.some((message) => message && ( + message.indexOf('cannot rollback') !== -1 || + message.indexOf('ConnectionManager.getConnection was called after') !== -1 || + message.indexOf('Connection terminated') !== -1 || + message.indexOf('Connection lost') !== -1 || + message.indexOf('ECONNRESET') !== -1 || + message.indexOf('ECONNREFUSED') !== -1 + )) +} + +/** + * Register DB OTEL instruments and optional Sequelize connection hooks. + * @param {import('sequelize').Sequelize} [sequelize] + * @param {string} provider + * @param {{ getWriteQueueDepth?: () => { interactive: number, background: number } }} [queueReader] + */ +function initDbMetrics (_sequelize, _provider, queueReader) { + const m = getMeter() + + transactionDuration = m.createHistogram('db.transaction.duration', { + description: 'Sequelize transaction duration', + unit: 'ms' + }) + writeQueueWaitMs = m.createHistogram('db.write_queue.wait_ms', { + description: 'Time spent waiting in the SQLite write queue before execution', + unit: 'ms' + }) + busyRetries = m.createCounter('db.busy_retries', { + description: 'SQLite SQLITE_BUSY retries during transaction execution' + }) + connectionInvalidated = m.createCounter('db.connection.invalidated', { + description: 'Database connection errors indicating pool or transaction invalidation' + }) + sqliteFogCountWarning = m.createCounter('db.sqlite.fog_count_warning', { + description: 'SQLite fleet size exceeded enterprise recommended threshold' + }) + + if (queueReader && typeof queueReader.getWriteQueueDepth === 'function') { + writeQueueDepthInteractive = m.createObservableGauge('db.write_queue.depth', { + description: 'Pending SQLite write queue depth by priority lane' + }) + writeQueueDepthInteractive.addCallback((result) => { + 
const depth = queueReader.getWriteQueueDepth() + result.observe(depth.interactive, { priority: 'interactive' }) + result.observe(depth.background, { priority: 'background' }) + }) + } +} + +function recordTransactionDuration (attributes, durationMs) { + if (durationMs >= 0) { + transactionDuration?.record(durationMs, attributes) + } +} + +function recordWriteQueueWaitMs (priority, waitMs) { + if (waitMs >= 0) { + writeQueueWaitMs?.record(waitMs, { priority }) + } +} + +function recordBusyRetry (label) { + busyRetries?.add(1, { label: label || 'unknown' }) +} + +function recordConnectionInvalidated (provider) { + connectionInvalidated?.add(1, { provider: provider || 'unknown' }) +} + +function recordSqliteFogCountWarning () { + sqliteFogCountWarning?.add(1) +} + +function maybeRecordConnectionInvalidated (error, provider) { + if (isConnectionInvalidatedError(error)) { + recordConnectionInvalidated(provider) + } +} + +module.exports = { + initDbMetrics, + isConnectionInvalidatedError, + maybeRecordConnectionInvalidated, + recordBusyRetry, + recordConnectionInvalidated, + recordSqliteFogCountWarning, + recordTransactionDuration, + recordWriteQueueWaitMs +} diff --git a/src/helpers/sequelize-transaction.js b/src/helpers/sequelize-transaction.js new file mode 100644 index 00000000..7cdbcaba --- /dev/null +++ b/src/helpers/sequelize-transaction.js @@ -0,0 +1,15 @@ +const Transaction = require('sequelize/lib/transaction') + +function isSequelizeTransaction (value) { + if (value == null || typeof value !== 'object') { + return false + } + if (value instanceof Transaction) { + return true + } + return typeof value.commit === 'function' && typeof value.rollback === 'function' +} + +module.exports = { + isSequelizeTransaction +} diff --git a/src/helpers/sqlite-fog-warning.js b/src/helpers/sqlite-fog-warning.js new file mode 100644 index 00000000..45c3714e --- /dev/null +++ b/src/helpers/sqlite-fog-warning.js @@ -0,0 +1,39 @@ +const config = require('../config') +const models 
= require('../data/models') +const logger = require('../logger') +const transactionRunner = require('./transaction-runner') +const { recordSqliteFogCountWarning } = require('./db-metrics') + +const DEFAULT_THRESHOLD = 50 + +function getThreshold () { + return config.get('settings.sqliteEnterpriseFogWarningThreshold', DEFAULT_THRESHOLD) +} + +/** + * Log a soft warning when sqlite fleet size exceeds the enterprise threshold (R124). + * Does not block API — observability and operator guidance only. + */ +async function checkSqliteFogCountWarning () { + if (!transactionRunner.isSqliteProvider()) { + return + } + + const threshold = getThreshold() + const fogCount = await models.Fog.count() + if (fogCount <= threshold) { + return + } + + recordSqliteFogCountWarning() + logger.warn( + `SQLite deployment has ${fogCount} fogs (threshold ${threshold}). ` + + 'For enterprise scale and multi-user load, migrate to mysql or postgres. ' + + 'See docs/operations/database-transactions.md.' + ) +} + +module.exports = { + checkSqliteFogCountWarning, + getThreshold +} diff --git a/src/helpers/transaction-runner.js b/src/helpers/transaction-runner.js new file mode 100644 index 00000000..9005c533 --- /dev/null +++ b/src/helpers/transaction-runner.js @@ -0,0 +1,240 @@ +const { AsyncLocalStorage } = require('async_hooks') +const databaseProvider = require('../data/providers/database-factory') +const config = require('../config') +const logger = require('../logger') +const { withDbBusyRetry } = require('./db-busy-retry') +const { + maybeRecordConnectionInvalidated, + recordTransactionDuration, + recordWriteQueueWaitMs +} = require('./db-metrics') + +const PRIORITY_INTERACTIVE = 'interactive' +const PRIORITY_BACKGROUND = 'background' + +const interactiveLane = [] +const backgroundLane = [] +let workerPromise = null +let queueDepthExceededLogged = false +const activeTransactionStore = new AsyncLocalStorage() + +const queueDepth = { + interactive: 0, + background: 0 +} + +function 
getProviderName () { + return process.env.DB_PROVIDER || config.get('database.provider', 'sqlite') || 'sqlite' +} + +function isSqliteProvider () { + return getProviderName() === 'sqlite' +} + +function getWriteQueueMaxDepth () { + return config.get('settings.dbWriteQueueMaxDepth', 256) +} + +function updateQueueDepth () { + queueDepth.interactive = interactiveLane.length + queueDepth.background = backgroundLane.length +} + +function getWriteQueueDepth () { + return { ...queueDepth } +} + +function checkQueueBackpressure () { + const depth = getWriteQueueDepth() + const totalDepth = depth.interactive + depth.background + const maxDepth = getWriteQueueMaxDepth() + if (totalDepth <= maxDepth) { + queueDepthExceededLogged = false + return + } + if (!queueDepthExceededLogged) { + queueDepthExceededLogged = true + logger.error( + `SQLite write queue depth ${totalDepth} exceeds configured maximum ${maxDepth} ` + + `(interactive=${depth.interactive}, background=${depth.background}). ` + + 'Investigate background job pressure or migrate to mysql/postgres. ' + + 'Interactive requests are not rejected; see docs/operations/database-transactions.md.' + ) + } +} + +function dequeueNext () { + if (interactiveLane.length > 0) { + return interactiveLane.shift() + } + if (backgroundLane.length > 0) { + return backgroundLane.shift() + } + return null +} + +function getActiveTransactionContext () { + return activeTransactionStore.getStore() || null +} + +function getActiveTransaction () { + const ctx = getActiveTransactionContext() + return ctx ? 
ctx.transaction : null +} + +async function executeTransaction (fn, options) { + const sequelize = databaseProvider.sequelize + const provider = getProviderName() + const startedAt = Date.now() + const priority = options.priority || PRIORITY_INTERACTIVE + + try { + const result = await withDbBusyRetry( + () => sequelize.transaction((transaction) => { + return activeTransactionStore.run({ transaction, priority }, async () => fn(transaction)) + }), + options + ) + recordTransactionDuration( + { + label: options.label || 'unknown', + priority: options.priority || PRIORITY_INTERACTIVE, + provider + }, + Date.now() - startedAt + ) + return result + } catch (error) { + maybeRecordConnectionInvalidated(error, provider) + throw error + } +} + +async function runWorker () { + while (true) { + updateQueueDepth() + checkQueueBackpressure() + const item = dequeueNext() + if (!item) { + break + } + + recordWriteQueueWaitMs(item.priority, Date.now() - item.enqueuedAt) + + try { + const result = await executeTransaction(item.fn, item.retryOptions) + item.resolve(result) + } catch (error) { + item.reject(error) + } + } + workerPromise = null +} + +function ensureWorker () { + if (!workerPromise) { + workerPromise = runWorker() + } + return workerPromise +} + +function enqueueSqlite (fn, options) { + return new Promise((resolve, reject) => { + const item = { + fn, + resolve, + reject, + priority: options.priority, + enqueuedAt: Date.now(), + retryOptions: { + label: options.label, + priority: options.priority + } + } + + if (options.priority === PRIORITY_BACKGROUND) { + backgroundLane.push(item) + } else { + interactiveLane.push(item) + } + + updateQueueDepth() + checkQueueBackpressure() + ensureWorker() + }) +} + +async function runInTransactionPool (fn, options) { + return executeTransaction(fn, { + label: options.label, + priority: options.priority || PRIORITY_INTERACTIVE + }) +} + +/** + * Run a callback inside a real Sequelize transaction. 
+ * SQLite: serialized through a global priority write queue (interactive before background). + * mysql/postgres: direct pool transaction, no global queue. + * + * On SQLite, interactive priority reuses an active ALS parent transaction when nested. + * Background priority always enqueues a fresh transaction so deferred work (e.g. audit + * events scheduled via setImmediate after a handler commit) cannot reuse a stale tx. + * + * @param {Function} fn - async (transaction) => result + * @param {{ priority?: string, label?: string }} [options] + */ +async function runInTransaction (fn, options = {}) { + const priority = options.priority || PRIORITY_INTERACTIVE + const label = options.label || 'unknown' + + if (isSqliteProvider()) { + const parentCtx = getActiveTransactionContext() + if (parentCtx && parentCtx.transaction && priority !== PRIORITY_BACKGROUND) { + return fn(parentCtx.transaction) + } + return enqueueSqlite(fn, { priority, label }) + } + + return runInTransactionPool(fn, { priority, label }) +} + +/** + * Run a callback with an existing Sequelize transaction registered in AsyncLocalStorage. + * Use whenever executing code that already holds a transaction outside executeTransaction's + * own ALS frame (e.g. generateTransaction explicit-tx and ALS-inject paths). + * + * @param {object} transaction - Sequelize transaction + * @param {string|undefined} priority - Lane priority; inherits from parent ALS when omitted + * @param {Function} fn - async (transaction) => result + */ +async function runWithTransactionContext (transaction, priority, fn) { + const parentCtx = getActiveTransactionContext() + const effectivePriority = priority ?? parentCtx?.priority ?? 
PRIORITY_INTERACTIVE + if (parentCtx?.transaction === transaction) { + return fn(transaction) + } + return activeTransactionStore.run({ transaction, priority: effectivePriority }, () => fn(transaction)) +} + +function _resetQueueForTests () { + interactiveLane.length = 0 + backgroundLane.length = 0 + workerPromise = null + queueDepth.interactive = 0 + queueDepth.background = 0 + queueDepthExceededLogged = false +} + +module.exports = { + PRIORITY_BACKGROUND, + PRIORITY_INTERACTIVE, + _resetQueueForTests, + getActiveTransaction, + getActiveTransactionContext, + getProviderName, + getWriteQueueDepth, + getWriteQueueMaxDepth, + isSqliteProvider, + runInTransaction, + runWithTransactionContext +} diff --git a/src/helpers/vault-transaction-helper.js b/src/helpers/vault-transaction-helper.js new file mode 100644 index 00000000..76b453e2 --- /dev/null +++ b/src/helpers/vault-transaction-helper.js @@ -0,0 +1,85 @@ +'use strict' + +const { isTest } = require('./app-helper') +const logger = require('../logger') +const SecretHelper = require('./secret-helper') +const vaultManager = require('../vault/vault-manager') +const { runInTransaction, PRIORITY_INTERACTIVE } = require('./transaction-runner') + +/** + * Run vault HTTP work after the Sequelize transaction commits (or immediately in tests). + * Failures are logged; they do not roll back the committed DB state. 
+ */ +function scheduleVaultAfterCommit (transaction, fn, label = 'vault') { + const run = () => Promise.resolve(fn()).catch((err) => { + logger.warn(`Deferred vault work (${label}) failed: ${err.message}`) + }) + + if (transaction && typeof transaction.afterCommit === 'function') { + transaction.afterCommit(run) + return + } + + if (isTest()) { + return run() + } +} + +function shouldDeferVaultStore (secretType, useVault) { + if (secretType === 'configmap' && useVault === false) { + return false + } + return vaultManager.isEnabled() +} + +function scheduleVaultDeleteAfterCommit (transaction, secretName, secretType, label) { + if (!vaultManager.isEnabled()) { + return + } + scheduleVaultAfterCommit( + transaction, + () => SecretHelper.deleteSecret(secretName, secretType), + label || `vault.delete.${secretName}` + ) +} + +/** + * After commit: store plaintext in vault and patch the DB row with the vault reference. + */ +function scheduleVaultPromoteAfterCommit (transaction, { + secretData, + secretName, + secretType, + useVault = null, + model, + where, + field +}, label) { + if (!shouldDeferVaultStore(secretType, useVault)) { + return + } + + const promoteLabel = label || `vault.promote.${secretName}` + scheduleVaultAfterCommit(transaction, async () => { + const Model = typeof model === 'function' ? 
model() : model + if (!Model) { + throw new Error('Model not available for vault promote') + } + const vaultRef = await SecretHelper.storeInVaultAndGetReference( + secretData, + secretName, + secretType, + useVault + ) + await runInTransaction(async (tx) => { + await Model.update({ [field]: vaultRef }, { where, transaction: tx }) + }, { priority: PRIORITY_INTERACTIVE, label: promoteLabel }) + }, promoteLabel) +} + +module.exports = { + scheduleVaultAfterCommit, + scheduleVaultDeleteAfterCommit, + scheduleVaultPromoteAfterCommit, + shouldDeferVaultStore +} diff --git a/test/src/decorators/transaction-decorator.test.js b/test/src/decorators/transaction-decorator.test.js new file mode 100644 index 00000000..113338ba --- /dev/null +++ b/test/src/decorators/transaction-decorator.test.js @@ -0,0 +1,75 @@ +'use strict' + +const { expect } = require('chai') +const sinon = require('sinon') +const Transaction = require('sequelize/lib/transaction') + +const TransactionDecorator = require('../../../src/decorators/transaction-decorator') +const transactionRunner = require('../../../src/helpers/transaction-runner') + +describe('transaction-decorator', () => { + const sandbox = sinon.createSandbox() + const parentTransaction = Object.assign(Object.create(Transaction.prototype), { + commit: () => {}, + rollback: () => {} + }) + let originalNodeEnv + + beforeEach(() => { + originalNodeEnv = process.env.NODE_ENV + process.env.NODE_ENV = 'production' + }) + + afterEach(() => { + process.env.NODE_ENV = originalNodeEnv + sandbox.restore() + }) + + it('passes through when transaction is not the last argument', async () => { + sandbox.stub(transactionRunner, 'runInTransaction').rejects(new Error('should not enqueue')) + + async function ensureOperator (transaction, options = {}) { + return { transaction, options } + } + + const wrapped = TransactionDecorator.generateTransaction(ensureOperator) + const result = await wrapped(parentTransaction, { triggerReconcile: false }) + + 
expect(result.transaction).to.equal(parentTransaction) + expect(result.options).to.deep.equal({ triggerReconcile: false }) + expect(transactionRunner.runInTransaction).to.not.have.been.called + }) + + it('nested runInTransaction reuses explicit parent transaction without decorator enqueue', async () => { + const runInTransactionSpy = sandbox.spy(transactionRunner, 'runInTransaction') + + async function innerUtil () { + return transactionRunner.runInTransaction((tx) => tx) + } + + async function outer (data, transaction) { + return innerUtil() + } + + const wrapped = TransactionDecorator.generateTransaction(outer) + const result = await wrapped({}, parentTransaction) + + expect(result).to.equal(parentTransaction) + expect(runInTransactionSpy).to.have.been.calledOnce + }) + + it('registers ALS so getActiveTransactionContext sees explicit parent tx', async () => { + let ctxInsideHandler + + async function outer (data, transaction) { + ctxInsideHandler = transactionRunner.getActiveTransactionContext() + return transaction + } + + const wrapped = TransactionDecorator.generateTransaction(outer) + await wrapped({}, parentTransaction) + + expect(ctxInsideHandler).to.not.equal(null) + expect(ctxInsideHandler.transaction).to.equal(parentTransaction) + }) +}) diff --git a/test/src/helpers/db-busy-retry.test.js b/test/src/helpers/db-busy-retry.test.js index d4dc16b9..20bd8921 100644 --- a/test/src/helpers/db-busy-retry.test.js +++ b/test/src/helpers/db-busy-retry.test.js @@ -3,11 +3,24 @@ const fs = require('fs') const os = require('os') const path = require('path') const Sequelize = require('sequelize') +const sinon = require('sinon') -const { isSqliteBusyError, withDbBusyRetry } = require('../../../src/helpers/db-busy-retry') +const { + CONFIG_DEFAULT_BASE_MS, + CONFIG_DEFAULT_MAX_RETRIES, + getRetryDefaults, + isSqliteBusyError, + withDbBusyRetry +} = require('../../../src/helpers/db-busy-retry') const { registerSqlitePragmas, applySqlitePragmas } = 
require('../../../src/helpers/sqlite-pragmas') describe('db-busy-retry', () => { + const sandbox = sinon.createSandbox() + + afterEach(() => { + sandbox.restore() + }) + it('detects SQLITE_BUSY on nested Sequelize errors', () => { const error = { message: 'SQLITE_BUSY: database is locked', @@ -48,6 +61,35 @@ describe('db-busy-retry', () => { } }) + it('reads retry defaults from config', () => { + const defaults = getRetryDefaults() + expect(defaults.maxRetries).to.equal(CONFIG_DEFAULT_MAX_RETRIES) + expect(defaults.baseMs).to.equal(CONFIG_DEFAULT_BASE_MS) + }) + + it('waits with exponential backoff between busy retries', async () => { + const clock = sinon.useFakeTimers({ shouldAdvanceTime: true }) + let attempts = 0 + + try { + const promise = withDbBusyRetry(async () => { + attempts++ + throw new Error('SQLITE_BUSY: database is locked') + }, { maxRetries: 2, baseMs: 10 }) + + await clock.tickAsync(0) + await clock.tickAsync(10) + await clock.tickAsync(20) + await promise + throw new Error('expected throw') + } catch (error) { + expect(error.message).to.contain('SQLITE_BUSY') + expect(attempts).to.equal(3) + } finally { + clock.restore() + } + }) + it('exhausts retries and rethrows the last busy error', async () => { let attempts = 0 try { @@ -61,6 +103,23 @@ describe('db-busy-retry', () => { expect(attempts).to.equal(3) } }) + + it('records busy retry metric on SQLITE_BUSY', async () => { + const dbMetrics = require('../../../src/helpers/db-metrics') + const recordSpy = sandbox.spy(dbMetrics, 'recordBusyRetry') + + let attempts = 0 + await withDbBusyRetry(async () => { + attempts++ + if (attempts < 2) { + throw new Error('SQLITE_BUSY: database is locked') + } + return 'ok' + }, { label: 'test.busy', maxRetries: 2, baseMs: 1 }) + + expect(recordSpy.calledOnce).to.equal(true) + expect(recordSpy.firstCall.args[0]).to.equal('test.busy') + }) }) describe('sqlite lock contention regression', () => { diff --git a/test/src/helpers/db-dialect.test.js 
b/test/src/helpers/db-dialect.test.js new file mode 100644 index 00000000..0221b3a7 --- /dev/null +++ b/test/src/helpers/db-dialect.test.js @@ -0,0 +1,40 @@ +'use strict' + +const { expect } = require('chai') +const sinon = require('sinon') + +const dbDialect = require('../../../src/helpers/db-dialect') +const databaseProvider = require('../../../src/data/providers/database-factory') + +describe('db-dialect', () => { + const sandbox = sinon.createSandbox() + + afterEach(() => { + sandbox.restore() + }) + + describe('supportsSkipLocked', () => { + it('returns false for sqlite', () => { + sandbox.stub(databaseProvider.sequelize, 'getDialect').returns('sqlite') + expect(dbDialect.supportsSkipLocked()).to.equal(false) + }) + + it('returns true for mysql and postgres', () => { + const getDialect = sandbox.stub(databaseProvider.sequelize, 'getDialect') + getDialect.returns('mysql') + expect(dbDialect.supportsSkipLocked()).to.equal(true) + + getDialect.returns('postgres') + expect(dbDialect.supportsSkipLocked()).to.equal(true) + }) + }) + + describe('quoteTableName', () => { + it('quotes for postgres and mysql', () => { + expect(dbDialect.quoteTableName('FogPlatformReconcileTasks', 'postgres')) + .to.equal('"FogPlatformReconcileTasks"') + expect(dbDialect.quoteTableName('FogPlatformReconcileTasks', 'mysql')) + .to.equal('`FogPlatformReconcileTasks`') + }) + }) +}) diff --git a/test/src/helpers/db-metrics.test.js b/test/src/helpers/db-metrics.test.js new file mode 100644 index 00000000..99b2cfc3 --- /dev/null +++ b/test/src/helpers/db-metrics.test.js @@ -0,0 +1,38 @@ +const { expect } = require('chai') + +const { + isConnectionInvalidatedError, + maybeRecordConnectionInvalidated, + recordBusyRetry, + recordSqliteFogCountWarning, + recordTransactionDuration, + recordWriteQueueWaitMs +} = require('../../../src/helpers/db-metrics') + +describe('db-metrics', () => { + it('detects connection invalidation error patterns', () => { + expect(isConnectionInvalidatedError(new 
Error('cannot rollback - no transaction is active'))).to.equal(true) + expect(isConnectionInvalidatedError(new Error('Connection terminated unexpectedly'))).to.equal(true) + expect(isConnectionInvalidatedError(new Error('SQLITE_BUSY: database is locked'))).to.equal(false) + }) + + it('records transaction duration without throwing when OTEL is not initialized', () => { + expect(() => recordTransactionDuration({ label: 'test', priority: 'interactive', provider: 'sqlite' }, 12)).to.not.throw() + }) + + it('records queue wait without throwing when OTEL is not initialized', () => { + expect(() => recordWriteQueueWaitMs('background', 5)).to.not.throw() + }) + + it('records busy retry without throwing when OTEL is not initialized', () => { + expect(() => recordBusyRetry('agent.updateStatus')).to.not.throw() + }) + + it('records fog count warning without throwing when OTEL is not initialized', () => { + expect(() => recordSqliteFogCountWarning()).to.not.throw() + }) + + it('maybeRecordConnectionInvalidated is a no-op for unrelated errors', () => { + expect(() => maybeRecordConnectionInvalidated(new Error('constraint violation'), 'sqlite')).to.not.throw() + }) +}) diff --git a/test/src/helpers/sqlite-fog-warning.test.js b/test/src/helpers/sqlite-fog-warning.test.js new file mode 100644 index 00000000..d0d631d3 --- /dev/null +++ b/test/src/helpers/sqlite-fog-warning.test.js @@ -0,0 +1,69 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +describe('sqlite-fog-warning', () => { + const sandbox = sinon.createSandbox() + let models + let config + let logger + let dbMetrics + let transactionRunner + + beforeEach(() => { + models = require('../../../src/data/models') + models.Fog = { count: sandbox.stub() } + config = require('../../../src/config') + logger = require('../../../src/logger') + dbMetrics = require('../../../src/helpers/db-metrics') + transactionRunner = require('../../../src/helpers/transaction-runner') + + sandbox.stub(config, 
'get').callsFake((key, defaultValue) => { + if (key === 'settings.sqliteEnterpriseFogWarningThreshold') { + return 50 + } + if (key === 'database.provider') { + return 'sqlite' + } + return defaultValue + }) + sandbox.stub(transactionRunner, 'isSqliteProvider').returns(true) + sandbox.stub(logger, 'warn') + sandbox.stub(dbMetrics, 'recordSqliteFogCountWarning') + }) + + afterEach(() => { + sandbox.restore() + }) + + it('logs warning and records metric when fog count exceeds threshold', async () => { + models.Fog.count.resolves(75) + const { checkSqliteFogCountWarning } = require('../../../src/helpers/sqlite-fog-warning') + + await checkSqliteFogCountWarning() + + expect(dbMetrics.recordSqliteFogCountWarning.calledOnce).to.equal(true) + expect(logger.warn.calledOnce).to.equal(true) + expect(logger.warn.firstCall.args[0]).to.contain('75 fogs') + expect(logger.warn.firstCall.args[0]).to.contain('mysql or postgres') + }) + + it('does nothing when fog count is at or below threshold', async () => { + models.Fog.count.resolves(50) + const { checkSqliteFogCountWarning } = require('../../../src/helpers/sqlite-fog-warning') + + await checkSqliteFogCountWarning() + + expect(dbMetrics.recordSqliteFogCountWarning.called).to.equal(false) + expect(logger.warn.called).to.equal(false) + }) + + it('skips check for non-sqlite providers', async () => { + transactionRunner.isSqliteProvider.returns(false) + models.Fog.count.resolves(100) + const { checkSqliteFogCountWarning } = require('../../../src/helpers/sqlite-fog-warning') + + await checkSqliteFogCountWarning() + + expect(models.Fog.count.called).to.equal(false) + }) +}) diff --git a/test/src/helpers/transaction-runner.test.js b/test/src/helpers/transaction-runner.test.js new file mode 100644 index 00000000..7006f9c5 --- /dev/null +++ b/test/src/helpers/transaction-runner.test.js @@ -0,0 +1,288 @@ +const { expect } = require('chai') +const fs = require('fs') +const os = require('os') +const path = require('path') +const Sequelize = 
require('sequelize') +const Transaction = require('sequelize/lib/transaction') +const sinon = require('sinon') + +const databaseProvider = require('../../../src/data/providers/database-factory') +const { + PRIORITY_BACKGROUND, + PRIORITY_INTERACTIVE, + _resetQueueForTests, + getActiveTransactionContext, + getWriteQueueDepth, + runInTransaction, + runWithTransactionContext +} = require('../../../src/helpers/transaction-runner') +const { registerSqlitePragmas, applySqlitePragmas } = require('../../../src/helpers/sqlite-pragmas') + +describe('transaction-runner', () => { + const sandbox = sinon.createSandbox() + let originalDbProvider + let sequelize + let dbPath + + beforeEach(async () => { + originalDbProvider = process.env.DB_PROVIDER + delete process.env.DB_PROVIDER + _resetQueueForTests() + + dbPath = path.join(os.tmpdir(), `controller-tx-runner-${Date.now()}-${Math.random()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 10000 } + }) + registerSqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 10000, + synchronous: 'NORMAL' + }) + await sequelize.authenticate() + await applySqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 10000, + synchronous: 'NORMAL' + }) + await sequelize.query('CREATE TABLE tx_runner_test (id INTEGER PRIMARY KEY AUTOINCREMENT, label TEXT NOT NULL)') + + sandbox.stub(databaseProvider, 'sequelize').value(sequelize) + }) + + afterEach(async () => { + sandbox.restore() + _resetQueueForTests() + if (originalDbProvider === undefined) { + delete process.env.DB_PROVIDER + } else { + process.env.DB_PROVIDER = originalDbProvider + } + if (sequelize) { + await sequelize.close() + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore missing sidecar files */ } + } + }) + + it('exports priority constants', () => { + expect(PRIORITY_INTERACTIVE).to.equal('interactive') + 
expect(PRIORITY_BACKGROUND).to.equal('background') + }) + + it('passes a real Sequelize transaction to the callback on sqlite', async () => { + let seenTransaction + await runInTransaction(async (transaction) => { + seenTransaction = transaction + await sequelize.query('INSERT INTO tx_runner_test (label) VALUES (\'interactive\')', { transaction }) + }) + + expect(seenTransaction).to.be.instanceOf(Transaction) + const [rows] = await sequelize.query('SELECT label FROM tx_runner_test') + expect(rows).to.deep.equal([{ label: 'interactive' }]) + }) + + it('runs interactive tasks before queued background tasks on sqlite', async () => { + const order = [] + let releaseBg1 + + const bg1Gate = new Promise((resolve) => { + releaseBg1 = resolve + }) + + const bg1 = runInTransaction(async () => { + order.push('bg1-start') + await bg1Gate + order.push('bg1-end') + }, { priority: PRIORITY_BACKGROUND, label: 'bg1' }) + + await new Promise((resolve) => setTimeout(resolve, 20)) + + const bg2 = runInTransaction(async () => { + order.push('bg2') + }, { priority: PRIORITY_BACKGROUND, label: 'bg2' }) + + const interactive = runInTransaction(async () => { + order.push('interactive') + }, { priority: PRIORITY_INTERACTIVE, label: 'interactive' }) + + releaseBg1() + await Promise.all([bg1, interactive, bg2]) + + expect(order).to.deep.equal(['bg1-start', 'bg1-end', 'interactive', 'bg2']) + }) + + it('tracks sqlite queue depth by priority lane', async () => { + let release + const gate = new Promise((resolve) => { + release = resolve + }) + + const first = runInTransaction(async () => { + await gate + }, { priority: PRIORITY_BACKGROUND }) + + await new Promise((resolve) => setTimeout(resolve, 20)) + + runInTransaction(async () => {}, { priority: PRIORITY_INTERACTIVE }) + runInTransaction(async () => {}, { priority: PRIORITY_BACKGROUND }) + + expect(getWriteQueueDepth()).to.deep.equal({ interactive: 1, background: 1 }) + + release() + await first + }) + + it('retries on SQLITE_BUSY inside the 
sqlite queue worker', async () => { + const originalTransaction = sequelize.transaction.bind(sequelize) + let attempts = 0 + + sequelize.transaction = async (fn) => { + attempts++ + if (attempts === 1) { + throw new Error('SQLITE_BUSY: database is locked') + } + return originalTransaction(fn) + } + + await runInTransaction(async (transaction) => { + await sequelize.query('INSERT INTO tx_runner_test (label) VALUES (\'retried\')', { transaction }) + }) + + expect(attempts).to.be.at.least(2) + const [rows] = await sequelize.query('SELECT label FROM tx_runner_test') + expect(rows).to.deep.equal([{ label: 'retried' }]) + }) + + it('skips the global sqlite queue for mysql provider', async () => { + process.env.DB_PROVIDER = 'mysql' + + let tx1Running = false + let tx2StartedWhileTx1Running = false + const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms)) + + sequelize.transaction = sandbox.stub().callsFake(async (fn) => fn({})) + + await Promise.all([ + runInTransaction(async () => { + tx1Running = true + await delay(50) + tx1Running = false + }, { priority: PRIORITY_BACKGROUND }), + runInTransaction(async () => { + if (tx1Running) { + tx2StartedWhileTx1Running = true + } + }, { priority: PRIORITY_BACKGROUND }) + ]) + + expect(tx2StartedWhileTx1Running).to.equal(true) + }) + + it('reuses active sqlite transaction for nested interactive runInTransaction calls', async () => { + const order = [] + await runInTransaction(async (outerTx) => { + order.push('outer-start') + await runInTransaction(async (innerTx) => { + order.push('inner') + expect(innerTx).to.equal(outerTx) + }, { priority: PRIORITY_INTERACTIVE, label: 'nested-interactive' }) + order.push('outer-end') + }, { priority: PRIORITY_BACKGROUND, label: 'outer-background' }) + + expect(order).to.deep.equal(['outer-start', 'inner', 'outer-end']) + }) + + it('enqueues a fresh sqlite transaction for background runInTransaction after parent releases', async () => { + let outerTx + let innerTxPromise + await 
runInTransaction(async (transaction) => { + outerTx = transaction + innerTxPromise = runInTransaction(async (tx) => tx, { + priority: PRIORITY_BACKGROUND, + label: 'event.audit' + }) + }, { priority: PRIORITY_INTERACTIVE, label: 'handler' }) + + const innerTx = await innerTxPromise + expect(innerTx).to.be.instanceOf(Transaction) + expect(innerTx).to.not.equal(outerTx) + }) + + it('background runInTransaction ignores stale committed parent in ALS', async () => { + let committedTx + await runInTransaction(async (transaction) => { + committedTx = transaction + }, { priority: PRIORITY_INTERACTIVE, label: 'handler' }) + + await runWithTransactionContext(committedTx, PRIORITY_INTERACTIVE, async () => { + let auditTx + await runInTransaction(async (tx) => { + auditTx = tx + await sequelize.query('INSERT INTO tx_runner_test (label) VALUES (\'audit\')', { transaction: tx }) + }, { priority: PRIORITY_BACKGROUND, label: 'event.audit' }) + expect(auditTx).to.be.instanceOf(Transaction) + expect(auditTx).to.not.equal(committedTx) + }) + + const [rows] = await sequelize.query('SELECT label FROM tx_runner_test') + expect(rows).to.deep.equal([{ label: 'audit' }]) + }) + + it('skips the global sqlite queue for postgres provider', async () => { + process.env.DB_PROVIDER = 'postgres' + + let tx1Running = false + let tx2StartedWhileTx1Running = false + const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms)) + + sequelize.transaction = sandbox.stub().callsFake(async (fn) => fn({})) + + await Promise.all([ + runInTransaction(async () => { + tx1Running = true + await delay(50) + tx1Running = false + }), + runInTransaction(async () => { + if (tx1Running) { + tx2StartedWhileTx1Running = true + } + }) + ]) + + expect(tx2StartedWhileTx1Running).to.equal(true) + }) + + it('runWithTransactionContext skips duplicate ALS frame when same tx active', async () => { + const tx = { commit: () => {}, rollback: () => {} } + let nestedCtx + + await runWithTransactionContext(tx, 
PRIORITY_INTERACTIVE, async () => { + await runWithTransactionContext(tx, null, async () => { + nestedCtx = getActiveTransactionContext() + }) + }) + + expect(nestedCtx.transaction).to.equal(tx) + expect(nestedCtx.priority).to.equal(PRIORITY_INTERACTIVE) + }) + + it('runWithTransactionContext inherits priority from parent when omitted', async () => { + const tx = { commit: () => {}, rollback: () => {} } + let nestedCtx + + await runWithTransactionContext(tx, PRIORITY_BACKGROUND, async () => { + await runWithTransactionContext(tx, null, async () => { + nestedCtx = getActiveTransactionContext() + }) + }) + + expect(nestedCtx.priority).to.equal(PRIORITY_BACKGROUND) + }) +}) From fb6926eabf48fe05008bb6db576065cc5e401b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:37:31 +0300 Subject: [PATCH 03/32] Fix greenfield migrations for cross-database compatibility. Use TIMESTAMPTZ on postgres, quote MySQL reserved columns, fix TEXT unique keys, add ReconcileOutbox table, and align Sequelize models with provider timezone handling. 
--- .../mysql/db_migration_mysql_v3.8.0.sql | 29 +- .../postgres/db_migration_pg_v3.8.0.sql | 284 +++++++++--------- .../sqlite/db_migration_sqlite_v3.8.0.sql | 18 +- src/data/models/configMap.js | 29 +- src/data/models/index.js | 25 +- src/data/models/microserviceExecSession.js | 2 +- src/data/models/reconcileOutbox.js | 54 ++++ src/data/models/registry.js | 29 +- src/data/models/secret.js | 25 +- src/data/providers/mysql.js | 7 + src/data/providers/postgres.js | 7 + src/data/stores/sequelize-session-store.js | 2 +- 12 files changed, 348 insertions(+), 163 deletions(-) create mode 100644 src/data/models/reconcileOutbox.js diff --git a/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql b/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql index 23a5f2ea..bf90dff0 100644 --- a/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql +++ b/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql @@ -120,7 +120,7 @@ CREATE TABLE IF NOT EXISTS Fogs ( deployment_type VARCHAR(36), active_volume_mounts BIGINT DEFAULT 0, volume_mount_last_update BIGINT DEFAULT 0, - warning_message TEXT DEFAULT 'HEALTHY', + warning_message TEXT, gps_device VARCHAR(36), gps_scan_frequency INT DEFAULT 60, edge_guard_frequency INT DEFAULT 0, @@ -646,7 +646,7 @@ CREATE INDEX idx_microservice_exec_status_microservice_uuid ON MicroserviceExecS CREATE TABLE IF NOT EXISTS MicroserviceHealthChecks ( id INT AUTO_INCREMENT PRIMARY KEY NOT NULL, test TEXT, - interval BIGINT, + `interval` BIGINT, timeout BIGINT, start_period BIGINT, start_interval BIGINT, @@ -707,7 +707,7 @@ CREATE INDEX idx_microservice_log_status_session_id ON MicroserviceLogStatuses ( CREATE TABLE IF NOT EXISTS MicroserviceExecSessions ( id INT AUTO_INCREMENT PRIMARY KEY NOT NULL, microservice_uuid VARCHAR(36) NOT NULL, - session_id TEXT UNIQUE NOT NULL, + session_id VARCHAR(255) UNIQUE NOT NULL, status TEXT, user_connected BOOLEAN DEFAULT false, agent_connected BOOLEAN DEFAULT false, @@ -738,8 +738,8 @@ CREATE INDEX 
idx_fog_log_status_session_id ON FogLogStatuses (session_id); CREATE TABLE IF NOT EXISTS RbacRoles ( id INT AUTO_INCREMENT PRIMARY KEY, - name TEXT UNIQUE NOT NULL, - kind TEXT DEFAULT 'Role', + name TEXT NOT NULL, + kind TEXT, created_at DATETIME, updated_at DATETIME, UNIQUE KEY unique_name (name(255)) @@ -759,8 +759,8 @@ CREATE TABLE IF NOT EXISTS RbacRoleRules ( CREATE TABLE IF NOT EXISTS RbacRoleBindings ( id INT AUTO_INCREMENT PRIMARY KEY, - name TEXT UNIQUE NOT NULL, - kind TEXT DEFAULT 'RoleBinding', + name TEXT NOT NULL, + kind TEXT, role_ref TEXT NOT NULL, subjects TEXT NOT NULL, created_at DATETIME, @@ -800,7 +800,7 @@ CREATE INDEX idx_rbac_role_bindings_role_id ON RbacRoleBindings (role_id); CREATE INDEX idx_rbac_service_accounts_role_id ON RbacServiceAccounts (role_id); CREATE UNIQUE INDEX idx_rbac_service_accounts_microservice_uuid_unique ON RbacServiceAccounts (microservice_uuid); -CREATE UNIQUE INDEX idx_rbac_service_accounts_application_id_name_unique ON RbacServiceAccounts (application_id, name); +CREATE UNIQUE INDEX idx_rbac_service_accounts_application_id_name_unique ON RbacServiceAccounts (application_id, name(255)); CREATE TABLE IF NOT EXISTS ClusterControllers ( uuid VARCHAR(36) PRIMARY KEY NOT NULL, @@ -1214,6 +1214,19 @@ CREATE INDEX idx_service_platform_reconcile_tasks_name_status ON ServicePlatform CREATE INDEX idx_service_platform_reconcile_tasks_status_claimed ON ServicePlatformReconcileTasks (status, claimed_at); CREATE INDEX idx_service_platform_reconcile_tasks_next_attempt ON ServicePlatformReconcileTasks (next_attempt_at); +CREATE TABLE IF NOT EXISTS ReconcileOutbox ( + id INT AUTO_INCREMENT PRIMARY KEY, + kind VARCHAR(32) NOT NULL, + payload TEXT NOT NULL, + idempotency_key VARCHAR(255) NOT NULL, + created_at DATETIME, + processed_at DATETIME, + last_error TEXT, + UNIQUE KEY uk_reconcile_outbox_idempotency_key (idempotency_key) +); + +CREATE INDEX idx_reconcile_outbox_unprocessed ON ReconcileOutbox (processed_at, id); + CREATE TABLE 
IF NOT EXISTS HubRouterConfigLocks ( id INT PRIMARY KEY, leader_uuid VARCHAR(36), diff --git a/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql b/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql index 19714e68..fdc07499 100644 --- a/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql +++ b/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql @@ -4,8 +4,8 @@ CREATE TABLE IF NOT EXISTS "Applications" ( description VARCHAR(255) DEFAULT '', is_activated BOOLEAN DEFAULT false, is_system BOOLEAN DEFAULT false, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), nats_access BOOLEAN DEFAULT false, nats_rule_id INTEGER ); @@ -111,14 +111,14 @@ CREATE TABLE IF NOT EXISTS "Fogs" ( is_system BOOLEAN DEFAULT FALSE, router_id INT DEFAULT 0, time_zone VARCHAR(36) DEFAULT 'Etc/UTC', - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), arch_id INT DEFAULT 0, container_engine VARCHAR(36), deployment_type VARCHAR(36), active_volume_mounts BIGINT DEFAULT 0, volume_mount_last_update BIGINT DEFAULT 0, - warning_message TEXT DEFAULT 'HEALTHY', + warning_message TEXT, gps_device VARCHAR(36), gps_scan_frequency INT DEFAULT 60, edge_guard_frequency INT DEFAULT 0, @@ -176,8 +176,8 @@ CREATE INDEX idx_fog_version_commands_iofogUuid ON "FogVersionCommands" (iofog_u CREATE TABLE IF NOT EXISTS "HWInfos" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, info TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), iofog_uuid VARCHAR(36), FOREIGN KEY (iofog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -187,8 +187,8 @@ CREATE INDEX idx_hw_infos_iofogUuid ON "HWInfos" (iofog_uuid); CREATE TABLE IF NOT EXISTS "USBInfos" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, info TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at 
TIMESTAMPTZ(0), iofog_uuid VARCHAR(36), FOREIGN KEY (iofog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -219,8 +219,8 @@ CREATE TABLE IF NOT EXISTS "Microservices" ( log_size BIGINT DEFAULT 0, delete BOOLEAN DEFAULT false, delete_with_cleanup BOOLEAN DEFAULT false, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), catalog_item_id INT, registry_id INT DEFAULT 1, iofog_uuid VARCHAR(36), @@ -298,8 +298,8 @@ CREATE TABLE IF NOT EXISTS "MicroservicePorts" ( port_internal INT, port_external INT, is_udp BOOLEAN, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), microservice_uuid VARCHAR(36), FOREIGN KEY (microservice_uuid) REFERENCES "Microservices" (uuid) ON DELETE CASCADE ); @@ -317,8 +317,8 @@ CREATE TABLE IF NOT EXISTS "MicroserviceStatuses" ( percentage DOUBLE PRECISION DEFAULT 0.00, error_message TEXT, microservice_uuid VARCHAR(36), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), ip_address TEXT, exec_session_ids TEXT, health_status TEXT, @@ -383,8 +383,8 @@ CREATE TABLE IF NOT EXISTS "Routers" ( host TEXT, is_default BOOLEAN DEFAULT false, iofog_uuid VARCHAR(36), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (iofog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -394,8 +394,8 @@ CREATE TABLE "RouterConnections" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, source_router INT, dest_router INT, - created_at TIMESTAMP(0) NOT NULL, - updated_at TIMESTAMP(0) NOT NULL, + created_at TIMESTAMPTZ(0) NOT NULL, + updated_at TIMESTAMPTZ(0) NOT NULL, FOREIGN KEY (source_router) REFERENCES "Routers"(id) ON DELETE CASCADE, FOREIGN KEY (dest_router) REFERENCES "Routers"(id) ON DELETE CASCADE ); @@ -407,8 +407,8 @@ CREATE TABLE IF NOT EXISTS "Config" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY 
NOT NULL, key VARCHAR(255) NOT NULL UNIQUE, value VARCHAR(255) NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE INDEX idx_config_key ON "Config" (key); @@ -435,8 +435,8 @@ CREATE TABLE IF NOT EXISTS "ApplicationTemplates" ( description VARCHAR(255) DEFAULT '', schema_version VARCHAR(255) DEFAULT '', application_json TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "ApplicationTemplateVariables" ( @@ -445,8 +445,8 @@ CREATE TABLE IF NOT EXISTS "ApplicationTemplateVariables" ( key TEXT, description VARCHAR(255) DEFAULT '', default_value VARCHAR(255), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (application_template_id) REFERENCES "ApplicationTemplates" (id) ON DELETE CASCADE ); @@ -504,8 +504,8 @@ CREATE TABLE IF NOT EXISTS "FogPublicKeys" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, public_key TEXT, iofog_uuid VARCHAR(36), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (iofog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -516,8 +516,8 @@ CREATE TABLE IF NOT EXISTS "FogUsedTokens" ( jti VARCHAR(255) NOT NULL, iofog_uuid VARCHAR(36), expiry_time BIGINT NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (iofog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -528,8 +528,8 @@ CREATE TABLE IF NOT EXISTS "Secrets" ( name VARCHAR(255) UNIQUE NOT NULL, type VARCHAR(50) NOT NULL CHECK (type IN ('Opaque', 'tls')), data TEXT NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE INDEX idx_secrets_name ON "Secrets" (name); @@ -541,12 +541,12 @@ CREATE TABLE IF NOT 
EXISTS "Certificates" ( is_ca BOOLEAN DEFAULT false, signed_by_id INT, hosts TEXT, - valid_from TIMESTAMP(0) NOT NULL, - valid_to TIMESTAMP(0) NOT NULL, + valid_from TIMESTAMPTZ(0) NOT NULL, + valid_to TIMESTAMPTZ(0) NOT NULL, serial_number TEXT NOT NULL, secret_id INT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (signed_by_id) REFERENCES "Certificates" (id) ON DELETE SET NULL, FOREIGN KEY (secret_id) REFERENCES "Secrets" (id) ON DELETE CASCADE ); @@ -568,8 +568,8 @@ CREATE TABLE IF NOT EXISTS "Services" ( bridge_port INTEGER, default_bridge TEXT, service_endpoint TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), provisioning_status VARCHAR(36) DEFAULT 'pending', provisioning_error TEXT ); @@ -581,8 +581,8 @@ CREATE TABLE IF NOT EXISTS "ServiceTags" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, service_id INTEGER NOT NULL, tag_id INTEGER NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (service_id) REFERENCES "Services" (id) ON DELETE CASCADE, FOREIGN KEY (tag_id) REFERENCES "Tags" (id) ON DELETE CASCADE ); @@ -595,8 +595,8 @@ CREATE TABLE IF NOT EXISTS "ConfigMaps" ( name VARCHAR(255) UNIQUE NOT NULL, immutable BOOLEAN DEFAULT false, data TEXT NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), use_vault BOOLEAN DEFAULT true ); @@ -608,8 +608,8 @@ CREATE TABLE IF NOT EXISTS "VolumeMounts" ( config_map_name VARCHAR(255), secret_name VARCHAR(255), version INT DEFAULT 1, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (config_map_name) REFERENCES "ConfigMaps" (name) ON DELETE CASCADE, FOREIGN KEY (secret_name) REFERENCES "Secrets" (name) ON DELETE CASCADE ); @@ -634,8 +634,8 @@ 
CREATE TABLE IF NOT EXISTS "MicroserviceExecStatuses" ( status VARCHAR(255) DEFAULT 'INACTIVE', exec_session_id VARCHAR(255), microservice_uuid VARCHAR(36), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (microservice_uuid) REFERENCES "Microservices" (uuid) ON DELETE CASCADE ); @@ -650,8 +650,8 @@ CREATE TABLE IF NOT EXISTS "MicroserviceHealthChecks" ( start_interval DOUBLE PRECISION, retries INT, microservice_uuid VARCHAR(36), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (microservice_uuid) REFERENCES "Microservices" (uuid) ON DELETE CASCADE ); @@ -672,8 +672,8 @@ CREATE TABLE IF NOT EXISTS "Events" ( status_code INT, status_message TEXT, request_id VARCHAR(255), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE INDEX idx_events_timestamp ON "Events" (timestamp); @@ -694,8 +694,8 @@ CREATE TABLE IF NOT EXISTS "MicroserviceLogStatuses" ( tail_config TEXT, agent_connected BOOLEAN DEFAULT false, user_connected BOOLEAN DEFAULT false, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (microservice_uuid) REFERENCES "Microservices" (uuid) ON DELETE CASCADE ); @@ -709,8 +709,8 @@ CREATE TABLE IF NOT EXISTS "MicroserviceExecSessions" ( status TEXT, user_connected BOOLEAN DEFAULT false, agent_connected BOOLEAN DEFAULT false, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (microservice_uuid) REFERENCES "Microservices" (uuid) ON DELETE CASCADE ); @@ -726,8 +726,8 @@ CREATE TABLE IF NOT EXISTS "FogLogStatuses" ( tail_config TEXT, agent_connected BOOLEAN DEFAULT false, user_connected BOOLEAN DEFAULT false, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at 
TIMESTAMPTZ(0), FOREIGN KEY (iofog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -737,9 +737,9 @@ CREATE INDEX idx_fog_log_status_session_id ON "FogLogStatuses" (session_id); CREATE TABLE IF NOT EXISTS "RbacRoles" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, name TEXT UNIQUE NOT NULL, - kind TEXT DEFAULT 'Role', - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + kind TEXT, + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "RbacRoleRules" ( @@ -749,19 +749,19 @@ CREATE TABLE IF NOT EXISTS "RbacRoleRules" ( resources TEXT NOT NULL, verbs TEXT NOT NULL, resource_names TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (role_id) REFERENCES "RbacRoles" (id) ON DELETE CASCADE ); CREATE TABLE IF NOT EXISTS "RbacRoleBindings" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, name TEXT UNIQUE NOT NULL, - kind TEXT DEFAULT 'RoleBinding', + kind TEXT, role_ref TEXT NOT NULL, subjects TEXT NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), role_id INTEGER ); @@ -772,8 +772,8 @@ CREATE TABLE IF NOT EXISTS "RbacServiceAccounts" ( role_id INT REFERENCES "RbacRoles" (id), microservice_uuid VARCHAR(36) REFERENCES "Microservices" (uuid) ON DELETE CASCADE, application_id INT REFERENCES "Applications" (id) ON DELETE SET NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE INDEX idx_rbac_role_rules_role_id ON "RbacRoleRules" (role_id); @@ -784,8 +784,8 @@ CREATE INDEX idx_rbac_service_accounts_name ON "RbacServiceAccounts" (name); CREATE TABLE IF NOT EXISTS "RbacCacheVersion" ( id INT PRIMARY KEY DEFAULT 1, version BIGINT NOT NULL DEFAULT 1, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), CONSTRAINT single_row CHECK (id = 1) ); @@ -799,10 +799,10 
@@ CREATE TABLE IF NOT EXISTS "ClusterControllers" ( uuid VARCHAR(36) PRIMARY KEY NOT NULL, host VARCHAR(255), process_id INT, - last_heartbeat TIMESTAMP(0), + last_heartbeat TIMESTAMPTZ(0), is_active BOOLEAN DEFAULT true, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE INDEX idx_cluster_controllers_uuid ON "ClusterControllers" (uuid); @@ -815,8 +815,8 @@ CREATE TABLE IF NOT EXISTS "NatsOperators" ( public_key TEXT NOT NULL, jwt TEXT NOT NULL, seed_secret_name TEXT NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "NatsAccounts" ( @@ -829,8 +829,8 @@ CREATE TABLE IF NOT EXISTS "NatsAccounts" ( is_leaf_system BOOLEAN DEFAULT false, operator_id INT NOT NULL, application_id INT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (operator_id) REFERENCES "NatsOperators" (id) ON DELETE CASCADE, FOREIGN KEY (application_id) REFERENCES "Applications" (id) ON DELETE CASCADE ); @@ -844,8 +844,8 @@ CREATE TABLE IF NOT EXISTS "NatsUsers" ( is_bearer BOOLEAN DEFAULT false, account_id INT NOT NULL, microservice_uuid VARCHAR(36), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), nats_user_rule_id INTEGER, FOREIGN KEY (account_id) REFERENCES "NatsAccounts" (id) ON DELETE CASCADE, FOREIGN KEY (microservice_uuid) REFERENCES "Microservices" (uuid) ON DELETE SET NULL @@ -867,8 +867,8 @@ CREATE TABLE IF NOT EXISTS "NatsInstances" ( cert_secret_name TEXT, js_storage_size TEXT, js_memory_store_size TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (iofog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -876,8 +876,8 @@ CREATE TABLE IF NOT EXISTS "NatsConnections" ( id INT GENERATED ALWAYS AS IDENTITY 
PRIMARY KEY NOT NULL, source_nats INT NOT NULL, dest_nats INT NOT NULL, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (source_nats) REFERENCES "NatsInstances" (id) ON DELETE CASCADE, FOREIGN KEY (dest_nats) REFERENCES "NatsInstances" (id) ON DELETE CASCADE ); @@ -891,9 +891,9 @@ CREATE TABLE IF NOT EXISTS "NatsReconcileTasks" ( fog_uuids TEXT, status VARCHAR(32) NOT NULL DEFAULT 'pending', leader_uuid VARCHAR(36), - claimed_at TIMESTAMP(0), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + claimed_at TIMESTAMPTZ(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "NatsAccountRules" ( @@ -928,8 +928,8 @@ CREATE TABLE IF NOT EXISTS "NatsAccountRules" ( pub_deny TEXT, sub_allow TEXT, sub_deny TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "NatsUserRules" ( @@ -952,8 +952,8 @@ CREATE TABLE IF NOT EXISTS "NatsUserRules" ( sub_allow TEXT, sub_deny TEXT, tags TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE UNIQUE INDEX idx_nats_accounts_application_id_unique ON "NatsAccounts" (application_id) WHERE application_id IS NOT NULL; @@ -984,10 +984,10 @@ CREATE TABLE IF NOT EXISTS "AuthUsers" ( must_change_password BOOLEAN DEFAULT false, is_bootstrap BOOLEAN DEFAULT false, failed_attempts INT DEFAULT 0, - locked_until TIMESTAMP(0), - deleted_at TIMESTAMP(0), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + locked_until TIMESTAMPTZ(0), + deleted_at TIMESTAMPTZ(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE INDEX idx_auth_users_email ON "AuthUsers" (email); @@ -998,15 +998,15 @@ CREATE TABLE IF NOT EXISTS "AuthGroups" ( name VARCHAR(255) NOT NULL UNIQUE, is_system BOOLEAN DEFAULT false, mfa_required BOOLEAN NOT NULL DEFAULT false, - created_at TIMESTAMP(0), 
- updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "AuthUserGroups" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, user_id VARCHAR(36) NOT NULL, group_id INT NOT NULL, - created_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), FOREIGN KEY (user_id) REFERENCES "AuthUsers" (id) ON DELETE CASCADE, FOREIGN KEY (group_id) REFERENCES "AuthGroups" (id) ON DELETE CASCADE, UNIQUE (user_id, group_id) @@ -1021,8 +1021,8 @@ CREATE TABLE IF NOT EXISTS "AuthMfa" ( totp_secret_encrypted TEXT, enabled BOOLEAN DEFAULT false, recovery_codes_hash TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (user_id) REFERENCES "AuthUsers" (id) ON DELETE CASCADE ); @@ -1031,8 +1031,8 @@ CREATE INDEX idx_auth_mfa_user_id ON "AuthMfa" (user_id); CREATE TABLE IF NOT EXISTS "AuthPasswordResetSessions" ( id VARCHAR(36) PRIMARY KEY NOT NULL, user_id VARCHAR(36) NOT NULL, - expires_at TIMESTAMP(0) NOT NULL, - created_at TIMESTAMP(0), + expires_at TIMESTAMPTZ(0) NOT NULL, + created_at TIMESTAMPTZ(0), FOREIGN KEY (user_id) REFERENCES "AuthUsers" (id) ON DELETE CASCADE ); @@ -1044,10 +1044,10 @@ CREATE TABLE IF NOT EXISTS "AuthRefreshTokens" ( token_hash VARCHAR(255) NOT NULL, user_id VARCHAR(36) NOT NULL, family_id VARCHAR(36) NOT NULL, - expires_at TIMESTAMP(0) NOT NULL, + expires_at TIMESTAMPTZ(0) NOT NULL, revoked BOOLEAN DEFAULT false, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (user_id) REFERENCES "AuthUsers" (id) ON DELETE CASCADE ); @@ -1062,8 +1062,8 @@ CREATE TABLE IF NOT EXISTS "AuthOidcKeys" ( key_material_encrypted TEXT, vault_ref TEXT, active BOOLEAN DEFAULT true, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE INDEX idx_auth_oidc_keys_active ON "AuthOidcKeys" (active); @@ -1073,8 +1073,8 @@ 
CREATE TABLE IF NOT EXISTS "AuthOidcClients" ( client_id VARCHAR(255) NOT NULL UNIQUE, secret_ref TEXT, client_type VARCHAR(32) NOT NULL DEFAULT 'confidential', - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "AuthOidcProviderStates" ( @@ -1082,14 +1082,14 @@ CREATE TABLE IF NOT EXISTS "AuthOidcProviderStates" ( model VARCHAR(64) NOT NULL, record_id VARCHAR(255) NOT NULL, payload TEXT NOT NULL, - expires_at TIMESTAMP(0), + expires_at TIMESTAMPTZ(0), grant_id VARCHAR(255), uid VARCHAR(255), user_code VARCHAR(255), consumed BOOLEAN DEFAULT false, - consumed_at TIMESTAMP(0), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + consumed_at TIMESTAMPTZ(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), UNIQUE (model, record_id) ); @@ -1101,9 +1101,9 @@ CREATE INDEX idx_auth_oidc_provider_states_expires_at ON "AuthOidcProviderStates CREATE TABLE IF NOT EXISTS "AuthBffSessions" ( sid VARCHAR(255) PRIMARY KEY NOT NULL, data TEXT NOT NULL, - expires_at TIMESTAMP NOT NULL, - created_at TIMESTAMP, - updated_at TIMESTAMP + expires_at TIMESTAMPTZ NOT NULL, + created_at TIMESTAMPTZ, + updated_at TIMESTAMPTZ ); CREATE INDEX idx_auth_bff_sessions_expires_at ON "AuthBffSessions" (expires_at); @@ -1111,20 +1111,20 @@ CREATE INDEX idx_auth_bff_sessions_expires_at ON "AuthBffSessions" (expires_at); CREATE TABLE IF NOT EXISTS "AuthInteractionStates" ( uid VARCHAR(255) PRIMARY KEY NOT NULL, payload TEXT NOT NULL, - expires_at TIMESTAMP NOT NULL, - created_at TIMESTAMP, - updated_at TIMESTAMP + expires_at TIMESTAMPTZ NOT NULL, + created_at TIMESTAMPTZ, + updated_at TIMESTAMPTZ ); CREATE INDEX idx_auth_interaction_states_expires_at ON "AuthInteractionStates" (expires_at); CREATE TABLE IF NOT EXISTS "AuthBootstrapMeta" ( id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, - completed_at TIMESTAMP(0), + completed_at TIMESTAMPTZ(0), bootstrap_admin_user_id VARCHAR(36), 
session_secret_ref TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (bootstrap_admin_user_id) REFERENCES "AuthUsers" (id) ON DELETE SET NULL ); @@ -1142,16 +1142,16 @@ CREATE TABLE IF NOT EXISTS "AuthPolicy" ( refresh_token_ttl_seconds INT DEFAULT 3600, refresh_rotation BOOLEAN DEFAULT true, max_concurrent_sessions INT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE TABLE IF NOT EXISTS "FogPlatformSpecs" ( fog_uuid VARCHAR(36) PRIMARY KEY NOT NULL, spec_json TEXT NOT NULL, generation INT NOT NULL DEFAULT 1, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (fog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -1160,10 +1160,10 @@ CREATE TABLE IF NOT EXISTS "FogPlatformStatuses" ( observed_generation INT NOT NULL DEFAULT 0, phase VARCHAR(32) NOT NULL DEFAULT 'Pending', last_error TEXT, - last_transition_at TIMESTAMP(0), + last_transition_at TIMESTAMPTZ(0), conditions_json TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (fog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -1174,12 +1174,12 @@ CREATE TABLE IF NOT EXISTS "FogPlatformReconcileTasks" ( spec_generation INT, status VARCHAR(32) NOT NULL DEFAULT 'pending', leader_uuid VARCHAR(36), - claimed_at TIMESTAMP(0), - next_attempt_at TIMESTAMP(0), + claimed_at TIMESTAMPTZ(0), + next_attempt_at TIMESTAMPTZ(0), attempts INT NOT NULL DEFAULT 0, last_error TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0), FOREIGN KEY (fog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE ); @@ -1194,22 +1194,34 @@ CREATE TABLE IF NOT EXISTS "ServicePlatformReconcileTasks" ( spec_snapshot TEXT, status VARCHAR(32) NOT NULL DEFAULT 'pending', leader_uuid 
VARCHAR(36), - claimed_at TIMESTAMP(0), - next_attempt_at TIMESTAMP(0), + claimed_at TIMESTAMPTZ(0), + next_attempt_at TIMESTAMPTZ(0), attempts INT NOT NULL DEFAULT 0, last_error TEXT, - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); CREATE UNIQUE INDEX idx_service_platform_reconcile_tasks_active_service_name ON "ServicePlatformReconcileTasks" (service_name) WHERE status IN ('pending', 'in_progress'); CREATE INDEX idx_service_platform_reconcile_tasks_status_claimed ON "ServicePlatformReconcileTasks" (status, claimed_at); CREATE INDEX idx_service_platform_reconcile_tasks_next_attempt ON "ServicePlatformReconcileTasks" (next_attempt_at); +CREATE TABLE IF NOT EXISTS "ReconcileOutbox" ( + id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, + kind VARCHAR(32) NOT NULL, + payload TEXT NOT NULL, + idempotency_key VARCHAR(255) NOT NULL UNIQUE, + created_at TIMESTAMPTZ(0), + processed_at TIMESTAMPTZ(0), + last_error TEXT +); + +CREATE INDEX idx_reconcile_outbox_unprocessed ON "ReconcileOutbox" (processed_at, id); + CREATE TABLE IF NOT EXISTS "HubRouterConfigLocks" ( id INT PRIMARY KEY NOT NULL CHECK (id = 1), leader_uuid VARCHAR(36), - claimed_at TIMESTAMP(0), - created_at TIMESTAMP(0), - updated_at TIMESTAMP(0) + claimed_at TIMESTAMPTZ(0), + created_at TIMESTAMPTZ(0), + updated_at TIMESTAMPTZ(0) ); diff --git a/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql b/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql index 9792e096..bf946f3a 100644 --- a/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql +++ b/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql @@ -118,7 +118,7 @@ CREATE TABLE IF NOT EXISTS Fogs ( deployment_type VARCHAR(36), active_volume_mounts BIGINT DEFAULT 0, volume_mount_last_update BIGINT DEFAULT 0, - warning_message TEXT DEFAULT 'HEALTHY', + warning_message TEXT, gps_device VARCHAR(36), gps_scan_frequency INT DEFAULT 60, edge_guard_frequency INT DEFAULT 0, 
@@ -737,7 +737,7 @@ CREATE INDEX idx_fog_log_status_session_id ON FogLogStatuses (session_id); CREATE TABLE IF NOT EXISTS RbacRoles ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, name TEXT UNIQUE NOT NULL, - kind TEXT DEFAULT 'Role', + kind TEXT, created_at DATETIME, updated_at DATETIME ); @@ -757,7 +757,7 @@ CREATE TABLE IF NOT EXISTS RbacRoleRules ( CREATE TABLE IF NOT EXISTS RbacRoleBindings ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, name TEXT UNIQUE NOT NULL, - kind TEXT DEFAULT 'RoleBinding', + kind TEXT, role_ref TEXT NOT NULL, subjects TEXT NOT NULL, created_at DATETIME, @@ -1205,6 +1205,18 @@ CREATE UNIQUE INDEX idx_service_platform_reconcile_tasks_active_service_name ON CREATE INDEX idx_service_platform_reconcile_tasks_status_claimed ON ServicePlatformReconcileTasks (status, claimed_at); CREATE INDEX idx_service_platform_reconcile_tasks_next_attempt ON ServicePlatformReconcileTasks (next_attempt_at); +CREATE TABLE IF NOT EXISTS ReconcileOutbox ( + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + kind VARCHAR(32) NOT NULL, + payload TEXT NOT NULL, + idempotency_key VARCHAR(255) NOT NULL UNIQUE, + created_at DATETIME, + processed_at DATETIME, + last_error TEXT +); + +CREATE INDEX idx_reconcile_outbox_unprocessed ON ReconcileOutbox (processed_at, id); + CREATE TABLE IF NOT EXISTS HubRouterConfigLocks ( id INTEGER PRIMARY KEY NOT NULL CHECK (id = 1), leader_uuid VARCHAR(36), diff --git a/src/data/models/configMap.js b/src/data/models/configMap.js index 63a2c088..db766352 100644 --- a/src/data/models/configMap.js +++ b/src/data/models/configMap.js @@ -1,6 +1,11 @@ 'use strict' const SecretHelper = require('../../helpers/secret-helper') +const { + scheduleVaultPromoteAfterCommit, + shouldDeferVaultStore +} = require('../../helpers/vault-transaction-helper') +const models = require('../models') module.exports = (sequelize, DataTypes) => { const ConfigMap = sequelize.define('ConfigMap', { @@ -59,21 +64,35 @@ module.exports = (sequelize, DataTypes) => { } 
], hooks: { - beforeSave: async (configMap) => { + beforeSave: async (configMap, options) => { if (configMap.changed('data')) { - // Get useVault value - prioritize getDataValue (for updates), then property, default to true let useVault = configMap.getDataValue('useVault') - // If getDataValue returns undefined/null, try the property (for new instances) if (useVault === undefined || useVault === null) { useVault = configMap.useVault !== undefined && configMap.useVault !== null ? configMap.useVault : true } - // Ensure boolean type useVault = Boolean(useVault) + const plainData = configMap.data + const transaction = options.transaction + + if (transaction && shouldDeferVaultStore('configmap', useVault)) { + configMap.data = await SecretHelper.encryptSecretInternal(plainData, configMap.name) + scheduleVaultPromoteAfterCommit(transaction, { + secretData: plainData, + secretName: configMap.name, + secretType: 'configmap', + useVault, + model: () => models.ConfigMap, + where: { name: configMap.name }, + field: 'data' + }) + return + } + const encryptedData = await SecretHelper.encryptSecret( - configMap.data, + plainData, configMap.name, 'configmap', useVault diff --git a/src/data/models/index.js b/src/data/models/index.js index a6f323fa..ccfa2a9c 100644 --- a/src/data/models/index.js +++ b/src/data/models/index.js @@ -78,8 +78,11 @@ db.initDB = async (isStart) => { // Initialize RBAC cache version if it doesn't exist try { const RbacCacheVersionManager = require('../managers/rbac-cache-version-manager') - const fakeTransaction = { fakeTransaction: true } - await RbacCacheVersionManager.initializeVersion(fakeTransaction) + const { runInTransaction } = require('../../helpers/transaction-runner') + await runInTransaction( + (transaction) => RbacCacheVersionManager.initializeVersion(transaction), + { label: 'init-rbac-cache-version' } + ) logger.info('RBAC cache version initialized') } catch (error) { logger.warn(`Failed to initialize RBAC cache version: ${error.message}. 
Continuing...`) @@ -94,12 +97,26 @@ db.initDB = async (isStart) => { // Initialize controller UUID try { const ClusterControllerService = require('../../services/cluster-controller-service') - const fakeTransaction = { fakeTransaction: true } - await ClusterControllerService.initializeControllerUuid(fakeTransaction) + const { runInTransaction } = require('../../helpers/transaction-runner') + await runInTransaction( + (transaction) => ClusterControllerService.initializeControllerUuid(transaction), + { label: 'init-controller-uuid' } + ) logger.info('Controller UUID initialized') } catch (error) { logger.warn(`Failed to initialize controller UUID: ${error.message}. Continuing...`) } + + const { initDbMetrics } = require('../../helpers/db-metrics') + const { getProviderName, getWriteQueueDepth } = require('../../helpers/transaction-runner') + initDbMetrics(databaseProvider.sequelize, getProviderName(), { getWriteQueueDepth }) + + try { + const { checkSqliteFogCountWarning } = require('../../helpers/sqlite-fog-warning') + await checkSqliteFogCountWarning() + } catch (error) { + logger.warn(`Failed sqlite fog count warning check: ${error.message}. 
Continuing...`) + } } } diff --git a/src/data/models/microserviceExecSession.js b/src/data/models/microserviceExecSession.js index 5dfb01da..e9d9e59c 100644 --- a/src/data/models/microserviceExecSession.js +++ b/src/data/models/microserviceExecSession.js @@ -14,7 +14,7 @@ module.exports = (sequelize, DataTypes) => { allowNull: false }, sessionId: { - type: DataTypes.TEXT, + type: DataTypes.STRING(255), field: 'session_id', allowNull: false, unique: true diff --git a/src/data/models/reconcileOutbox.js b/src/data/models/reconcileOutbox.js new file mode 100644 index 00000000..0e1fd746 --- /dev/null +++ b/src/data/models/reconcileOutbox.js @@ -0,0 +1,54 @@ +'use strict' + +const RECONCILE_OUTBOX_KINDS = ['nats', 'fog_platform', 'service_platform'] + +module.exports = (sequelize, DataTypes) => { + const ReconcileOutbox = sequelize.define('ReconcileOutbox', { + id: { + type: DataTypes.INTEGER, + primaryKey: true, + autoIncrement: true, + allowNull: false, + field: 'id' + }, + kind: { + type: DataTypes.STRING(32), + allowNull: false, + field: 'kind', + validate: { + isIn: [RECONCILE_OUTBOX_KINDS] + } + }, + payload: { + type: DataTypes.TEXT, + allowNull: false, + field: 'payload' + }, + idempotencyKey: { + type: DataTypes.STRING(255), + allowNull: false, + unique: true, + field: 'idempotency_key' + }, + processedAt: { + type: DataTypes.DATE, + allowNull: true, + field: 'processed_at' + }, + lastError: { + type: DataTypes.TEXT, + allowNull: true, + field: 'last_error' + } + }, { + tableName: 'ReconcileOutbox', + timestamps: true, + createdAt: 'created_at', + updatedAt: false, + underscored: true + }) + + return ReconcileOutbox +} + +module.exports.RECONCILE_OUTBOX_KINDS = RECONCILE_OUTBOX_KINDS diff --git a/src/data/models/registry.js b/src/data/models/registry.js index 93f05c8d..5a721d2c 100644 --- a/src/data/models/registry.js +++ b/src/data/models/registry.js @@ -1,6 +1,11 @@ 'use strict' const SecretHelper = require('../../helpers/secret-helper') +const { + 
scheduleVaultPromoteAfterCommit, + shouldDeferVaultStore +} = require('../../helpers/vault-transaction-helper') +const models = require('../models') // Minimum length for internal encryption format: base64(salt(16) + iv(12) + tag(16) + encrypted) const INTERNAL_ENCRYPTED_MIN_LENGTH = 60 @@ -63,7 +68,7 @@ module.exports = (sequelize, DataTypes) => { timestamps: false, underscored: true, hooks: { - beforeSave: async (registry) => { + beforeSave: async (registry, options) => { if (!registry.changed('password')) return const password = registry.password if (isPasswordEmpty(password)) { @@ -76,9 +81,27 @@ module.exports = (sequelize, DataTypes) => { if (SecretHelper.isVaultReference(password) || looksLikeInternalEncrypted(password)) { return } + + const transaction = options.transaction + const secretName = 'registry-' + registry.id + const secretData = { value: password } + + if (transaction && shouldDeferVaultStore('registry')) { + registry.password = await SecretHelper.encryptSecretInternal(secretData, secretName) + scheduleVaultPromoteAfterCommit(transaction, { + secretData, + secretName, + secretType: 'registry', + model: () => models.Registry, + where: { id: registry.id }, + field: 'password' + }) + return + } + const encrypted = await SecretHelper.encryptSecret( - { value: password }, - 'registry-' + registry.id, + secretData, + secretName, 'registry' ) registry.password = encrypted diff --git a/src/data/models/secret.js b/src/data/models/secret.js index 8bfd3ad2..978c8b70 100644 --- a/src/data/models/secret.js +++ b/src/data/models/secret.js @@ -1,6 +1,11 @@ 'use strict' const SecretHelper = require('../../helpers/secret-helper') +const { + scheduleVaultPromoteAfterCommit, + shouldDeferVaultStore +} = require('../../helpers/vault-transaction-helper') +const models = require('../models') module.exports = (sequelize, DataTypes) => { const Secret = sequelize.define('Secret', { @@ -57,10 +62,26 @@ module.exports = (sequelize, DataTypes) => { } ], hooks: { - 
beforeSave: async (secret) => { + beforeSave: async (secret, options) => { if (secret.changed('data')) { + const plainData = secret.data + const transaction = options.transaction + + if (transaction && shouldDeferVaultStore(secret.type)) { + secret.data = await SecretHelper.encryptSecretInternal(plainData, secret.name) + scheduleVaultPromoteAfterCommit(transaction, { + secretData: plainData, + secretName: secret.name, + secretType: secret.type, + model: () => models.Secret, + where: { name: secret.name }, + field: 'data' + }) + return + } + const encryptedData = await SecretHelper.encryptSecret( - secret.data, + plainData, secret.name, secret.type ) diff --git a/src/data/providers/mysql.js b/src/data/providers/mysql.js index fe4260b6..2c5fb5d4 100644 --- a/src/data/providers/mysql.js +++ b/src/data/providers/mysql.js @@ -36,6 +36,7 @@ class MySqlDatabaseProvider extends DatabaseProvider { } // Sequelize configuration + const poolConfig = mysqlConfig.pool || {} const sequelizeConfig = { dialect: 'mysql', host: connectionOptions.host, @@ -46,6 +47,12 @@ class MySqlDatabaseProvider extends DatabaseProvider { dialectOptions: { connectTimeout: connectionOptions.connectTimeout }, + pool: { + max: poolConfig.max != null ? poolConfig.max : 10, + min: poolConfig.min != null ? poolConfig.min : 0, + idle: poolConfig.idle != null ? 
poolConfig.idle : 20000 + }, + timezone: '+00:00', logging: false } diff --git a/src/data/providers/postgres.js b/src/data/providers/postgres.js index 1fdbeb0f..f351c312 100644 --- a/src/data/providers/postgres.js +++ b/src/data/providers/postgres.js @@ -36,6 +36,7 @@ class PostgresDatabaseProvider extends DatabaseProvider { } // Sequelize configuration + const poolConfig = postgresConfig.pool || {} const sequelizeConfig = { dialect: 'postgres', host: connectionOptions.host, @@ -46,6 +47,12 @@ class PostgresDatabaseProvider extends DatabaseProvider { dialectOptions: { connectTimeout: connectionOptions.connectTimeout }, + pool: { + max: poolConfig.max != null ? poolConfig.max : 10, + min: poolConfig.min != null ? poolConfig.min : 0, + idle: poolConfig.idle != null ? poolConfig.idle : 20000 + }, + timezone: '+00:00', logging: false } // Add SSL configuration to Sequelize if enabled diff --git a/src/data/stores/sequelize-session-store.js b/src/data/stores/sequelize-session-store.js index d54359ee..dc6f9b6e 100644 --- a/src/data/stores/sequelize-session-store.js +++ b/src/data/stores/sequelize-session-store.js @@ -37,7 +37,7 @@ class SequelizeSessionStore extends Store { sid, data, expiresAt - }) + }, { conflictFields: ['sid'] }) .then(() => callback(null)) .catch((error) => callback(error)) } From 8f9212ebaebc9616a56d28b2c5338c19f3e2c845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:37:47 +0300 Subject: [PATCH 04/32] Add ReconcileOutbox transactional outbox with background drainer. Enqueue fog, service, and NATS reconcile work atomically with mutations; use FOR UPDATE SKIP LOCKED task claims on mysql/postgres. 
--- .../fog-platform-reconcile-task-manager.js | 77 +++-- .../managers/nats-reconcile-task-manager.js | 53 ++-- src/data/managers/reconcile-outbox-manager.js | 151 ++++++++++ ...service-platform-reconcile-task-manager.js | 60 +--- src/helpers/job-startup.js | 3 +- src/helpers/reconcile-outbox-keys.js | 54 ++++ src/jobs/fog-platform-sweep-job.js | 270 +++++++++++++----- src/jobs/platform-reconcile-worker-job.js | 63 +++- src/jobs/reconcile-outbox-drainer-job.js | 78 +++++ test/src/data/reconcile-outbox.test.js | 187 ++++++++++++ test/src/data/reconcile-task-claim-ha.test.js | 183 ++++++++++++ .../src/helpers/reconcile-outbox-keys.test.js | 56 ++++ 12 files changed, 1020 insertions(+), 215 deletions(-) create mode 100644 src/data/managers/reconcile-outbox-manager.js create mode 100644 src/helpers/reconcile-outbox-keys.js create mode 100644 src/jobs/reconcile-outbox-drainer-job.js create mode 100644 test/src/data/reconcile-outbox.test.js create mode 100644 test/src/data/reconcile-task-claim-ha.test.js create mode 100644 test/src/helpers/reconcile-outbox-keys.test.js diff --git a/src/data/managers/fog-platform-reconcile-task-manager.js b/src/data/managers/fog-platform-reconcile-task-manager.js index 820b655c..8fbcee3f 100644 --- a/src/data/managers/fog-platform-reconcile-task-manager.js +++ b/src/data/managers/fog-platform-reconcile-task-manager.js @@ -1,13 +1,18 @@ const BaseManager = require('./base-manager') const models = require('../models') const config = require('../../config') -const databaseProvider = require('../providers/database-factory') const { Op } = require('sequelize') const { FOG_PLATFORM_REASONS } = require('../../schemas/fog-platform-spec') const { withDbBusyRetry } = require('../../helpers/db-busy-retry') +const { claimNextReconcileTask } = require('../../helpers/db-dialect') const ACTIVE_STATUSES = ['pending', 'in_progress'] +const FOG_TASK_SELECT_SQL = `SELECT id, fog_uuid AS fogUuid, reason, spec_generation AS specGeneration, + status, 
leader_uuid AS leaderUuid, claimed_at AS claimedAt, next_attempt_at AS nextAttemptAt, + attempts, last_error AS lastError, created_at AS createdAt, updated_at AS updatedAt + FROM :table` + class FogPlatformReconcileTaskManager extends BaseManager { getEntity () { return models.FogPlatformReconcileTask @@ -18,12 +23,6 @@ class FogPlatformReconcileTaskManager extends BaseManager { } async enqueueFogPlatformReconcileTask (options = {}, transaction) { - if (transaction.fakeTransaction) { - return databaseProvider.sequelize.transaction((t) => - this.enqueueFogPlatformReconcileTask(options, t) - ) - } - const fogUuid = options.fogUuid if (!fogUuid) { throw new Error('fogUuid is required to enqueue fog platform reconcile task') @@ -42,6 +41,20 @@ class FogPlatformReconcileTaskManager extends BaseManager { }) if (existing) { + if (reason === 'delete' && existing.status === 'in_progress') { + await Entity.update({ + reason: 'delete', + specGeneration, + status: 'pending', + leaderUuid: null, + claimedAt: null, + nextAttemptAt: null, + attempts: 0, + lastError: null + }, { where: { id: existing.id }, transaction }) + return this.findOne({ id: existing.id }, transaction) + } + const update = { specGeneration } if (reason === 'delete' || existing.reason !== 'delete') { update.reason = reason @@ -68,50 +81,24 @@ class FogPlatformReconcileTaskManager extends BaseManager { } async _claimNextFogTaskInternal (controllerUuid, stalenessSeconds) { - const sequelize = databaseProvider.sequelize const T = stalenessSeconds != null ? 
stalenessSeconds : config.get('settings.fogPlatformReconcileTaskStalenessSeconds', 300) + const deleteT = config.get('settings.fogPlatformDeleteReconcileTaskStalenessSeconds', 60) const staleThreshold = new Date(Date.now() - T * 1000) - const Entity = this.getEntity() + const deleteStaleThreshold = new Date(Date.now() - deleteT * 1000) const now = new Date() - return sequelize.transaction(async (transaction) => { - const task = await Entity.findOne({ - where: { - status: { [Op.in]: ACTIVE_STATUSES }, - [Op.or]: [ - { nextAttemptAt: null }, - { nextAttemptAt: { [Op.lte]: now } } - ], - [Op.and]: [{ - [Op.or]: [ - { leaderUuid: null }, - { claimedAt: { [Op.lt]: staleThreshold } } - ] - }] - }, - order: [['id', 'ASC']], - limit: 1, - transaction - }) - if (!task) return null - - const [affected] = await Entity.update( - { leaderUuid: controllerUuid, claimedAt: new Date(), status: 'in_progress' }, - { - where: { - id: task.id, - [Op.or]: [ - { leaderUuid: null }, - { claimedAt: { [Op.lt]: staleThreshold } } - ] - }, - transaction - } - ) - if (affected === 0) return null - return this.findOne({ id: task.id }, transaction) + return claimNextReconcileTask({ + Entity: this.getEntity(), + controllerUuid, + staleThreshold, + deleteStaleThreshold, + now, + activeStatuses: ACTIVE_STATUSES, + includeNextAttemptFilter: true, + selectSql: FOG_TASK_SELECT_SQL, + reloadTask: (id, transaction) => this.findOne({ id }, transaction) }) } diff --git a/src/data/managers/nats-reconcile-task-manager.js b/src/data/managers/nats-reconcile-task-manager.js index 89569e1b..3df3adfb 100644 --- a/src/data/managers/nats-reconcile-task-manager.js +++ b/src/data/managers/nats-reconcile-task-manager.js @@ -1,9 +1,16 @@ const BaseManager = require('./base-manager') const models = require('../models') const config = require('../../config') -const databaseProvider = require('../providers/database-factory') -const { Op } = require('sequelize') const { withDbBusyRetry } = 
require('../../helpers/db-busy-retry') +const { claimNextReconcileTask } = require('../../helpers/db-dialect') + +const ACTIVE_STATUSES = ['pending', 'in_progress'] + +const NATS_TASK_SELECT_SQL = `SELECT id, reason, application_id AS applicationId, + account_rule_id AS accountRuleId, user_rule_id AS userRuleId, fog_uuids AS fogUuids, + status, leader_uuid AS leaderUuid, claimed_at AS claimedAt, + created_at AS createdAt, updated_at AS updatedAt + FROM :table` class NatsReconcileTaskManager extends BaseManager { getEntity () { @@ -15,39 +22,19 @@ class NatsReconcileTaskManager extends BaseManager { } async _claimNextInternal (controllerUuid, stalenessSeconds) { - const sequelize = databaseProvider.sequelize const T = stalenessSeconds != null ? stalenessSeconds : config.get('settings.natsReconcileTaskStalenessSeconds', 900) const staleThreshold = new Date(Date.now() - T * 1000) - const Entity = this.getEntity() - return sequelize.transaction(async (transaction) => { - const task = await Entity.findOne({ - where: { - status: { [Op.in]: ['pending', 'in_progress'] }, - [Op.or]: [ - { leaderUuid: null }, - { claimedAt: { [Op.lt]: staleThreshold } } - ] - }, - order: [['id', 'ASC']], - limit: 1, - transaction - }) - if (!task) return null - const [affected] = await Entity.update( - { leaderUuid: controllerUuid, claimedAt: new Date(), status: 'in_progress' }, - { - where: { - id: task.id, - [Op.or]: [ - { leaderUuid: null }, - { claimedAt: { [Op.lt]: staleThreshold } } - ] - }, - transaction - } - ) - if (affected === 0) return null - return this.findOne({ id: task.id }, transaction) + const now = new Date() + + return claimNextReconcileTask({ + Entity: this.getEntity(), + controllerUuid, + staleThreshold, + now, + activeStatuses: ACTIVE_STATUSES, + includeNextAttemptFilter: false, + selectSql: NATS_TASK_SELECT_SQL, + reloadTask: (id, transaction) => this.findOne({ id }, transaction) }) } } diff --git a/src/data/managers/reconcile-outbox-manager.js 
b/src/data/managers/reconcile-outbox-manager.js new file mode 100644 index 00000000..5b156402 --- /dev/null +++ b/src/data/managers/reconcile-outbox-manager.js @@ -0,0 +1,151 @@ +const BaseManager = require('./base-manager') +const models = require('../models') +const { buildIdempotencyKey } = require('../../helpers/reconcile-outbox-keys') + +class ReconcileOutboxManager extends BaseManager { + getEntity () { + return models.ReconcileOutbox + } + + _isUniqueConstraintError (error) { + return error && error.name === 'SequelizeUniqueConstraintError' + } + + _serializePayload (payload) { + return JSON.stringify(payload) + } + + _parsePayload (row) { + if (!row || row.payload == null) { + return null + } + return JSON.parse(row.payload) + } + + async _reopenProcessedRow (existing, kind, serializedPayload, transaction) { + await this.update({ id: existing.id }, { + kind, + payload: serializedPayload, + processedAt: null, + lastError: null + }, transaction) + return this.findOne({ id: existing.id }, transaction) + } + + async _resolveExistingEnqueue (existing, kind, serializedPayload, transaction) { + if (existing.processedAt == null) { + return existing + } + return this._reopenProcessedRow(existing, kind, serializedPayload, transaction) + } + + async enqueue (kind, payload, idempotencyKey, transaction) { + const serializedPayload = this._serializePayload(payload) + const existing = await this.findOne({ idempotencyKey }, transaction) + if (existing) { + return this._resolveExistingEnqueue(existing, kind, serializedPayload, transaction) + } + + const sequelize = models.sequelize + const useSavepoint = sequelize.getDialect() === 'postgres' + const savepointName = useSavepoint ? 
`sp_outbox_${Math.random().toString(36).slice(2, 10)}` : null + + try { + if (useSavepoint) { + await sequelize.query(`SAVEPOINT ${savepointName}`, { transaction }) + } + const row = await this.create({ + kind, + payload: serializedPayload, + idempotencyKey, + processedAt: null, + lastError: null + }, transaction) + if (useSavepoint) { + await sequelize.query(`RELEASE SAVEPOINT ${savepointName}`, { transaction }) + } + return row + } catch (error) { + if (this._isUniqueConstraintError(error)) { + if (useSavepoint) { + await sequelize.query(`ROLLBACK TO SAVEPOINT ${savepointName}`, { transaction }) + await sequelize.query(`RELEASE SAVEPOINT ${savepointName}`, { transaction }) + } + const raced = await this.findOne({ idempotencyKey }, transaction) + if (raced) { + return this._resolveExistingEnqueue(raced, kind, serializedPayload, transaction) + } + } + throw error + } + } + + async enqueueFogPlatform (payload, transaction) { + const idempotencyKey = buildIdempotencyKey('fog_platform', payload) + return this.enqueue('fog_platform', payload, idempotencyKey, transaction) + } + + async enqueueServicePlatform (payload, transaction) { + const idempotencyKey = buildIdempotencyKey('service_platform', payload) + return this.enqueue('service_platform', payload, idempotencyKey, transaction) + } + + async enqueueNats (payload, transaction) { + if (payload && payload.triggerReconcile === false) { + return null + } + const { triggerReconcile, ...rest } = payload || {} + const idempotencyKey = buildIdempotencyKey('nats', rest) + return this.enqueue('nats', rest, idempotencyKey, transaction) + } + + async claimUnprocessed (limit, transaction) { + const sequelize = models.sequelize + const dialect = sequelize.getDialect() + const Entity = this.getEntity() + const safeLimit = Math.max(1, limit || 1) + + if (dialect === 'sqlite') { + return Entity.findAll({ + where: { processedAt: null }, + order: [['id', 'ASC']], + limit: safeLimit, + transaction + }) + } + + const tableName = 
Entity.getTableName() + const quotedTable = dialect === 'postgres' ? `"${tableName}"` : `\`${tableName}\`` + const rows = await sequelize.query( + `SELECT id, kind, payload, idempotency_key AS idempotencyKey, created_at AS createdAt, processed_at AS processedAt, last_error AS lastError + FROM ${quotedTable} + WHERE processed_at IS NULL + ORDER BY id ASC + LIMIT :limit + FOR UPDATE SKIP LOCKED`, + { + replacements: { limit: safeLimit }, + type: sequelize.QueryTypes.SELECT, + transaction + } + ) + + return rows.map((row) => Entity.build(row, { isNewRecord: false })) + } + + async markProcessed (id, transaction) { + await this.update({ id }, { processedAt: new Date(), lastError: null }, transaction) + return this.findOne({ id }, transaction) + } + + async markFailed (id, errorMessage, transaction) { + await this.update({ id }, { lastError: errorMessage }, transaction) + return this.findOne({ id }, transaction) + } + + parsePayload (row) { + return this._parsePayload(row) + } +} + +module.exports = new ReconcileOutboxManager() diff --git a/src/data/managers/service-platform-reconcile-task-manager.js b/src/data/managers/service-platform-reconcile-task-manager.js index 7bd91105..953a9d76 100644 --- a/src/data/managers/service-platform-reconcile-task-manager.js +++ b/src/data/managers/service-platform-reconcile-task-manager.js @@ -1,7 +1,6 @@ const BaseManager = require('./base-manager') const models = require('../models') const config = require('../../config') -const databaseProvider = require('../providers/database-factory') const { Op } = require('sequelize') const { SERVICE_PLATFORM_REASONS, @@ -9,9 +8,15 @@ const { parseSpecSnapshot } = require('../../schemas/fog-platform-spec') const { withDbBusyRetry } = require('../../helpers/db-busy-retry') +const { claimNextReconcileTask } = require('../../helpers/db-dialect') const ACTIVE_STATUSES = ['pending', 'in_progress'] +const SERVICE_TASK_SELECT_SQL = `SELECT id, service_name AS serviceName, reason, spec_snapshot AS 
specSnapshot, + status, leader_uuid AS leaderUuid, claimed_at AS claimedAt, next_attempt_at AS nextAttemptAt, + attempts, last_error AS lastError, created_at AS createdAt, updated_at AS updatedAt + FROM :table` + class ServicePlatformReconcileTaskManager extends BaseManager { getEntity () { return models.ServicePlatformReconcileTask @@ -29,12 +34,6 @@ class ServicePlatformReconcileTaskManager extends BaseManager { } async enqueueServicePlatformReconcileTask (options = {}, transaction) { - if (transaction.fakeTransaction) { - return databaseProvider.sequelize.transaction((t) => - this.enqueueServicePlatformReconcileTask(options, t) - ) - } - const serviceName = options.serviceName if (!serviceName) { throw new Error('serviceName is required to enqueue service platform reconcile task') @@ -76,50 +75,21 @@ class ServicePlatformReconcileTaskManager extends BaseManager { } async _claimNextServiceTaskInternal (controllerUuid, stalenessSeconds) { - const sequelize = databaseProvider.sequelize const T = stalenessSeconds != null ? 
stalenessSeconds : config.get('settings.fogPlatformReconcileTaskStalenessSeconds', 300) const staleThreshold = new Date(Date.now() - T * 1000) - const Entity = this.getEntity() const now = new Date() - return sequelize.transaction(async (transaction) => { - const task = await Entity.findOne({ - where: { - status: { [Op.in]: ACTIVE_STATUSES }, - [Op.or]: [ - { nextAttemptAt: null }, - { nextAttemptAt: { [Op.lte]: now } } - ], - [Op.and]: [{ - [Op.or]: [ - { leaderUuid: null }, - { claimedAt: { [Op.lt]: staleThreshold } } - ] - }] - }, - order: [['id', 'ASC']], - limit: 1, - transaction - }) - if (!task) return null - - const [affected] = await Entity.update( - { leaderUuid: controllerUuid, claimedAt: new Date(), status: 'in_progress' }, - { - where: { - id: task.id, - [Op.or]: [ - { leaderUuid: null }, - { claimedAt: { [Op.lt]: staleThreshold } } - ] - }, - transaction - } - ) - if (affected === 0) return null - return this.findOne({ id: task.id }, transaction) + return claimNextReconcileTask({ + Entity: this.getEntity(), + controllerUuid, + staleThreshold, + now, + activeStatuses: ACTIVE_STATUSES, + includeNextAttemptFilter: true, + selectSql: SERVICE_TASK_SELECT_SQL, + reloadTask: (id, transaction) => this.findOne({ id }, transaction) }) } diff --git a/src/helpers/job-startup.js b/src/helpers/job-startup.js index a764b869..97650a3f 100644 --- a/src/helpers/job-startup.js +++ b/src/helpers/job-startup.js @@ -5,7 +5,8 @@ const RECONCILE_HEAVY_JOBS = new Set([ 'platform-reconcile-worker-job.js', 'nats-reconcile-worker-job.js', 'fog-platform-sweep-job.js', - 'fog-status-job.js' + 'fog-status-job.js', + 'reconcile-outbox-drainer-job.js' ]) const JOB_STAGGER_MS = 500 diff --git a/src/helpers/reconcile-outbox-keys.js b/src/helpers/reconcile-outbox-keys.js new file mode 100644 index 00000000..43f53467 --- /dev/null +++ b/src/helpers/reconcile-outbox-keys.js @@ -0,0 +1,54 @@ +const crypto = require('crypto') + +function stableHash (value) { + return 
crypto.createHash('sha256').update(JSON.stringify(value)).digest('hex').slice(0, 16) +} + +function buildFogPlatformIdempotencyKey (payload = {}) { + const { fogUuid, reason, specGeneration } = payload + return `fp:${fogUuid}:${reason}:${specGeneration != null ? specGeneration : 'null'}` +} + +function buildServicePlatformIdempotencyKey (payload = {}) { + const { serviceName, reason, specSnapshot } = payload + const snapshotPart = specSnapshot != null ? stableHash(specSnapshot) : 'null' + return `sp:${serviceName}:${reason}:${snapshotPart}` +} + +function buildNatsIdempotencyKey (payload = {}) { + const { + reason, + applicationId, + accountRuleId, + userRuleId, + fogUuids + } = payload + + if (Array.isArray(fogUuids) && fogUuids.length > 0) { + const sorted = [...fogUuids].sort().join(',') + return `nats:${reason}:${applicationId ?? 'null'}:${accountRuleId ?? 'null'}:${userRuleId ?? 'null'}:${sorted}` + } + + return `nats:${reason}:${applicationId ?? 'null'}:${accountRuleId ?? 'null'}:${userRuleId ?? 
'null'}` +} + +function buildIdempotencyKey (kind, payload = {}) { + switch (kind) { + case 'fog_platform': + return buildFogPlatformIdempotencyKey(payload) + case 'service_platform': + return buildServicePlatformIdempotencyKey(payload) + case 'nats': + return buildNatsIdempotencyKey(payload) + default: + throw new Error(`Unknown reconcile outbox kind: ${kind}`) + } +} + +module.exports = { + stableHash, + buildFogPlatformIdempotencyKey, + buildServicePlatformIdempotencyKey, + buildNatsIdempotencyKey, + buildIdempotencyKey +} diff --git a/src/jobs/fog-platform-sweep-job.js b/src/jobs/fog-platform-sweep-job.js index b0a37ad8..975f7f6e 100644 --- a/src/jobs/fog-platform-sweep-job.js +++ b/src/jobs/fog-platform-sweep-job.js @@ -9,10 +9,10 @@ const ServiceManager = require('../data/managers/service-manager') const RouterManager = require('../data/managers/router-manager') const NatsInstanceManager = require('../data/managers/nats-instance-manager') const TransactionDecorator = require('../decorators/transaction-decorator') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const IofogService = require('../services/iofog-service') const ServicesService = require('../services/services-service') const K8sClient = require('../utils/k8s-client') -const databaseProvider = require('../data/providers/database-factory') const Config = require('../config') const logger = require('../logger') @@ -31,71 +31,182 @@ async function run () { } } -async function runSweepInternal (transaction) { - const uuid = ClusterControllerService.getCurrentControllerUuid() - if (!uuid) { - return { fogEnqueued: 0, serviceEnqueued: 0 } +async function fetchK8sRouterConfig () { + const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) + if (!configMap || !configMap.data || !configMap.data['skrouterd.json']) { + return null } + return JSON.parse(configMap.data['skrouterd.json']) +} - const execute = async (t) => { - let fogEnqueued = 0 - let 
serviceEnqueued = 0 +function hasK8sServiceHubDrift (service, k8sRouterConfig) { + const listenerName = `${service.name}-listener` + if (!k8sRouterConfig) { + return true + } + return !k8sRouterConfig.some((entry) => + entry[0] === 'tcpListener' && entry[1] && entry[1].name === listenerName + ) +} - const specs = await FogPlatformSpecManager.findAll({}, t) - for (const specRow of specs) { - const shouldEnqueue = await shouldEnqueueFogSweepInternal(specRow.fogUuid, t) - if (!shouldEnqueue) { - continue - } +function hasNonK8sServiceHubDrift (service, defaultRouterConfig) { + if (!defaultRouterConfig) { + return false + } + const listenerName = `${service.name}-listener` + const listeners = defaultRouterConfig.bridges?.tcpListeners || {} + return !listeners[listenerName] +} - const parsedSpec = await FogPlatformSpecManager.getParsedSpec(specRow.fogUuid, t) - await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ - fogUuid: specRow.fogUuid, - reason: 'periodic-sweep', - specGeneration: parsedSpec ? parsedSpec.generation : specRow.generation - }, t) - fogEnqueued += 1 +async function evaluateSweepCandidates (transaction) { + const fogCandidates = [] + const failedServiceCandidates = [] + const readyServicesForDrift = [] + let needsK8sDriftCheck = false + let defaultRouterConfig = null + + const specs = await FogPlatformSpecManager.findAll({}, transaction) + for (const specRow of specs) { + if (!(await shouldEnqueueFogSweepInternal(specRow.fogUuid, transaction))) { + continue } + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(specRow.fogUuid, transaction) + fogCandidates.push({ + fogUuid: specRow.fogUuid, + specGeneration: parsedSpec ? 
parsedSpec.generation : specRow.generation + }) + } - const services = await ServiceManager.findAllWithTags({}, t) - for (const service of services) { - const shouldEnqueue = await shouldEnqueueServiceSweepInternal(service, t) - if (!shouldEnqueue) { - continue + const services = await ServiceManager.findAllWithTags({}, transaction) + for (const service of services) { + if (service.provisioningStatus === 'failed') { + if (await shouldEnqueueFailedServiceSweep(service, transaction)) { + failedServiceCandidates.push(service) } + continue + } - const specSnapshot = { - name: service.name, - type: service.type, - resource: service.resource, - defaultBridge: service.defaultBridge, - bridgePort: service.bridgePort, - targetPort: service.targetPort, - servicePort: service.servicePort, - k8sType: service.k8sType, - serviceEndpoint: service.serviceEndpoint, - tags: (service.tags || []).map((tag) => (typeof tag === 'string' ? tag : tag.value)) + if (service.provisioningStatus === 'ready') { + readyServicesForDrift.push(service) + } + } + + if (readyServicesForDrift.length > 0) { + const isK8s = await ServicesService.checkKubernetesEnvironment() + if (isK8s) { + needsK8sDriftCheck = true + } else { + try { + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + if (defaultRouter) { + defaultRouterConfig = await IofogService._getRouterMicroserviceConfig(defaultRouter.iofogUuid, transaction) + } + } catch (error) { + defaultRouterConfig = null } - await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask({ - serviceName: service.name, - reason: 'periodic-sweep', - specSnapshot - }, t) - serviceEnqueued += 1 } + } + + return { + fogCandidates, + failedServiceCandidates, + readyServicesForDrift, + needsK8sDriftCheck, + defaultRouterConfig + } +} + +async function persistSweepEnqueue (evaluation, k8sRouterConfig, isK8s, transaction) { + let fogEnqueued = 0 + let serviceEnqueued = 0 + + for (const candidate of 
evaluation.fogCandidates) { + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: candidate.fogUuid, + reason: 'periodic-sweep', + specGeneration: candidate.specGeneration + }, transaction) + fogEnqueued += 1 + } + + for (const service of evaluation.failedServiceCandidates) { + await enqueueServiceSweepTask(service, transaction) + serviceEnqueued += 1 + } - if (fogEnqueued > 0 || serviceEnqueued > 0) { - logger.info('Fog platform sweep enqueued reconcile tasks', { fogEnqueued, serviceEnqueued }) + for (const service of evaluation.readyServicesForDrift) { + const hasDrift = isK8s + ? hasK8sServiceHubDrift(service, k8sRouterConfig) + : hasNonK8sServiceHubDrift(service, evaluation.defaultRouterConfig) + + if (!hasDrift || await hasActiveServiceTask(service.name, transaction)) { + continue } - return { fogEnqueued, serviceEnqueued } + await enqueueServiceSweepTask(service, transaction) + serviceEnqueued += 1 + } + + if (fogEnqueued > 0 || serviceEnqueued > 0) { + logger.info('Fog platform sweep enqueued reconcile tasks', { fogEnqueued, serviceEnqueued }) + } + + return { fogEnqueued, serviceEnqueued } +} + +function buildServiceSpecSnapshot (service) { + return { + name: service.name, + type: service.type, + resource: service.resource, + defaultBridge: service.defaultBridge, + bridgePort: service.bridgePort, + targetPort: service.targetPort, + servicePort: service.servicePort, + k8sType: service.k8sType, + serviceEndpoint: service.serviceEndpoint, + tags: (service.tags || []).map((tag) => (typeof tag === 'string' ? 
tag : tag.value)) + } +} + +async function enqueueServiceSweepTask (service, transaction) { + await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask({ + serviceName: service.name, + reason: 'periodic-sweep', + specSnapshot: buildServiceSpecSnapshot(service) + }, transaction) +} + +async function runSweepInternal (transaction) { + const uuid = ClusterControllerService.getCurrentControllerUuid() + if (!uuid) { + return { fogEnqueued: 0, serviceEnqueued: 0 } + } + + const evaluate = (t) => evaluateSweepCandidates(t) + const evaluation = transaction + ? await evaluate(transaction) + : await runInTransaction(evaluate, { priority: PRIORITY_BACKGROUND, label: 'fogPlatformSweep.evaluate' }) + + let k8sRouterConfig = null + const isK8s = evaluation.needsK8sDriftCheck + ? await ServicesService.checkKubernetesEnvironment() + : false + + if (evaluation.needsK8sDriftCheck && isK8s) { + try { + k8sRouterConfig = await fetchK8sRouterConfig() + } catch (error) { + logger.warn('Fog platform sweep K8s config fetch failed', { err: error }) + } } + const persist = (t) => persistSweepEnqueue(evaluation, k8sRouterConfig, isK8s, t) if (transaction) { - return execute(transaction) + return persist(transaction) } - return databaseProvider.sequelize.transaction((t) => execute(t)) + return runInTransaction(persist, { priority: PRIORITY_BACKGROUND, label: 'fogPlatformSweep.persist' }) } async function hasActiveFogTask (fogUuid, transaction) { @@ -215,23 +326,34 @@ async function hasMissingServiceBridges (fogUuid, parsedSpec, transaction) { return false } -async function hasServiceHubDrift (service, transaction) { +async function shouldEnqueueFailedServiceSweep (service, transaction) { + if (await hasActiveServiceTask(service.name, transaction)) { + const task = await ServicePlatformReconcileTaskManager.getEntity().findOne({ + where: { + serviceName: service.name, + status: { [Op.in]: ACTIVE_STATUSES } + }, + transaction + }) + return task ? 
isBackoffElapsed(task.nextAttemptAt) : false + } + return true +} + +async function hasServiceHubDrift (service, transaction, options = {}) { if (service.provisioningStatus !== 'ready') { return false } - const listenerName = `${service.name}-listener` - const isK8s = await ServicesService.checkKubernetesEnvironment() + const isK8s = options.isK8s != null + ? options.isK8s + : await ServicesService.checkKubernetesEnvironment() if (isK8s) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap || !configMap.data || !configMap.data['skrouterd.json']) { - return true - } - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - return !routerConfig.some((entry) => - entry[0] === 'tcpListener' && entry[1] && entry[1].name === listenerName - ) + const k8sRouterConfig = options.k8sRouterConfig !== undefined + ? options.k8sRouterConfig + : await fetchK8sRouterConfig() + return hasK8sServiceHubDrift(service, k8sRouterConfig) } try { @@ -239,9 +361,10 @@ async function hasServiceHubDrift (service, transaction) { if (!defaultRouter) { return false } - const routerConfig = await IofogService._getRouterMicroserviceConfig(defaultRouter.iofogUuid, transaction) - const listeners = routerConfig?.bridges?.tcpListeners || {} - return !listeners[listenerName] + const routerConfig = options.defaultRouterConfig !== undefined + ? 
options.defaultRouterConfig + : await IofogService._getRouterMicroserviceConfig(defaultRouter.iofogUuid, transaction) + return hasNonK8sServiceHubDrift(service, routerConfig) } catch (error) { return false } @@ -281,22 +404,12 @@ async function shouldEnqueueFogSweepInternal (fogUuid, transaction) { return false } -async function shouldEnqueueServiceSweepInternal (service, transaction) { +async function shouldEnqueueServiceSweepInternal (service, transaction, options = {}) { if (service.provisioningStatus === 'failed') { - if (await hasActiveServiceTask(service.name, transaction)) { - const task = await ServicePlatformReconcileTaskManager.getEntity().findOne({ - where: { - serviceName: service.name, - status: { [Op.in]: ACTIVE_STATUSES } - }, - transaction - }) - return task ? isBackoffElapsed(task.nextAttemptAt) : false - } - return true + return shouldEnqueueFailedServiceSweep(service, transaction) } - if (await hasServiceHubDrift(service, transaction)) { + if (await hasServiceHubDrift(service, transaction, options)) { return !(await hasActiveServiceTask(service.name, transaction)) } @@ -305,7 +418,10 @@ async function shouldEnqueueServiceSweepInternal (service, transaction) { module.exports = { run, - runSweep: TransactionDecorator.generateTransaction(runSweepInternal), + runSweep: TransactionDecorator.generateTransaction(runSweepInternal, { priority: PRIORITY_BACKGROUND, label: 'fogPlatformSweep' }), shouldEnqueueFogSweep: TransactionDecorator.generateTransaction(shouldEnqueueFogSweepInternal), - shouldEnqueueServiceSweep: TransactionDecorator.generateTransaction(shouldEnqueueServiceSweepInternal) + shouldEnqueueServiceSweep: TransactionDecorator.generateTransaction(shouldEnqueueServiceSweepInternal), + fetchK8sRouterConfig, + hasK8sServiceHubDrift, + hasNonK8sServiceHubDrift } diff --git a/src/jobs/platform-reconcile-worker-job.js b/src/jobs/platform-reconcile-worker-job.js index d211e7e8..a4e93366 100644 --- a/src/jobs/platform-reconcile-worker-job.js +++ 
b/src/jobs/platform-reconcile-worker-job.js @@ -2,11 +2,12 @@ const ClusterControllerService = require('../services/cluster-controller-service const FogPlatformService = require('../services/fog-platform-service') const ServicePlatformService = require('../services/service-platform-service') const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const FogPlatformStatusManager = require('../data/managers/fog-platform-status-manager') const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') const ServiceManager = require('../data/managers/service-manager') -const databaseProvider = require('../data/providers/database-factory') const Config = require('../config') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const scheduleTime = (Config.get('settings.fogPlatformReconcileWorkerIntervalSeconds', 3)) * 1000 @@ -21,6 +22,34 @@ async function run () { } } +async function isFogDeleteReconcileTask (task, transaction) { + if (task.reason === 'delete') { + return true + } + const status = await FogPlatformStatusManager.getParsedStatus(task.fogUuid, transaction) + return !!(status && status.phase === 'Deleting') +} + +async function runFogReconcileForTask (task) { + if (task.reason === 'delete') { + return FogPlatformService.reconcileFogDelete(task.fogUuid) + } + + const status = await runInTransaction( + (transaction) => FogPlatformStatusManager.getParsedStatus(task.fogUuid, transaction), + { priority: PRIORITY_BACKGROUND, label: 'platformReconcile.fogDeleteCheck' } + ) + if (status && status.phase === 'Deleting') { + return FogPlatformService.reconcileFogDelete(task.fogUuid) + } + + const result = await FogPlatformService.reconcileFog(task.fogUuid) + if (result && result.skipped && result.reason === 'deleting') { + return FogPlatformService.reconcileFogDelete(task.fogUuid) + } + return result 
+} + async function processNextFogTask () { const uuid = ClusterControllerService.getCurrentControllerUuid() if (!uuid) { @@ -46,9 +75,7 @@ async function processNextFogTask () { reason: task.reason }) - const result = task.reason === 'delete' - ? await FogPlatformService.reconcileFogDelete(task.fogUuid) - : await FogPlatformService.reconcileFog(task.fogUuid) + const result = await runFogReconcileForTask(task) logger.info(`Fog platform reconcile task ${task.id} completed`, { fogUuid: task.fogUuid, @@ -56,12 +83,12 @@ async function processNextFogTask () { result }) - await databaseProvider.sequelize.transaction(async (transaction) => { + await runInTransaction(async (transaction) => { await FogPlatformReconcileTaskManager.getEntity().destroy({ where: { id: task.id }, transaction }) - }) + }, { priority: PRIORITY_BACKGROUND, label: 'platformReconcile.fogTaskComplete' }) } catch (error) { logger.error({ err: error, @@ -116,12 +143,12 @@ async function processNextServiceTask () { }) if (task.reason !== 'delete') { - await databaseProvider.sequelize.transaction(async (transaction) => { + await runInTransaction(async (transaction) => { await ServicePlatformReconcileTaskManager.getEntity().destroy({ where: { id: task.id }, transaction }) - }) + }, { priority: PRIORITY_BACKGROUND, label: 'platformReconcile.serviceTaskComplete' }) } } catch (error) { logger.error({ @@ -146,15 +173,21 @@ async function processNextServiceTask () { async function handleFogTaskFailure (task, error) { const errorMessage = error.message || String(error) - await databaseProvider.sequelize.transaction(async (transaction) => { + await runInTransaction(async (transaction) => { await FogPlatformReconcileTaskManager.recordFogTaskFailure( task.id, errorMessage, { attempts: task.attempts }, transaction ) - await FogPlatformService.markReconcileFailed(task.fogUuid, error, transaction) - }) + if (await isFogDeleteReconcileTask(task, transaction)) { + await FogPlatformStatusManager.setPhase(task.fogUuid, 
'Deleting', { + lastError: errorMessage + }, transaction) + } else { + await FogPlatformService.markReconcileFailed(task.fogUuid, error, transaction) + } + }, { priority: PRIORITY_BACKGROUND, label: 'platformReconcile.fogTaskFailure' }) } async function handleServiceTaskFailure (task, error) { @@ -163,7 +196,7 @@ async function handleServiceTaskFailure (task, error) { const nextAttempts = (task.attempts != null ? task.attempts : 0) + 1 const isPermanent = nextAttempts >= maxAttempts - await databaseProvider.sequelize.transaction(async (transaction) => { + await runInTransaction(async (transaction) => { await ServicePlatformReconcileTaskManager.recordServiceTaskFailure( task.id, errorMessage, @@ -181,11 +214,13 @@ async function handleServiceTaskFailure (task, error) { transaction ) } - }) + }, { priority: PRIORITY_BACKGROUND, label: 'platformReconcile.serviceTaskFailure' }) } module.exports = { run, processNextFogTask, - processNextServiceTask + processNextServiceTask, + runFogReconcileForTask, + isFogDeleteReconcileTask } diff --git a/src/jobs/reconcile-outbox-drainer-job.js b/src/jobs/reconcile-outbox-drainer-job.js new file mode 100644 index 00000000..6d6aee59 --- /dev/null +++ b/src/jobs/reconcile-outbox-drainer-job.js @@ -0,0 +1,78 @@ +const config = require('../config') +const logger = require('../logger') +const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') +const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') +const NatsService = require('../services/nats-service') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') + +const DEFAULT_BATCH_SIZE = 32 + +async function run () { + try { + await drainOnce() + } catch (error) { + logger.error('Reconcile outbox drainer error:', error) + } finally { + const intervalSeconds = 
config.get('settings.reconcileOutboxDrainerIntervalSeconds', 1) + setTimeout(run, intervalSeconds * 1000) + } +} + +async function drainRow (row, transaction) { + const payload = ReconcileOutboxManager.parsePayload(row) + if (!payload) { + throw new Error(`Outbox row ${row.id} has empty payload`) + } + + switch (row.kind) { + case 'nats': + await NatsService.enqueueReconcileTask(payload, transaction) + break + case 'fog_platform': + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask(payload, transaction) + break + case 'service_platform': + await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask(payload, transaction) + break + default: + throw new Error(`Unknown reconcile outbox kind: ${row.kind}`) + } +} + +async function drainOnce () { + const batchSize = config.get('settings.reconcileOutboxDrainerBatchSize', DEFAULT_BATCH_SIZE) + + return runInTransaction(async (transaction) => { + const rows = await ReconcileOutboxManager.claimUnprocessed(batchSize, transaction) + if (!rows.length) { + return { processed: 0, failed: 0 } + } + + let processed = 0 + let failed = 0 + + for (const row of rows) { + try { + await drainRow(row, transaction) + await ReconcileOutboxManager.markProcessed(row.id, transaction) + processed += 1 + } catch (error) { + failed += 1 + logger.error(`Reconcile outbox drain failed for row ${row.id}: ${error.message}`) + await ReconcileOutboxManager.markFailed(row.id, error.message, transaction) + } + } + + if (processed > 0 || failed > 0) { + logger.debug('Reconcile outbox drainer batch complete', { processed, failed }) + } + + return { processed, failed } + }, { priority: PRIORITY_BACKGROUND, label: 'reconcileOutboxDrainer' }) +} + +module.exports = { + run, + drainOnce +} diff --git a/test/src/data/reconcile-outbox.test.js b/test/src/data/reconcile-outbox.test.js new file mode 100644 index 00000000..d1c498d3 --- /dev/null +++ b/test/src/data/reconcile-outbox.test.js @@ -0,0 +1,187 @@ +'use strict' + +const 
{ expect } = require('chai') +const fs = require('fs') +const os = require('os') +const path = require('path') +const Sequelize = require('sequelize') +const sinon = require('sinon') + +const databaseProvider = require('../../../src/data/providers/database-factory') +const defineReconcileOutbox = require('../../../src/data/models/reconcileOutbox') +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') +const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const { runInTransaction } = require('../../../src/helpers/transaction-runner') +const { drainOnce } = require('../../../src/jobs/reconcile-outbox-drainer-job') + +describe('reconcile-outbox', () => { + const sandbox = sinon.createSandbox() + let originalDbProvider + let sequelize + let dbPath + let ReconcileOutbox + + beforeEach(async () => { + originalDbProvider = process.env.DB_PROVIDER + delete process.env.DB_PROVIDER + + dbPath = path.join(os.tmpdir(), `controller-outbox-${Date.now()}-${Math.random()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 10000 } + }) + await sequelize.authenticate() + + ReconcileOutbox = defineReconcileOutbox(sequelize, Sequelize.DataTypes) + await ReconcileOutbox.sync() + + const models = require('../../../src/data/models') + models.ReconcileOutbox = ReconcileOutbox + models.sequelize = sequelize + sandbox.stub(databaseProvider, 'sequelize').value(sequelize) + }) + + afterEach(async () => { + sandbox.restore() + if (originalDbProvider === undefined) { + delete process.env.DB_PROVIDER + } else { + process.env.DB_PROVIDER = originalDbProvider + } + if (sequelize) { + await sequelize.close() + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore */ } + } + }) + + it('inserts outbox row in same commit as business write', async () => { + await 
runInTransaction(async (transaction) => { + await sequelize.query('CREATE TABLE IF NOT EXISTS business (id INTEGER PRIMARY KEY, label TEXT)', { transaction }) + await sequelize.query('INSERT INTO business (label) VALUES (\'created\')', { transaction }) + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: 'fog-a', + reason: 'spec-changed', + specGeneration: 1 + }, transaction) + }) + + const rows = await ReconcileOutbox.findAll() + expect(rows).to.have.length(1) + expect(rows[0].kind).to.equal('fog_platform') + expect(rows[0].processedAt).to.be.null + }) + + it('rolls back outbox row when transaction fails', async () => { + try { + await runInTransaction(async (transaction) => { + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: 'fog-b', + reason: 'delete' + }, transaction) + throw new Error('forced rollback') + }) + } catch (error) { + expect(error.message).to.equal('forced rollback') + } + + const rows = await ReconcileOutbox.findAll() + expect(rows).to.have.length(0) + }) + + it('deduplicates enqueue by idempotency key', async () => { + await runInTransaction(async (transaction) => { + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: 'fog-c', + reason: 'spec-changed', + specGeneration: 2 + }, transaction) + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: 'fog-c', + reason: 'spec-changed', + specGeneration: 2 + }, transaction) + }) + + const rows = await ReconcileOutbox.findAll() + expect(rows).to.have.length(1) + }) + + it('re-opens processed outbox row when the same idempotency key is enqueued again', async () => { + await runInTransaction(async (transaction) => { + await ReconcileOutboxManager.enqueueNats({ + reason: 'cluster-routes-changed', + fogUuids: ['fog-other'] + }, transaction) + }) + + await runInTransaction(async (transaction) => { + /* Sequelize reads the transaction from the options object; a second positional argument is ignored. */ + const where = { idempotencyKey: 'nats:cluster-routes-changed:null:null:null:fog-other' } + const row = await ReconcileOutbox.findOne({ where, transaction }) + await
ReconcileOutboxManager.markProcessed(row.id, transaction) + }) + + await runInTransaction(async (transaction) => { + const row = await ReconcileOutboxManager.enqueueNats({ + reason: 'cluster-routes-changed', + fogUuids: ['fog-other'] + }, transaction) + expect(row.processedAt).to.be.null + await sequelize.query('SELECT 1 AS ok', { transaction, type: sequelize.QueryTypes.SELECT }) + }) + + const rows = await ReconcileOutbox.findAll() + expect(rows).to.have.length(1) + expect(rows[0].processedAt).to.be.null + }) + + it('drains unprocessed row into reconcile task and marks processed', async () => { + await runInTransaction(async (transaction) => { + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: 'fog-d', + reason: 'manual-retry', + specGeneration: 5 + }, transaction) + }) + + const enqueueStub = sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves({ id: 99 }) + + const result = await drainOnce() + expect(result.processed).to.equal(1) + expect(result.failed).to.equal(0) + expect(enqueueStub).to.have.been.calledOnceWith({ + fogUuid: 'fog-d', + reason: 'manual-retry', + specGeneration: 5 + }, sinon.match.object) + + const row = await ReconcileOutbox.findOne({ where: { idempotencyKey: 'fp:fog-d:manual-retry:5' } }) + expect(row.processedAt).to.not.be.null + expect(row.lastError).to.be.null + }) + + it('records lastError when drain fails', async () => { + await runInTransaction(async (transaction) => { + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: 'fog-e', + reason: 'delete' + }, transaction) + }) + + sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').rejects(new Error('enqueue failed')) + + const result = await drainOnce() + expect(result.processed).to.equal(0) + expect(result.failed).to.equal(1) + + const row = await ReconcileOutbox.findOne({ where: { idempotencyKey: 'fp:fog-e:delete:null' } }) + expect(row.processedAt).to.be.null + expect(row.lastError).to.equal('enqueue
failed') + }) +}) diff --git a/test/src/data/reconcile-task-claim-ha.test.js b/test/src/data/reconcile-task-claim-ha.test.js new file mode 100644 index 00000000..60ab88b1 --- /dev/null +++ b/test/src/data/reconcile-task-claim-ha.test.js @@ -0,0 +1,183 @@ +'use strict' + +const { expect } = require('chai') +const fs = require('fs') +const os = require('os') +const path = require('path') +const Sequelize = require('sequelize') +const sinon = require('sinon') + +const databaseProvider = require('../../../src/data/providers/database-factory') +const defineFogPlatformReconcileTask = require('../../../src/data/models/fogPlatformReconcileTask') +const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const dbDialect = require('../../../src/helpers/db-dialect') + +describe('reconcile-task-claim-ha', () => { + const sandbox = sinon.createSandbox() + + describe('SKIP LOCKED claim path (mysql/postgres dialect)', () => { + let queryStub + let originalDbProvider + + beforeEach(() => { + originalDbProvider = process.env.DB_PROVIDER + process.env.DB_PROVIDER = 'postgres' + + queryStub = sandbox.stub(databaseProvider.sequelize, 'query').resolves([]) + sandbox.stub(databaseProvider.sequelize, 'getDialect').returns('postgres') + sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns({ + getTableName: () => 'FogPlatformReconcileTasks', + build: (row) => row, + update: sandbox.stub().resolves([0]), + findOne: sandbox.stub().resolves(null) + }) + }) + + afterEach(() => { + if (originalDbProvider === undefined) { + delete process.env.DB_PROVIDER + } else { + process.env.DB_PROVIDER = originalDbProvider + } + sandbox.restore() + }) + + it('uses FOR UPDATE SKIP LOCKED when dialect supports it', async () => { + await FogPlatformReconcileTaskManager.claimNextFogTask('controller-a', 300) + + expect(queryStub).to.have.been.calledOnce 
+ const sql = queryStub.firstCall.args[0] + expect(sql).to.include('FOR UPDATE SKIP LOCKED') + expect(sql).to.include('"FogPlatformReconcileTasks"') + }) + }) + + describe('sqlite claim path', () => { + let sequelize + let dbPath + let FogPlatformReconcileTask + + beforeEach(async () => { + dbPath = path.join(os.tmpdir(), `controller-claim-${Date.now()}-${Math.random()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 10000 } + }) + await sequelize.authenticate() + + FogPlatformReconcileTask = defineFogPlatformReconcileTask(sequelize, Sequelize.DataTypes) + await FogPlatformReconcileTask.sync() + + const models = require('../../../src/data/models') + models.FogPlatformReconcileTask = FogPlatformReconcileTask + models.sequelize = sequelize + sandbox.stub(databaseProvider, 'sequelize').value(sequelize) + }) + + afterEach(async () => { + sandbox.restore() + if (sequelize) { + await sequelize.close() + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore */ } + } + }) + + it('claims each pending task at most once under concurrent claimers', async () => { + await FogPlatformReconcileTask.bulkCreate([ + { fogUuid: 'fog-1', reason: 'spec-changed', status: 'pending' }, + { fogUuid: 'fog-2', reason: 'spec-changed', status: 'pending' }, + { fogUuid: 'fog-3', reason: 'spec-changed', status: 'pending' } + ]) + + const claims = await Promise.all([ + FogPlatformReconcileTaskManager.claimNextFogTask('controller-a', 300), + FogPlatformReconcileTaskManager.claimNextFogTask('controller-b', 300), + FogPlatformReconcileTaskManager.claimNextFogTask('controller-c', 300) + ]) + + const claimedIds = claims.filter(Boolean).map((task) => task.id) + expect(claimedIds).to.have.length(3) + expect(new Set(claimedIds).size).to.equal(3) + + const leaders = await FogPlatformReconcileTask.findAll({ + where: { status: 'in_progress' }, + order: [['id', 'ASC']] 
+ }) + expect(leaders).to.have.length(3) + expect(new Set(leaders.map((row) => row.leaderUuid)).size).to.equal(3) + }) + }) + + describe('concurrent claims on mysql/postgres (integration)', function () { + const haUrl = process.env.RECONCILE_CLAIM_HA_URL + + before(function () { + if (!haUrl) { + this.skip() + } + }) + + it('claims each task at most once with two parallel connections', async function () { + this.timeout(30000) + + const dialect = process.env.RECONCILE_CLAIM_HA_DIALECT || 'postgres' + const sequelizeA = new Sequelize(haUrl, { dialect, logging: false }) + const sequelizeB = new Sequelize(haUrl, { dialect, logging: false }) + + const FogTaskA = defineFogPlatformReconcileTask(sequelizeA, Sequelize.DataTypes) + const FogTaskB = defineFogPlatformReconcileTask(sequelizeB, Sequelize.DataTypes) + await FogTaskA.sync({ force: true }) + + await FogTaskA.bulkCreate([ + { fogUuid: 'fog-ha-1', reason: 'spec-changed', status: 'pending' }, + { fogUuid: 'fog-ha-2', reason: 'spec-changed', status: 'pending' } + ]) + + const claimWithConnection = async (sequelize, controllerUuid) => { + return sequelize.transaction(async (transaction) => { + const rows = await sequelize.query( + `SELECT id FROM "FogPlatformReconcileTasks" + WHERE status IN ('pending', 'in_progress') + AND (leader_uuid IS NULL) + ORDER BY id ASC + LIMIT 1 + FOR UPDATE SKIP LOCKED`, + { type: sequelize.QueryTypes.SELECT, transaction } + ) + if (!rows.length) { + return null + } + await sequelize.query( + `UPDATE "FogPlatformReconcileTasks" + SET leader_uuid = :leader, claimed_at = NOW(), status = 'in_progress' + WHERE id = :id AND leader_uuid IS NULL`, + { + replacements: { leader: controllerUuid, id: rows[0].id }, + transaction + } + ) + return rows[0].id + }) + } + + const [idA, idB] = await Promise.all([ + claimWithConnection(sequelizeA, 'replica-a'), + claimWithConnection(sequelizeB, 'replica-b') + ]) + + expect(idA).to.be.a('number') + expect(idB).to.be.a('number') + expect(idA).to.not.equal(idB) 
+ + await sequelizeA.close() + await sequelizeB.close() + }) + }) +}) diff --git a/test/src/helpers/reconcile-outbox-keys.test.js b/test/src/helpers/reconcile-outbox-keys.test.js new file mode 100644 index 00000000..1ec8b5e4 --- /dev/null +++ b/test/src/helpers/reconcile-outbox-keys.test.js @@ -0,0 +1,56 @@ +const { expect } = require('chai') +const crypto = require('crypto') +const { + buildFogPlatformIdempotencyKey, + buildServicePlatformIdempotencyKey, + buildNatsIdempotencyKey, + buildIdempotencyKey +} = require('../../../src/helpers/reconcile-outbox-keys') + +describe('reconcile-outbox-keys', () => { + it('builds stable fog platform keys', () => { + const key = buildFogPlatformIdempotencyKey({ + fogUuid: 'fog-1', + reason: 'spec-changed', + specGeneration: 3 + }) + expect(key).to.equal('fp:fog-1:spec-changed:3') + }) + + it('builds service platform keys with snapshot hash', () => { + const snapshot = { name: 'svc-a', resource: '10m' } + const key = buildServicePlatformIdempotencyKey({ + serviceName: 'svc-a', + reason: 'spec-changed', + specSnapshot: snapshot + }) + const expectedHash = crypto.createHash('sha256').update(JSON.stringify(snapshot)).digest('hex').slice(0, 16) + expect(key).to.equal(`sp:svc-a:spec-changed:${expectedHash}`) + }) + + it('builds nats keys with sorted fog uuids', () => { + const key = buildNatsIdempotencyKey({ + reason: 'cluster-routes-changed', + applicationId: null, + accountRuleId: null, + userRuleId: null, + fogUuids: ['b', 'a'] + }) + expect(key).to.equal('nats:cluster-routes-changed:null:null:null:a,b') + }) + + it('builds nats keys without fog uuids from scope fields', () => { + const key = buildNatsIdempotencyKey({ + reason: 'account-created', + applicationId: 42, + accountRuleId: null, + userRuleId: null + }) + expect(key).to.equal('nats:account-created:42:null:null') + }) + + it('routes buildIdempotencyKey by kind', () => { + expect(buildIdempotencyKey('fog_platform', { fogUuid: 'x', reason: 'delete' })) + 
.to.equal('fp:x:delete:null') + }) +}) From 3993a4d676934d8a8dd0cfc236acdb3412051c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:38:08 +0300 Subject: [PATCH 05/32] Add transaction safety settings and derive component labels from distribution. Configure write queue depth, busy retry, outbox drainer, fog delete staleness, sqlite enterprise warning threshold, and mysql/postgres pool defaults. --- src/config/config.yaml | 21 ++++++++++-- src/config/env-mapping.js | 7 ++++ src/config/flavor.js | 31 +++++++++++++++-- test/src/config/flavor.test.js | 63 ++++++++++++++++++++++++++++++++++ 4 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 test/src/config/flavor.test.js diff --git a/src/config/config.yaml b/src/config/config.yaml index 3ee59f6b..c4b97789 100644 --- a/src/config/config.yaml +++ b/src/config/config.yaml @@ -88,6 +88,7 @@ settings: natsReconcileWorkerIntervalSeconds: 3 # NATS reconcile worker poll interval in seconds (default: 3) fogPlatformReconcileWorkerIntervalSeconds: 3 # Platform reconcile worker poll interval in seconds (default: 3) fogPlatformReconcileTaskStalenessSeconds: 300 # Stale fog platform task reclaim in seconds (default: 300 = 5 minutes) + fogPlatformDeleteReconcileTaskStalenessSeconds: 60 # Stale delete task reclaim in seconds (default: 60) fogPlatformReconcileMaxAttempts: 10 # Permanent fail threshold for fog platform tasks (default: 10) fogPlatformReconcileBackoffBaseSeconds: 5 # Exponential backoff base for fog platform tasks (default: 5) fogPlatformSweepIntervalSeconds: 900 # Platform drift sweep interval in seconds (default: 900 = 15 minutes) @@ -95,12 +96,22 @@ settings: hubRouterConfigLockTimeoutSeconds: 120 # Hub router ConfigMap lock wait in seconds (default: 120) serviceLoadBalancerWatchTimeoutSeconds: 300 # LoadBalancer IP watch timeout in service reconcile (default: 300) jobStartupDelaySeconds: 3 # Delay before reconcile-heavy background jobs start (default: 3) + 
reconcileOutboxDrainerIntervalSeconds: 1 # Outbox drainer poll interval in seconds (default: 1) + reconcileOutboxDrainerBatchSize: 32 # Max outbox rows per drainer batch (default: 32) wsSessionReconcileIntervalSeconds: 60 # WS exec/log stale DB row reconcile interval (R89) + sqliteEnterpriseFogWarningThreshold: 50 # Log a warning and emit a metric when the sqlite fog count exceeds this value (R124) + dbWriteQueueMaxDepth: 256 # Max SQLite write queue depth before an error is logged (R123) + dbBusyRetryMaxAttempts: 8 # SQLITE_BUSY retry attempts per transaction (Plan 19) + dbBusyRetryBaseMs: 25 # Exponential backoff base in milliseconds for busy retry (Plan 19) # Database Configuration database: provider: sqlite # Database provider (sqlite/mysql/postgres) - # mysql: + mysql: + pool: + max: 10 + min: 0 + idle: 20000 # host: "" # MySQL host # port: 3306 # MySQL port # username: "" # MySQL username @@ -108,7 +119,11 @@ database: # databaseName: "" # MySQL database name # useSSL: false # Use SSL for MySQL connection # sslCA: "" # MySQL SSL CA in base64 encoded string - # postgres: + postgres: + pool: + max: 10 + min: 0 + idle: 20000 # host: "" # PostgreSQL host # port: 5432 # PostgreSQL port # username: "" # PostgreSQL username @@ -165,7 +180,7 @@ flavor: distribution: datasance rbacApiVersion: datasance.com/v3 serviceAnnotationTag: service.iofog.org/tag - componentLabelDomain: iofog.org/component + # componentLabelDomain: derived from distribution (datasance.com/component | iofog.org/component) # Bridge Ports Configuration for Services bridgePorts: diff --git a/src/config/env-mapping.js b/src/config/env-mapping.js index 31844774..999a79d2 100644 --- a/src/config/env-mapping.js +++ b/src/config/env-mapping.js @@ -68,6 +68,7 @@ module.exports = { CONTROLLER_CLEANUP_INTERVAL: 'settings.controllerCleanupInterval', FOG_PLATFORM_RECONCILE_WORKER_INTERVAL_SECONDS: 'settings.fogPlatformReconcileWorkerIntervalSeconds', FOG_PLATFORM_RECONCILE_TASK_STALENESS_SECONDS: 'settings.fogPlatformReconcileTaskStalenessSeconds', +
FOG_PLATFORM_DELETE_RECONCILE_TASK_STALENESS_SECONDS: 'settings.fogPlatformDeleteReconcileTaskStalenessSeconds', FOG_PLATFORM_RECONCILE_MAX_ATTEMPTS: 'settings.fogPlatformReconcileMaxAttempts', FOG_PLATFORM_RECONCILE_BACKOFF_BASE_SECONDS: 'settings.fogPlatformReconcileBackoffBaseSeconds', FOG_PLATFORM_SWEEP_INTERVAL_SECONDS: 'settings.fogPlatformSweepIntervalSeconds', @@ -75,6 +76,12 @@ module.exports = { HUB_ROUTER_CONFIG_LOCK_TIMEOUT_SECONDS: 'settings.hubRouterConfigLockTimeoutSeconds', SERVICE_LOAD_BALANCER_WATCH_TIMEOUT_SECONDS: 'settings.serviceLoadBalancerWatchTimeoutSeconds', JOB_STARTUP_DELAY_SECONDS: 'settings.jobStartupDelaySeconds', + RECONCILE_OUTBOX_DRAINER_INTERVAL_SECONDS: 'settings.reconcileOutboxDrainerIntervalSeconds', + RECONCILE_OUTBOX_DRAINER_BATCH_SIZE: 'settings.reconcileOutboxDrainerBatchSize', + SQLITE_ENTERPRISE_FOG_WARNING_THRESHOLD: 'settings.sqliteEnterpriseFogWarningThreshold', + DB_WRITE_QUEUE_MAX_DEPTH: 'settings.dbWriteQueueMaxDepth', + DB_BUSY_RETRY_MAX_ATTEMPTS: 'settings.dbBusyRetryMaxAttempts', + DB_BUSY_RETRY_BASE_MS: 'settings.dbBusyRetryBaseMs', // Database Configuration DB_PROVIDER: 'database.provider', diff --git a/src/config/flavor.js b/src/config/flavor.js index 615473bf..8110ec6e 100644 --- a/src/config/flavor.js +++ b/src/config/flavor.js @@ -6,12 +6,28 @@ const DEFAULT_SERVICE_ANNOTATION_TAG = 'service.iofog.org/tag' const DEFAULT_COMPONENT_LABEL_DOMAIN = 'iofog.org/component' const DEFAULT_APP_LABEL = 'iofog' +const DISTRIBUTION_COMPONENT_LABEL = { + datasance: 'datasance.com/component', + iofog: 'iofog.org/component' +} + function getRbacApiVersion () { return process.env.RBAC_API_VERSION || config.get('flavor.rbacApiVersion', DEFAULT_RBAC_API_VERSION) } +function getConfiguredDistribution () { + if (process.env.CONTROLLER_DISTRIBUTION) { + return process.env.CONTROLLER_DISTRIBUTION + } + const fromConfig = config.get('flavor.distribution') + if (fromConfig != null && fromConfig !== '') { + return fromConfig + } + 
return null +} + function getControllerDistribution () { - return process.env.CONTROLLER_DISTRIBUTION || config.get('flavor.distribution', DEFAULT_CONTROLLER_DISTRIBUTION) + return getConfiguredDistribution() || DEFAULT_CONTROLLER_DISTRIBUTION } function getServiceAnnotationTag () { @@ -19,7 +35,18 @@ function getServiceAnnotationTag () { } function getComponentLabelKey () { - return process.env.COMPONENT_LABEL_DOMAIN || config.get('flavor.componentLabelDomain', DEFAULT_COMPONENT_LABEL_DOMAIN) + if (process.env.COMPONENT_LABEL_DOMAIN) { + return process.env.COMPONENT_LABEL_DOMAIN + } + const fromConfig = config.get('flavor.componentLabelDomain') + if (fromConfig != null && fromConfig !== '') { + return fromConfig + } + const distribution = getConfiguredDistribution() + if (distribution != null && Object.prototype.hasOwnProperty.call(DISTRIBUTION_COMPONENT_LABEL, distribution)) { + return DISTRIBUTION_COMPONENT_LABEL[distribution] + } + return DEFAULT_COMPONENT_LABEL_DOMAIN } function getAppLabelKey () { diff --git a/test/src/config/flavor.test.js b/test/src/config/flavor.test.js new file mode 100644 index 00000000..9df1f576 --- /dev/null +++ b/test/src/config/flavor.test.js @@ -0,0 +1,63 @@ +'use strict' + +const { expect } = require('chai') +const sinon = require('sinon') + +const config = require('../../../src/config') +const flavor = require('../../../src/config/flavor') + +describe('flavor', () => { + def('sandbox', () => sinon.createSandbox()) + + afterEach(() => { + delete process.env.CONTROLLER_DISTRIBUTION + delete process.env.COMPONENT_LABEL_DOMAIN + $sandbox.restore() + }) + + function stubFlavorConfig ({ distribution, componentLabelDomain } = {}) { + $sandbox.stub(config, 'get').callsFake((key) => { + if (key === 'flavor.distribution') { + return distribution + } + if (key === 'flavor.componentLabelDomain') { + return componentLabelDomain + } + return undefined + }) + } + + describe('.getComponentLabelKey()', () => { + it('returns datasance.com/component when distribution is datasance', ()
=> { + stubFlavorConfig({ distribution: 'datasance' }) + expect(flavor.getComponentLabelKey()).to.equal('datasance.com/component') + }) + + it('returns iofog.org/component when distribution is iofog', () => { + stubFlavorConfig({ distribution: 'iofog' }) + expect(flavor.getComponentLabelKey()).to.equal('iofog.org/component') + }) + + it('returns iofog.org/component when distribution is unset', () => { + stubFlavorConfig() + expect(flavor.getComponentLabelKey()).to.equal('iofog.org/component') + }) + + it('prefers CONTROLLER_DISTRIBUTION env over config', () => { + stubFlavorConfig({ distribution: 'datasance' }) + process.env.CONTROLLER_DISTRIBUTION = 'iofog' + expect(flavor.getComponentLabelKey()).to.equal('iofog.org/component') + }) + + it('prefers COMPONENT_LABEL_DOMAIN env over distribution', () => { + stubFlavorConfig({ distribution: 'datasance' }) + process.env.COMPONENT_LABEL_DOMAIN = 'custom.example/component' + expect(flavor.getComponentLabelKey()).to.equal('custom.example/component') + }) + + it('prefers flavor.componentLabelDomain config over distribution', () => { + stubFlavorConfig({ distribution: 'datasance', componentLabelDomain: 'override.example/component' }) + expect(flavor.getComponentLabelKey()).to.equal('override.example/component') + }) + }) +}) From e3b113a27ef7093c4b1a4da8ccd7ebb582504b7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:38:36 +0300 Subject: [PATCH 06/32] Route data managers and OIDC adapter through the transaction runner. Remove bypassQueue paths, fix volume mount transaction option passing, and ensure manager writes use real Sequelize transactions. 
--- src/data/adapters/oidc-provider-adapter.js | 167 ++++++++------- src/data/managers/certificate-manager.js | 190 ++++++------------ src/data/managers/config-map-manager.js | 18 +- src/data/managers/event-manager.js | 8 +- src/data/managers/fog-used-token-manager.js | 64 ++---- .../hub-router-config-lock-manager.js | 13 -- src/data/managers/iofog-manager.js | 6 +- src/data/managers/iofog-public-key-manager.js | 75 ++----- src/data/managers/nats-connection-manager.js | 5 +- src/data/managers/nats-instance-manager.js | 5 +- .../managers/rbac-cache-version-manager.js | 22 +- src/data/managers/rbac-role-manager.js | 22 +- .../managers/rbac-service-account-manager.js | 20 +- src/data/managers/registry-manager.js | 11 +- src/data/managers/secret-manager.js | 17 +- src/data/managers/volume-mapping-manager.js | 5 +- src/data/managers/volume-mounting-manager.js | 20 +- 17 files changed, 269 insertions(+), 399 deletions(-) diff --git a/src/data/adapters/oidc-provider-adapter.js b/src/data/adapters/oidc-provider-adapter.js index e2a28464..9cc0358d 100644 --- a/src/data/adapters/oidc-provider-adapter.js +++ b/src/data/adapters/oidc-provider-adapter.js @@ -1,6 +1,7 @@ 'use strict' const { Op } = require('sequelize') +const { runInTransaction, PRIORITY_INTERACTIVE, PRIORITY_BACKGROUND } = require('../../helpers/transaction-runner') function isExpired (expiresAt) { return expiresAt && expiresAt <= new Date() @@ -24,97 +25,123 @@ class OidcProviderAdapter { const AuthOidcProviderState = this.getStateModel() const expiresAt = expiresIn ? 
new Date(Date.now() + expiresIn * 1000) : null - await AuthOidcProviderState.upsert({ - model: this.name, - recordId: id, - payload: JSON.stringify(payload), - expiresAt, - grantId: payload.grantId || null, - uid: payload.uid || null, - userCode: payload.userCode || null, - consumed: false, - consumedAt: null - }) + await runInTransaction(async (transaction) => { + await AuthOidcProviderState.upsert({ + model: this.name, + recordId: id, + payload: JSON.stringify(payload), + expiresAt, + grantId: payload.grantId || null, + uid: payload.uid || null, + userCode: payload.userCode || null, + consumed: false, + consumedAt: null + }, { transaction, conflictFields: ['model', 'record_id'] }) + }, { priority: PRIORITY_INTERACTIVE, label: 'oidc.adapter.upsert' }) } async find (id) { const AuthOidcProviderState = this.getStateModel() - const row = await AuthOidcProviderState.findOne({ - where: { - model: this.name, - recordId: id - } - }) - if (!row || isExpired(row.expiresAt)) { - return undefined - } + return runInTransaction(async (transaction) => { + const row = await AuthOidcProviderState.findOne({ + where: { + model: this.name, + recordId: id + }, + transaction + }) + + if (!row || isExpired(row.expiresAt)) { + return undefined + } - return rowToPayload(row) + return rowToPayload(row) + }, { priority: PRIORITY_INTERACTIVE, label: 'oidc.adapter.find' }) } async findByUserCode (userCode) { const AuthOidcProviderState = this.getStateModel() - const row = await AuthOidcProviderState.findOne({ - where: { - model: this.name, - userCode - } - }) - if (!row || isExpired(row.expiresAt)) { - return undefined - } + return runInTransaction(async (transaction) => { + const row = await AuthOidcProviderState.findOne({ + where: { + model: this.name, + userCode + }, + transaction + }) + + if (!row || isExpired(row.expiresAt)) { + return undefined + } - return rowToPayload(row) + return rowToPayload(row) + }, { priority: PRIORITY_INTERACTIVE, label: 'oidc.adapter.findByUserCode' }) } 
async findByUid (uid) { const AuthOidcProviderState = this.getStateModel() - const row = await AuthOidcProviderState.findOne({ - where: { - model: this.name, - uid - } - }) - if (!row || isExpired(row.expiresAt)) { - return undefined - } + return runInTransaction(async (transaction) => { + const row = await AuthOidcProviderState.findOne({ + where: { + model: this.name, + uid + }, + transaction + }) + + if (!row || isExpired(row.expiresAt)) { + return undefined + } - return rowToPayload(row) + return rowToPayload(row) + }, { priority: PRIORITY_INTERACTIVE, label: 'oidc.adapter.findByUid' }) } async consume (id) { const AuthOidcProviderState = this.getStateModel() - await AuthOidcProviderState.update({ - consumed: true, - consumedAt: new Date() - }, { - where: { - model: this.name, - recordId: id - } - }) + + await runInTransaction(async (transaction) => { + await AuthOidcProviderState.update({ + consumed: true, + consumedAt: new Date() + }, { + where: { + model: this.name, + recordId: id + }, + transaction + }) + }, { priority: PRIORITY_INTERACTIVE, label: 'oidc.adapter.consume' }) } async destroy (id) { const AuthOidcProviderState = this.getStateModel() - await AuthOidcProviderState.destroy({ - where: { - model: this.name, - recordId: id - } - }) + + await runInTransaction(async (transaction) => { + await AuthOidcProviderState.destroy({ + where: { + model: this.name, + recordId: id + }, + transaction + }) + }, { priority: PRIORITY_INTERACTIVE, label: 'oidc.adapter.destroy' }) } async revokeByGrantId (grantId) { const AuthOidcProviderState = this.getStateModel() - await AuthOidcProviderState.destroy({ - where: { - grantId - } - }) + + await runInTransaction(async (transaction) => { + await AuthOidcProviderState.destroy({ + where: { + grantId + }, + transaction + }) + }, { priority: PRIORITY_INTERACTIVE, label: 'oidc.adapter.revokeByGrantId' }) } } @@ -124,13 +151,17 @@ function createOidcProviderAdapterFactory (getStateModel) { async function 
purgeExpiredOidcProviderStates (getStateModel) { const AuthOidcProviderState = getStateModel() - await AuthOidcProviderState.destroy({ - where: { - expiresAt: { - [Op.lte]: new Date() - } - } - }) + + await runInTransaction(async (transaction) => { + await AuthOidcProviderState.destroy({ + where: { + expiresAt: { + [Op.lte]: new Date() + } + }, + transaction + }) + }, { priority: PRIORITY_BACKGROUND, label: 'oidc.adapter.purgeExpired' }) } module.exports = { diff --git a/src/data/managers/certificate-manager.js b/src/data/managers/certificate-manager.js index f55bf126..933a3c31 100644 --- a/src/data/managers/certificate-manager.js +++ b/src/data/managers/certificate-manager.js @@ -11,11 +11,9 @@ class CertificateManager extends BaseManager { } async createCertificateRecord (certData, transaction) { - // First find the secret by name to get its ID const secret = await SecretManager.findOne({ name: certData.name }, transaction) if (secret) { - // Link the certificate to the secret certData.secretId = secret.id } @@ -25,17 +23,11 @@ class CertificateManager extends BaseManager { async findCertificatesByCA (caId, transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? { - where: { signedById: caId }, - include: ['secret'] - } - : { - where: { signedById: caId }, - include: ['secret'], - transaction - } - return this.getEntity().findAll(options) + return this.getEntity().findAll({ + where: { signedById: caId }, + include: ['secret'], + transaction + }) } async findExpiringCertificates (days = 30, transaction) { @@ -44,65 +36,42 @@ class CertificateManager extends BaseManager { const expirationDate = new Date() expirationDate.setDate(expirationDate.getDate() + days) - const options = transaction.fakeTransaction - ? 
{ - where: { - validTo: { [Op.lt]: expirationDate } - }, - include: ['signingCA'] - } - : { - where: { - validTo: { [Op.lt]: expirationDate } - }, - include: ['signingCA'], - transaction - } - return this.getEntity().findAll(options) + return this.getEntity().findAll({ + where: { + validTo: { [Op.lt]: expirationDate } + }, + include: ['signingCA'], + transaction + }) } async findCertificateByName (name, transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? { - where: { name }, - include: ['signingCA', 'secret'] - } - : { - where: { name }, - include: ['signingCA', 'secret'], - transaction - } - return this.getEntity().findOne(options) + return this.getEntity().findOne({ + where: { name }, + include: ['signingCA', 'secret'], + transaction + }) } async findAllCAs (transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? { - where: { isCA: true }, - include: ['secret'] - } - : { - where: { isCA: true }, - include: ['secret'], - transaction - } - return this.getEntity().findAll(options) + return this.getEntity().findAll({ + where: { isCA: true }, + include: ['secret'], + transaction + }) } async findAllCertificates (transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? { include: ['signingCA', 'secret'] } - : { - include: ['signingCA', 'secret'], - transaction - } - return this.getEntity().findAll(options) + return this.getEntity().findAll({ + include: ['signingCA', 'secret'], + transaction + }) } async deleteCertificate (name, transaction) { @@ -112,20 +81,15 @@ class CertificateManager extends BaseManager { async updateCertificate (id, updates, transaction) { AppHelper.checkTransaction(transaction) - // Find existing certificate - const options = transaction.fakeTransaction - ? 
{ where: { id } } - : { - where: { id }, - transaction - } - const cert = await this.getEntity().findOne(options) + const cert = await this.getEntity().findOne({ + where: { id }, + transaction + }) if (!cert) { throw new Error(`Certificate with id ${id} not found`) } - // Update certificate return this.update({ id }, updates, transaction) } @@ -134,38 +98,24 @@ class CertificateManager extends BaseManager { const currentDate = new Date() - const options = transaction.fakeTransaction - ? { - where: { - validTo: { [Op.lt]: currentDate } - }, - include: ['signingCA', 'secret'] - } - : { - where: { - validTo: { [Op.lt]: currentDate } - }, - include: ['signingCA', 'secret'], - transaction - } - return this.getEntity().findAll(options) + return this.getEntity().findAll({ + where: { + validTo: { [Op.lt]: currentDate } + }, + include: ['signingCA', 'secret'], + transaction + }) } async getCertificateChain (certId, transaction) { AppHelper.checkTransaction(transaction) const chain = [] - const options = transaction.fakeTransaction - ? { - where: { id: certId }, - include: ['signingCA', 'secret'] - } - : { - where: { id: certId }, - include: ['signingCA', 'secret'], - transaction - } - let currentCert = await this.getEntity().findOne(options) + let currentCert = await this.getEntity().findOne({ + where: { id: certId }, + include: ['signingCA', 'secret'], + transaction + }) if (!currentCert) { return chain @@ -173,12 +123,12 @@ class CertificateManager extends BaseManager { chain.push(currentCert) - // Traverse up the chain of signing CAs while (currentCert.signingCA) { - const parentOptions = transaction.fakeTransaction - ? 
{ where: { id: currentCert.signedById }, include: ['signingCA', 'secret'] } - : { where: { id: currentCert.signedById }, include: ['signingCA', 'secret'], transaction } - currentCert = await this.getEntity().findOne(parentOptions) + currentCert = await this.getEntity().findOne({ + where: { id: currentCert.signedById }, + include: ['signingCA', 'secret'], + transaction + }) if (currentCert) { chain.push(currentCert) @@ -193,48 +143,30 @@ class CertificateManager extends BaseManager { async findCertificatesForRenewal (days = 30, transaction) { AppHelper.checkTransaction(transaction) - // Calculate the date range - we want certificates that expire between now and (now + days) const now = new Date() const futureDate = new Date() futureDate.setDate(futureDate.getDate() + days) - const options = transaction.fakeTransaction - ? { - where: { - validTo: { - [Op.gt]: now, - [Op.lt]: futureDate - } - }, - include: ['signingCA', 'secret'] - } - : { - where: { - validTo: { - [Op.gt]: now, - [Op.lt]: futureDate - } - }, - include: ['signingCA', 'secret'], - transaction + return this.getEntity().findAll({ + where: { + validTo: { + [Op.gt]: now, + [Op.lt]: futureDate } - return this.getEntity().findAll(options) + }, + include: ['signingCA', 'secret'], + transaction + }) } async getCertificateChildren (caId, transaction) { AppHelper.checkTransaction(transaction) - const options = transaction.fakeTransaction - ? 
{ - where: { signedById: caId }, - include: ['secret'] - } - : { - where: { signedById: caId }, - include: ['secret'], - transaction - } - return this.getEntity().findAll(options) + return this.getEntity().findAll({ + where: { signedById: caId }, + include: ['secret'], + transaction + }) } } diff --git a/src/data/managers/config-map-manager.js b/src/data/managers/config-map-manager.js index 0b61b396..a96bbeb6 100644 --- a/src/data/managers/config-map-manager.js +++ b/src/data/managers/config-map-manager.js @@ -1,5 +1,6 @@ const BaseManager = require('./base-manager') -const SecretHelper = require('../../helpers/secret-helper') +const vaultManager = require('../../vault/vault-manager') +const { scheduleVaultDeleteAfterCommit } = require('../../helpers/vault-transaction-helper') const models = require('../models') const ConfigMap = models.ConfigMap @@ -31,8 +32,7 @@ class ConfigMapManager extends BaseManager { existing.useVault = useVault !== null ? useVault : existing.useVault // Save the instance - this triggers beforeSave hook which handles encryption/vault - const options = transaction.fakeTransaction ? 
{} : { transaction } - await existing.save(options) + await existing.save({ transaction }) return existing } @@ -61,16 +61,12 @@ class ConfigMapManager extends BaseManager { } async deleteConfigMap (name, transaction) { - // Get ConfigMap to check if it's in vault const configMap = await this.findOne({ name }, transaction) - if (configMap && configMap.useVault) { - // Delete from vault if it was stored there - const vaultManager = require('../../vault/vault-manager') - if (vaultManager.isEnabled()) { - await SecretHelper.deleteSecret(name, 'configmap') - } + const useVault = configMap && configMap.useVault + await this.delete({ name }, transaction) + if (useVault && vaultManager.isEnabled()) { + scheduleVaultDeleteAfterCommit(transaction, name, 'configmap') } - return this.delete({ name }, transaction) } } diff --git a/src/data/managers/event-manager.js b/src/data/managers/event-manager.js index 302c0b9c..3c2abd64 100644 --- a/src/data/managers/event-manager.js +++ b/src/data/managers/event-manager.js @@ -85,9 +85,7 @@ class EventManager extends BaseManager { offset: Number(offset) // Ensure it's a number } - if (!transaction.fakeTransaction) { - options.transaction = transaction - } + options.transaction = transaction const { count, rows } = await Event.findAndCountAll(options) @@ -126,9 +124,7 @@ class EventManager extends BaseManager { } // If days = 0, where clause is empty, so all events will be deleted - if (!transaction.fakeTransaction) { - options.transaction = transaction - } + options.transaction = transaction const deletedCount = await Event.destroy(options) return deletedCount diff --git a/src/data/managers/fog-used-token-manager.js b/src/data/managers/fog-used-token-manager.js index aa534227..65961669 100644 --- a/src/data/managers/fog-used-token-manager.js +++ b/src/data/managers/fog-used-token-manager.js @@ -1,19 +1,13 @@ const models = require('../models') const logger = require('../../logger') +const AppHelper = require('../../helpers/app-helper') 
const { Op } = require('sequelize') class FogUsedTokenManager { - /** - * Store a JTI (JWT ID) to mark it as used - * @param {string} jti - The JWT ID - * @param {string} fogUuid - The UUID of the fog node - * @param {number} exp - The expiration timestamp - * @param {Object} transaction - Sequelize transaction - * @returns {Promise} - */ static async storeJti (jti, fogUuid, exp, transaction) { + AppHelper.checkTransaction(transaction) + try { - // Input validation if (!jti || typeof jti !== 'string') { throw new Error('JTI must be a non-empty string') } @@ -21,27 +15,17 @@ class FogUsedTokenManager { throw new Error('Fog UUID must be a non-empty string') } - // Ensure exp is a valid integer (Unix timestamp) const expiryTime = parseInt(exp, 10) if (isNaN(expiryTime) || expiryTime <= 0) { throw new Error('Expiration timestamp must be a positive integer') } - // Prepare the data object - const tokenData = { + await models.FogUsedToken.create({ jti, iofogUuid: fogUuid, expiryTime - } - - // Create the record with or without transaction - if (!transaction || transaction.fakeTransaction) { - await models.FogUsedToken.create(tokenData) - } else { - await models.FogUsedToken.create(tokenData, { transaction }) - } + }, { transaction }) } catch (error) { - // Check if it's a duplicate JTI error if (error.name === 'SequelizeUniqueConstraintError' && error.fields && error.fields.jti) { logger.warn(`JTI already exists: ${jti}`) throw new Error('JWT token already used') @@ -52,27 +36,14 @@ class FogUsedTokenManager { } } - /** - * Check if a JTI has already been used - * @param {string} jti - The JWT ID to check - * @param {Object} transaction - Sequelize transaction - * @returns {Promise} True if the JTI has been used, false otherwise - */ static async isJtiUsed (jti, transaction) { + AppHelper.checkTransaction(transaction) + try { - let token - if (!transaction || transaction.fakeTransaction) { - // If no transaction or fake transaction, query without transaction - token = 
await models.FogUsedToken.findOne({ - where: { jti } - }) - } else { - // Use the provided transaction - token = await models.FogUsedToken.findOne({ - where: { jti }, - transaction - }) - } + const token = await models.FogUsedToken.findOne({ + where: { jti }, + transaction + }) return !!token } catch (error) { logger.error(`Failed to check JTI: ${error.message}`) @@ -80,19 +51,18 @@ class FogUsedTokenManager { } } - /** - * Clean up expired JTIs - * @returns {Promise} Number of deleted tokens - */ - static async cleanupExpiredJtis () { + static async cleanupExpiredJtis (transaction) { + AppHelper.checkTransaction(transaction) + try { - const now = Math.floor(Date.now() / 1000) // Convert to Unix timestamp (seconds) + const now = Math.floor(Date.now() / 1000) const result = await models.FogUsedToken.destroy({ where: { expiryTime: { [Op.lt]: now } - } + }, + transaction }) logger.debug(`Cleaned up ${result} expired JTIs`) return result diff --git a/src/data/managers/hub-router-config-lock-manager.js b/src/data/managers/hub-router-config-lock-manager.js index 6b7d64ea..687fcff0 100644 --- a/src/data/managers/hub-router-config-lock-manager.js +++ b/src/data/managers/hub-router-config-lock-manager.js @@ -1,6 +1,5 @@ const BaseManager = require('./base-manager') const models = require('../models') -const databaseProvider = require('../providers/database-factory') const config = require('../../config') const LOCK_ROW_ID = 1 @@ -10,12 +9,6 @@ class HubRouterConfigLockManager extends BaseManager { return models.HubRouterConfigLock } - _getModelOptions (transaction) { - return transaction && transaction.fakeTransaction - ? 
{} - : { transaction } - } - _isUniqueConstraintError (error) { return error && error.name === 'SequelizeUniqueConstraintError' } @@ -41,12 +34,6 @@ class HubRouterConfigLockManager extends BaseManager { } async tryAcquire (controllerUuid, timeoutSeconds, transaction) { - if (transaction.fakeTransaction) { - return databaseProvider.sequelize.transaction((t) => - this.tryAcquire(controllerUuid, timeoutSeconds, t) - ) - } - await this.initializeLock(transaction) const stalenessSeconds = this._getStalenessSeconds(timeoutSeconds) diff --git a/src/data/managers/iofog-manager.js b/src/data/managers/iofog-manager.js index 988a0a78..f328640f 100644 --- a/src/data/managers/iofog-manager.js +++ b/src/data/managers/iofog-manager.js @@ -62,14 +62,14 @@ class FogManager extends BaseManager { }) } - // no transaction required here, used by agent-last-active decorator - updateLastActive (uuid, timestamp) { + updateLastActive (uuid, timestamp, transaction) { return Fog.update({ lastActive: timestamp }, { where: { uuid - } + }, + transaction }) } } diff --git a/src/data/managers/iofog-public-key-manager.js b/src/data/managers/iofog-public-key-manager.js index 7e88a80e..ae5069a9 100644 --- a/src/data/managers/iofog-public-key-manager.js +++ b/src/data/managers/iofog-public-key-manager.js @@ -7,71 +7,40 @@ class FogPublicKeyManager extends BaseManager { return FogPublicKey } - // Find public key by fog UUID findByFogUuid (fogUuid, transaction) { - const options = transaction.fakeTransaction - ? { - where: { - iofogUuid: fogUuid - } - } - : { - where: { - iofogUuid: fogUuid - }, - transaction - } - - return FogPublicKey.findOne(options) + return FogPublicKey.findOne({ + where: { + iofogUuid: fogUuid + }, + transaction + }) } - // Update or create public key for a fog updateOrCreate (fogUuid, publicKey, transaction) { - const options = transaction.fakeTransaction - ? 
{ - where: { - iofogUuid: fogUuid - } - } - : { + return FogPublicKey.findOne({ + where: { + iofogUuid: fogUuid + }, + transaction + }).then((existingKey) => { + if (existingKey) { + return FogPublicKey.update({ + publicKey + }, { where: { iofogUuid: fogUuid }, transaction - } - - return FogPublicKey.findOne(options).then((existingKey) => { - if (existingKey) { - const updateOptions = transaction.fakeTransaction - ? { - where: { - iofogUuid: fogUuid - } - } - : { - where: { - iofogUuid: fogUuid - }, - transaction - } - - return FogPublicKey.update({ - publicKey - }, updateOptions) - } else { - const createOptions = transaction.fakeTransaction - ? {} - : { transaction } - - return FogPublicKey.create({ - iofogUuid: fogUuid, - publicKey - }, createOptions) + }) } + + return FogPublicKey.create({ + iofogUuid: fogUuid, + publicKey + }, { transaction }) }) } - // Delete public key by fog UUID deleteByFogUuid (fogUuid, transaction) { return this.delete({ iofogUuid: fogUuid }, transaction) } diff --git a/src/data/managers/nats-connection-manager.js b/src/data/managers/nats-connection-manager.js index 94d6fa46..83346863 100644 --- a/src/data/managers/nats-connection-manager.js +++ b/src/data/managers/nats-connection-manager.js @@ -22,8 +22,9 @@ class NatsConnectionManager extends BaseManager { required: true } ], - where - }, { transaction }) + where, + transaction + }) } } diff --git a/src/data/managers/nats-instance-manager.js b/src/data/managers/nats-instance-manager.js index 2703185e..29eabb5b 100644 --- a/src/data/managers/nats-instance-manager.js +++ b/src/data/managers/nats-instance-manager.js @@ -8,7 +8,10 @@ class NatsInstanceManager extends BaseManager { } findByFog (iofogUuid, transaction) { - return NatsInstance.findOne({ where: { iofogUuid } }, { transaction }) + return NatsInstance.findOne({ + where: { iofogUuid }, + transaction + }) } } diff --git a/src/data/managers/rbac-cache-version-manager.js b/src/data/managers/rbac-cache-version-manager.js index 
3ad6013e..b92fbd07 100644 --- a/src/data/managers/rbac-cache-version-manager.js +++ b/src/data/managers/rbac-cache-version-manager.js @@ -8,11 +8,6 @@ class RbacCacheVersionManager extends BaseManager { return RbacCacheVersion } - /** - * Get current cache version - * @param {Object} transaction - Database transaction - * @returns {Promise} Current version number - */ async getVersion (transaction) { const cacheVersion = await this.findOne({ id: 1 }, transaction) if (!cacheVersion) { @@ -22,9 +17,7 @@ class RbacCacheVersionManager extends BaseManager { } _getModelOptions (transaction) { - return transaction && transaction.fakeTransaction - ? {} - : { transaction } + return { transaction } } _extractAffectedRows (updateResult) { @@ -79,12 +72,6 @@ class RbacCacheVersionManager extends BaseManager { } } - /** - * Increment cache version - * This should be called whenever any RBAC resource (Role, RoleBinding, ServiceAccount) is modified - * @param {Object} transaction - Database transaction - * @returns {Promise} - */ async incrementVersion (transaction) { try { const updateResult = await this._incrementVersionAtomic(transaction) @@ -113,16 +100,9 @@ class RbacCacheVersionManager extends BaseManager { } } - /** - * Initialize cache version row if it doesn't exist - * This is called on server startup to ensure the row exists - * @param {Object} transaction - Database transaction (optional) - * @returns {Promise} - */ async initializeVersion (transaction) { const cacheVersion = await this.findOne({ id: 1 }, transaction) if (!cacheVersion) { - // Create initial version row await this.create({ id: 1, version: 1 }, transaction) } } diff --git a/src/data/managers/rbac-role-manager.js b/src/data/managers/rbac-role-manager.js index 69b16e47..1ed8dc01 100644 --- a/src/data/managers/rbac-role-manager.js +++ b/src/data/managers/rbac-role-manager.js @@ -104,8 +104,7 @@ class RbacRoleManager extends BaseManager { resourceNames: rule.resourceNames || null })) - const 
bulkCreateOptions = transaction.fakeTransaction ? {} : { transaction } - await RbacRoleRule.bulkCreate(rules, bulkCreateOptions) + await RbacRoleRule.bulkCreate(rules, { transaction }) } // Increment cache version to invalidate caches on all instances @@ -145,10 +144,10 @@ class RbacRoleManager extends BaseManager { // Update rules if provided if (roleData.rules && Array.isArray(roleData.rules)) { // Delete existing rules - const destroyOptions = transaction.fakeTransaction - ? { where: { roleId: role.id } } - : { where: { roleId: role.id }, transaction } - await RbacRoleRule.destroy(destroyOptions) + await RbacRoleRule.destroy({ + where: { roleId: role.id }, + transaction + }) // Create new rules const rules = roleData.rules.map(rule => ({ @@ -159,8 +158,7 @@ class RbacRoleManager extends BaseManager { resourceNames: rule.resourceNames || null })) - const bulkCreateOptions = transaction.fakeTransaction ? {} : { transaction } - await RbacRoleRule.bulkCreate(rules, bulkCreateOptions) + await RbacRoleRule.bulkCreate(rules, { transaction }) } // Increment cache version to invalidate caches on all instances @@ -215,10 +213,10 @@ class RbacRoleManager extends BaseManager { return null } - const findAllOptions = transaction.fakeTransaction - ? { where: { roleId: role.id } } - : { where: { roleId: role.id }, transaction } - const rules = await RbacRoleRule.findAll(findAllOptions) + const rules = await RbacRoleRule.findAll({ + where: { roleId: role.id }, + transaction + }) return { id: role.id, diff --git a/src/data/managers/rbac-service-account-manager.js b/src/data/managers/rbac-service-account-manager.js index 5adc6475..0c2b4571 100644 --- a/src/data/managers/rbac-service-account-manager.js +++ b/src/data/managers/rbac-service-account-manager.js @@ -54,10 +54,11 @@ class RbacServiceAccountManager extends BaseManager { if (!application) { throw new Errors.NotFoundError(`Application '${appName}' not found`) } - const options = transaction.fakeTransaction - ? 
{ where: { applicationId: application.id, name }, include: serviceAccountIncludeApplication } - : { where: { applicationId: application.id, name }, include: serviceAccountIncludeApplication, transaction } - const sa = await RbacServiceAccount.findOne(options) + const sa = await RbacServiceAccount.findOne({ + where: { applicationId: application.id, name }, + include: serviceAccountIncludeApplication, + transaction + }) return sa ? mapToResponse(sa) : null } @@ -168,7 +169,7 @@ class RbacServiceAccountManager extends BaseManager { const updated = await RbacServiceAccount.findByPk(sa.id, { include: serviceAccountIncludeApplication, - transaction: transaction.fakeTransaction ? undefined : transaction + transaction }) return mapToResponse(updated) } @@ -216,10 +217,11 @@ class RbacServiceAccountManager extends BaseManager { } where.applicationId = application.id } - const findOptions = transaction.fakeTransaction - ? { where, include: serviceAccountIncludeApplication } - : { where, include: serviceAccountIncludeApplication, transaction } - const list = await RbacServiceAccount.findAll(findOptions) + const list = await RbacServiceAccount.findAll({ + where, + include: serviceAccountIncludeApplication, + transaction + }) return list.map(sa => mapToResponse(sa)) } } diff --git a/src/data/managers/registry-manager.js b/src/data/managers/registry-manager.js index 4bd066fd..3eb490d0 100644 --- a/src/data/managers/registry-manager.js +++ b/src/data/managers/registry-manager.js @@ -1,6 +1,6 @@ const BaseManager = require('./base-manager') -const SecretHelper = require('../../helpers/secret-helper') const vaultManager = require('../../vault/vault-manager') +const { scheduleVaultDeleteAfterCommit } = require('../../helpers/vault-transaction-helper') const models = require('../models') const Registry = models.Registry @@ -11,14 +11,11 @@ class RegistryManager extends BaseManager { async delete (data, transaction) { const registry = await this.findOne(data || {}, transaction) + 
const result = await super.delete(data, transaction) if (registry && vaultManager.isEnabled()) { - try { - await SecretHelper.deleteSecret('registry-' + registry.id, 'registry') - } catch (err) { - // Ignore 404 or other errors (e.g. password was never stored in vault) - } + scheduleVaultDeleteAfterCommit(transaction, 'registry-' + registry.id, 'registry') } - return super.delete(data, transaction) + return result } } diff --git a/src/data/managers/secret-manager.js b/src/data/managers/secret-manager.js index b75ca337..0253a20f 100644 --- a/src/data/managers/secret-manager.js +++ b/src/data/managers/secret-manager.js @@ -1,5 +1,4 @@ const BaseManager = require('./base-manager') -const SecretHelper = require('../../helpers/secret-helper') const models = require('../models') const Secret = models.Secret @@ -18,12 +17,16 @@ class SecretManager extends BaseManager { } async updateSecret (name, type, data, transaction) { - const encryptedData = await SecretHelper.encryptSecret(data, name, type) - return this.update( - { name }, - { type, data: encryptedData }, - transaction - ) + const existing = await this.findOne({ name }, transaction) + if (!existing) { + throw new Error(`Secret ${name} not found`) + } + + existing.type = type + existing.data = data + await existing.save({ transaction }) + + return existing } async getSecret (name, transaction) { diff --git a/src/data/managers/volume-mapping-manager.js b/src/data/managers/volume-mapping-manager.js index 7cc1d8bb..a4c40f27 100644 --- a/src/data/managers/volume-mapping-manager.js +++ b/src/data/managers/volume-mapping-manager.js @@ -10,8 +10,9 @@ class VolumeMappingManager extends BaseManager { findAll (where, transaction) { return VolumeMapping.findAll({ where, - attributes: ['hostDestination', 'containerDestination', 'accessMode', 'id', 'type'] - }, { transaction }) + attributes: ['hostDestination', 'containerDestination', 'accessMode', 'id', 'type'], + transaction + }) } } diff --git 
a/src/data/managers/volume-mounting-manager.js b/src/data/managers/volume-mounting-manager.js index 8c1b5487..63c71859 100644 --- a/src/data/managers/volume-mounting-manager.js +++ b/src/data/managers/volume-mounting-manager.js @@ -19,29 +19,33 @@ class VolumeMountingManager extends BaseManager { getAll (where, transaction) { return VolumeMount.findAll({ where, - attributes: ['uuid', 'name', 'configMapName', 'secretName'] - }, { transaction }) + attributes: ['uuid', 'name', 'configMapName', 'secretName'], + transaction + }) } getOne (where, transaction) { return VolumeMount.findOne({ where, - attributes: ['uuid', 'name', 'configMapName', 'secretName', 'version'] - }, { transaction }) + attributes: ['uuid', 'name', 'configMapName', 'secretName', 'version'], + transaction + }) } findOne (where, transaction) { return VolumeMount.findOne({ where, - attributes: ['uuid', 'name', 'configMapName', 'secretName', 'version'] - }, { transaction }) + attributes: ['uuid', 'name', 'configMapName', 'secretName', 'version'], + transaction + }) } findAll (where, transaction) { return VolumeMount.findAll({ where, - attributes: ['uuid', 'name', 'configMapName', 'secretName', 'version'] - }, { transaction }) + attributes: ['uuid', 'name', 'configMapName', 'secretName', 'version'], + transaction + }) } } From 9895b1e3d7dcdb9c432ae73b0e861883fdfacb94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:40:25 +0300 Subject: [PATCH 07/32] Split fog, service, and NATS reconcile into phased transactions. Defer K8s and Vault I/O until after commit, route enqueue through ReconcileOutbox, unwrap internal iofog helpers, and remove unused services-service TCP bridge K8s paths. 
--- src/helpers/secret-helper.js | 55 +- src/helpers/template-helper.js | 6 +- src/services/certificate-service.js | 30 +- src/services/config-map-service.js | 1 - src/services/fog-platform-service.js | 351 ++++++++----- src/services/iofog-service.js | 60 ++- src/services/nats-api-service.js | 34 +- src/services/nats-auth-service.js | 96 ++-- src/services/nats-service.js | 390 +++++++++++--- src/services/registry-service.js | 24 +- src/services/router-service.js | 5 + src/services/secret-service.js | 22 +- src/services/service-bridge-config.js | 9 +- src/services/service-platform-service.js | 414 ++++++++++----- src/services/services-service.js | 632 +++-------------------- src/utils/cert.js | 66 ++- 16 files changed, 1123 insertions(+), 1072 deletions(-) diff --git a/src/helpers/secret-helper.js b/src/helpers/secret-helper.js index 1eca13dc..59fd4538 100644 --- a/src/helpers/secret-helper.js +++ b/src/helpers/secret-helper.js @@ -21,39 +21,22 @@ class SecretHelper { * @param {boolean} useVault - For ConfigMaps: whether to use vault (optional, defaults to true if vault enabled) * @returns {Promise} - Returns encrypted data or vault reference */ - async encryptSecret (secretData, secretName, secretType = null, useVault = null) { + _shouldUseVault (secretType, useVault) { const isConfigMap = secretType === 'configmap' - // Determine if vault should be used - let shouldUseVault = false - if (isConfigMap) { - // For ConfigMaps, check the useVault parameter if (useVault === false) { - // Explicitly disabled - use internal encryption - shouldUseVault = false - } else if (useVault === true || useVault === null) { - // Explicitly enabled or default (null) - use vault if enabled - shouldUseVault = vaultManager.isEnabled() + return false } - } else { - // For non-ConfigMaps (Secrets, Agent Auth Keys), always use vault if enabled - shouldUseVault = vaultManager.isEnabled() - } - - // If vault should be used, store in vault - if (shouldUseVault) { - try { - const vaultPath = 
await vaultManager.store(secretName, secretType, secretData) - // Return vault reference that will be stored in database - return `${this.VAULT_REF_PREFIX}${vaultPath}` - } catch (error) { - logger.error(`Failed to store secret in vault: ${error.message}`) - throw error + if (useVault === true || useVault === null) { + return vaultManager.isEnabled() } } - // Fallback to internal encryption + return vaultManager.isEnabled() + } + + async encryptSecretInternal (secretData, secretName) { const salt = crypto.randomBytes(this.SALT_LENGTH) const key = await this._deriveKey(secretName, salt) const iv = crypto.randomBytes(this.IV_LENGTH) @@ -66,6 +49,28 @@ class SecretHelper { return Buffer.concat([salt, iv, tag, encrypted]).toString('base64') } + async storeInVaultAndGetReference (secretData, secretName, secretType = null, useVault = null) { + if (!this._shouldUseVault(secretType, useVault)) { + throw new Error('Vault storage requested but vault is not configured for this resource') + } + + try { + const vaultPath = await vaultManager.store(secretName, secretType, secretData) + return `${this.VAULT_REF_PREFIX}${vaultPath}` + } catch (error) { + logger.error(`Failed to store secret in vault: ${error.message}`) + throw error + } + } + + async encryptSecret (secretData, secretName, secretType = null, useVault = null) { + if (this._shouldUseVault(secretType, useVault)) { + return this.storeInVaultAndGetReference(secretData, secretName, secretType, useVault) + } + + return this.encryptSecretInternal(secretData, secretName) + } + /** * Retrieve secret data - uses vault if reference detected, otherwise uses internal decryption * @param {string} encryptedData - Encrypted data or vault reference diff --git a/src/helpers/template-helper.js b/src/helpers/template-helper.js index 725381ba..b4d6354d 100755 --- a/src/helpers/template-helper.js +++ b/src/helpers/template-helper.js @@ -1,6 +1,7 @@ const ApplicationManager = require('../data/managers/application-manager.js') // Using 
manager instead of service to avoid dependency loop const FogService = require('../services/iofog-service') const MicroservicesService = require('../services/microservices-service') +const { runInTransaction } = require('../helpers/transaction-runner') // ninja2 like template engine const { Liquid } = require('../lib/liquidjs/liquid.node.cjs') @@ -29,7 +30,10 @@ async function findApplicationHandler (name) { return this.context.environments._applicationsByName[name] } - const result = await ApplicationManager.findOnePopulated({ exclude: ['created_at', 'updated_at'] }, { fakeTransaction: true }) // TODO: Get a proper DB transaction + const result = await runInTransaction( + (transaction) => ApplicationManager.findOnePopulated({ exclude: ['created_at', 'updated_at'] }, transaction), + { label: 'template-find-application' } + ) if (result) { result.microservices = (await MicroservicesService.listMicroservicesEndPoint({ applicationName: name }, false)).microservices if (this.context.environments._applicationsByName) { diff --git a/src/services/certificate-service.js b/src/services/certificate-service.js index ea4983f6..da7694f0 100644 --- a/src/services/certificate-service.js +++ b/src/services/certificate-service.js @@ -80,7 +80,7 @@ async function createCAEndpoint (caData, transaction) { try { const secretName = caData.type === 'self-signed' ? 
caData.name : caData.secretName - const existingSecret = await SecretService.getSecretEndpoint(secretName) + const existingSecret = await SecretService.getSecretEndpoint(secretName, transaction) if (caData.type === 'self-signed') { if (existingSecret) { throw new Errors.ConflictError(`CA with name ${secretName} already exists`) @@ -109,20 +109,20 @@ async function createCAEndpoint (caData, transaction) { if (caData.type === 'self-signed') { ca = await generateSelfSignedCA(caData.subject, caData.expiration) - await storeCA(ca, caData.name) + await storeCA(ca, caData.name, transaction) certDetails = parseCertificate(ca.cert) } else if (caData.type === 'k8s-secret') { // Import CA from Kubernetes secret - ca = await require('../utils/cert').getCAFromK8sSecret(caData.secretName) + ca = await require('../utils/cert').getCAFromK8sSecret(caData.secretName, transaction) certDetails = parseCertificate(ca.certificate) // Store the CA locally with the same name as the secret const checkedSecret = await SecretManager.findOne({ name: caData.secretName || caData.name }, transaction) if (!checkedSecret) { - await storeCA({ cert: ca.certificate, key: ca.key }, caData.secretName) + await storeCA({ cert: ca.certificate, key: ca.key }, caData.secretName, transaction) } } else if (caData.type === 'direct') { // Load from internal secret - const caObj = await require('../utils/cert').loadCA(caData.secretName) + const caObj = await require('../utils/cert').loadCA(caData.secretName, transaction) ca = await require('../utils/cert').getCAFromDirect(caObj) certDetails = parseCertificate(ca.certificate) } else { @@ -192,7 +192,7 @@ async function getCAEndpoint (name, transaction) { } // Get the actual cert data from the secret - const secret = await SecretService.getSecretEndpoint(name) + const secret = await SecretService.getSecretEndpoint(name, transaction) if (!secret || secret.type !== 'tls') { throw new Errors.NotFoundError(`CA with name ${name} not found`) @@ -280,7 +280,7 @@ async 
function _createCertificateEndpointInner (certData, transaction) { // Check if certificate already exists try { - const existingSecret = await SecretService.getSecretEndpoint(certData.name) + const existingSecret = await SecretService.getSecretEndpoint(certData.name, transaction) if (existingSecret) { throw new Errors.ConflictError(`Certificate with name ${certData.name} already exists`) } @@ -310,11 +310,12 @@ async function _createCertificateEndpointInner (certData, transaction) { subject: certData.subject, hosts: certData.hosts, expiration: certData.expiration, - ca: certData.ca + ca: certData.ca, + transaction }) // Get certificate details from newly created secret - const certSecret = await SecretService.getSecretEndpoint(certData.name) + const certSecret = await SecretService.getSecretEndpoint(certData.name, transaction) const certPem = Buffer.from(certSecret.data['tls.crt'], 'base64').toString() const certDetails = parseCertificate(certPem) @@ -367,7 +368,8 @@ async function _createCertificateEndpointInner (certData, transaction) { subject: certData.subject, hosts: certData.hosts, expiration: certData.expiration, - ca: certData.ca + ca: certData.ca, + transaction }) } catch (error) { logger.error(`Failed to generate certificate ${certData.name}:`, error.message) @@ -375,7 +377,7 @@ async function _createCertificateEndpointInner (certData, transaction) { } // Get certificate from secret to parse details - const certSecret = await SecretService.getSecretEndpoint(certData.name) + const certSecret = await SecretService.getSecretEndpoint(certData.name, transaction) const certPem = Buffer.from(certSecret.data['tls.crt'], 'base64').toString() const certDetails = parseCertificate(certPem) @@ -409,7 +411,7 @@ async function getCertificateEndpoint (name, transaction) { } // Get the actual cert data from the secret - const secret = await SecretService.getSecretEndpoint(name) + const secret = await SecretService.getSecretEndpoint(name, transaction) if (!secret || 
secret.type !== 'tls') { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.CERTIFICATE_NOT_FOUND, name)) @@ -508,7 +510,7 @@ async function renewCertificateEndpoint (name, transaction) { // Delete existing secret (if any) - we'll create a new one try { - await SecretService.deleteSecretEndpoint(name) + await SecretService.deleteSecretEndpoint(name, transaction) } catch (error) { // Ignore NotFoundError if (!(error instanceof Errors.NotFoundError)) { @@ -548,7 +550,7 @@ async function renewCertificateEndpoint (name, transaction) { } // Generate new certificate - await generateCertificate(renewalData) + await generateCertificate({ ...renewalData, transaction }) // Get the newly created secret const secretModel = await SecretManager.findOne({ name }, transaction) diff --git a/src/services/config-map-service.js b/src/services/config-map-service.js index 0c6ddd9f..767be600 100644 --- a/src/services/config-map-service.js +++ b/src/services/config-map-service.js @@ -120,7 +120,6 @@ async function deleteConfigMapEndpoint (configMapName, transaction) { await ConfigMapManager.deleteConfigMap(configMapName, transaction) await _deleteVolumeMountsUsingConfigMap(configMapName, transaction) - // Vault deletion is handled by ConfigMapManager.deleteConfigMap() return {} } diff --git a/src/services/fog-platform-service.js b/src/services/fog-platform-service.js index 337c647f..533e996c 100644 --- a/src/services/fog-platform-service.js +++ b/src/services/fog-platform-service.js @@ -13,8 +13,11 @@ const NatsConnectionManager = require('../data/managers/nats-connection-manager' const ChangeTrackingService = require('./change-tracking-service') const IofogService = require('./iofog-service') const NatsService = require('./nats-service') +const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') const RouterService = require('./router-service') const ServiceBridgeConfig = require('./service-bridge-config') +const transactionRunner = 
require('../helpers/transaction-runner') +const { PRIORITY_BACKGROUND } = transactionRunner const logger = require('../logger') function buildFogDataFromSpecAndFog (fog, spec) { @@ -146,155 +149,237 @@ function buildReadyConditions (spec, router, nats) { ] } -async function reconcileFog (fogUuid, transaction) { - const startedAt = Date.now() - let generation = null - let phase = 'Progressing' +async function reconcileFogPrepare (fogUuid, transaction) { + const fog = await FogManager.findOneWithTags({ uuid: fogUuid }, transaction) + if (!fog) { + throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogUuid)) + } - try { - const fog = await FogManager.findOneWithTags({ uuid: fogUuid }, transaction) - if (!fog) { - throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogUuid)) - } + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogUuid, transaction) + if (!parsedSpec) { + throw new Errors.NotFoundError(`Fog platform spec not found for fog ${fogUuid}`) + } - const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogUuid, transaction) - if (!parsedSpec) { - throw new Errors.NotFoundError(`Fog platform spec not found for fog ${fogUuid}`) + const status = await FogPlatformStatusManager.getParsedStatus(fogUuid, transaction) + if (status && status.phase === 'Deleting') { + return { + skipped: true, + reason: 'deleting', + generation: parsedSpec.generation, + phase: status.phase } + } - const status = await FogPlatformStatusManager.getParsedStatus(fogUuid, transaction) - if (status && status.phase === 'Deleting') { - logger.info('fogPlatformReconcile skipped delete-owned fog', { - fogUuid, - generation: parsedSpec.generation, - phase: status.phase, - durationMs: Date.now() - startedAt - }) - return { skipped: true, reason: 'deleting' } - } + const generation = parsedSpec.generation + const spec = parsedSpec.spec + const fogData = buildFogDataFromSpecAndFog(fog, spec) + const 
topologyBefore = await captureTopologySnapshot(fogUuid, transaction) - generation = parsedSpec.generation - const spec = parsedSpec.spec - const fogData = buildFogDataFromSpecAndFog(fog, spec) - const topologyBefore = await captureTopologySnapshot(fogUuid, transaction) + await FogPlatformStatusManager.setPhase(fogUuid, 'Progressing', { lastError: null }, transaction) + validateSystemFogInvariants(fog, spec) - await FogPlatformStatusManager.setPhase(fogUuid, 'Progressing', {}, transaction) - validateSystemFogInvariants(fog, spec) + const router = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + const oldRouterMode = router ? (router.isEdge ? 'edge' : 'interior') : 'none' + const isRouterModeChanged = spec.routerMode !== oldRouterMode && + (spec.routerMode === 'none' || oldRouterMode === 'none') + const isHostChanged = spec.host != null && spec.host !== fog.host + const shouldRecreateCerts = isRouterModeChanged || isHostChanged - const router = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) - const oldRouterMode = router ? (router.isEdge ? 
'edge' : 'interior') : 'none' - const isRouterModeChanged = spec.routerMode !== oldRouterMode && - (spec.routerMode === 'none' || oldRouterMode === 'none') - const isHostChanged = spec.host != null && spec.host !== fog.host - const shouldRecreateCerts = isRouterModeChanged || isHostChanged + return { + fog, + spec, + fogData, + generation, + topologyBefore, + shouldRecreateCerts, + isHostChanged, + natsConfig: buildNatsConfig(spec), + isFirstReconcile: !status || status.observedGeneration === 0, + router + } +} - await IofogService._handleRouterCertificates(fogData, fogUuid, shouldRecreateCerts, transaction) - if (shouldRecreateCerts) { - await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.volumeMounts, transaction) - } +async function reconcileFogCertPrep (fogUuid, prep) { + await transactionRunner.runInTransaction( + (transaction) => IofogService._handleRouterCertificates( + prep.fogData, + fogUuid, + prep.shouldRecreateCerts, + transaction + ), + { priority: PRIORITY_BACKGROUND, label: 'fogPlatform.certPrep' } + ) + + if (prep.shouldRecreateCerts) { + await transactionRunner.runInTransaction( + (transaction) => ChangeTrackingService.update( + fogUuid, + ChangeTrackingService.events.volumeMounts, + transaction + ), + { priority: PRIORITY_BACKGROUND, label: 'fogPlatform.certPrepVolumeMounts' } + ) + } - const natsConfig = buildNatsConfig(spec) - if (spec.natsMode === 'none') { - await NatsService.cleanupNatsForFog(fog, transaction) - await IofogService._deleteNatsMicroserviceByFog(fogData, transaction) + if (prep.isHostChanged && prep.spec.natsMode !== 'none') { + await transactionRunner.runInTransaction( + (transaction) => IofogService._reconcileNatsCertificatesOnHostChange(prep.fog, transaction), + { priority: PRIORITY_BACKGROUND, label: 'fogPlatform.certPrepNatsHost' } + ) + } +} + +async function reconcileFogNats (fogUuid, prep) { + if (prep.spec.natsMode === 'none') { + await NatsService.cleanupNatsForFogPhased(prep.fog) + await 
transactionRunner.runInTransaction(async (transaction) => { + await IofogService._deleteNatsMicroserviceByFog(prep.fogData, transaction) await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceList, transaction) - } else { - if (isHostChanged) { - await IofogService._reconcileNatsCertificatesOnHostChange(fog, transaction) - } - await NatsService.ensureNatsForFog(fog, natsConfig, transaction) - } + }, { priority: PRIORITY_BACKGROUND, label: 'fogPlatform.natsCleanup' }) + } else { + await NatsService.ensureNatsForFogPhased(prep.fog, prep.natsConfig) + } +} - let networkRouter = null - if (spec.routerMode === 'none') { - networkRouter = await RouterService.getNetworkRouter(spec.networkRouter, transaction) - if (!networkRouter) { - throw new Errors.NotFoundError(AppHelper.formatMessage( - ErrorMessages.INVALID_ROUTER, - spec.networkRouter || Constants.DEFAULT_ROUTER_NAME - )) - } - if (router) { - await IofogService._deleteFogRouter(fogData, transaction) - } - await FogManager.update({ uuid: fogUuid }, { routerId: networkRouter.id }, transaction) +async function reconcileFogPlatform (fogUuid, prep, transaction) { + const { fog, spec, fogData, router } = prep + let networkRouter = null + + if (spec.routerMode === 'none') { + networkRouter = await RouterService.getNetworkRouter(spec.networkRouter, transaction) + if (!networkRouter) { + throw new Errors.NotFoundError(AppHelper.formatMessage( + ErrorMessages.INVALID_ROUTER, + spec.networkRouter || Constants.DEFAULT_ROUTER_NAME + )) + } + if (router) { + await IofogService._deleteFogRouter(fogData, transaction) + } + await FogManager.update({ uuid: fogUuid }, { routerId: networkRouter.id }, transaction) + } else { + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + const upstreamConnections = router + ? 
await RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction) + : [] + const upstreamRoutersIofogUuid = spec.upstreamRouters || (upstreamConnections || []) + .map((connection) => connection.dest.iofogUuid) + const upstreamRouters = await RouterService.validateAndReturnUpstreamRouters( + upstreamRoutersIofogUuid, + fog.isSystem, + defaultRouter, + transaction + ) + + const host = spec.host || (router ? router.host : null) + if (!router) { + networkRouter = await RouterService.createRouterForFog(fogData, fogUuid, upstreamRouters, transaction) } else { - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - const upstreamConnections = router - ? await RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction) - : [] - const upstreamRoutersIofogUuid = spec.upstreamRouters || (upstreamConnections || []) - .map((connection) => connection.dest.iofogUuid) - const upstreamRouters = await RouterService.validateAndReturnUpstreamRouters( - upstreamRoutersIofogUuid, - fog.isSystem, - defaultRouter, - transaction - ) - - const host = spec.host || (router ? 
router.host : null) - if (!router) { - networkRouter = await RouterService.createRouterForFog(fogData, fogUuid, upstreamRouters, transaction) - } else { - networkRouter = await RouterService.updateRouter(router, { - messagingPort: spec.messagingPort || router.messagingPort, - interRouterPort: spec.interRouterPort || router.interRouterPort, - edgeRouterPort: spec.edgeRouterPort || router.edgeRouterPort, - isEdge: spec.routerMode === 'edge', - host - }, upstreamRouters, spec.containerEngine || fog.containerEngine, transaction) - } - - const baseRouterConfig = await IofogService._getRouterMicroserviceConfig(fogUuid, transaction) - await ServiceBridgeConfig.recomputeServiceBridgeConfig(fogUuid, baseRouterConfig, transaction) + networkRouter = await RouterService.updateRouter(router, { + messagingPort: spec.messagingPort || router.messagingPort, + interRouterPort: spec.interRouterPort || router.interRouterPort, + edgeRouterPort: spec.edgeRouterPort || router.edgeRouterPort, + isEdge: spec.routerMode === 'edge', + host + }, upstreamRouters, spec.containerEngine || fog.containerEngine, transaction) } - if (spec.host && spec.host !== fog.host) { - await IofogService._updateMicroserviceExtraHosts(fogUuid, spec.host, transaction) - } + const baseRouterConfig = await IofogService._getRouterMicroserviceConfig(fogUuid, transaction) + await ServiceBridgeConfig.recomputeServiceBridgeConfig(fogUuid, baseRouterConfig, transaction) + } - if (fog.abstractedHardwareEnabled === true && spec.abstractedHardwareEnabled === false) { - await IofogService._deleteHalMicroserviceByFog(fogData, transaction) - await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) - } else if (fog.abstractedHardwareEnabled === false && spec.abstractedHardwareEnabled === true) { - await IofogService._createHalMicroserviceForFog(fogData, fog, transaction) - await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) - 
} + if (spec.host && spec.host !== fog.host) { + await IofogService._updateMicroserviceExtraHosts(fogUuid, spec.host, transaction) + } - if (fog.bluetoothEnabled === true && spec.bluetoothEnabled === false) { - await IofogService._deleteBluetoothMicroserviceByFog(fogData, transaction) - await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) - } else if (fog.bluetoothEnabled === false && spec.bluetoothEnabled === true) { - await IofogService._createBluetoothMicroserviceForFog(fogData, fog, transaction) - await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) - } + if (fog.abstractedHardwareEnabled === true && spec.abstractedHardwareEnabled === false) { + await IofogService._deleteHalMicroserviceByFog(fogData, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } else if (fog.abstractedHardwareEnabled === false && spec.abstractedHardwareEnabled === true) { + await IofogService._createHalMicroserviceForFog(fogData, fog, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } - const isFirstReconcile = !status || status.observedGeneration === 0 - if (isFirstReconcile) { - await ChangeTrackingService.create(fogUuid, transaction) - } + if (fog.bluetoothEnabled === true && spec.bluetoothEnabled === false) { + await IofogService._deleteBluetoothMicroserviceByFog(fogData, transaction) await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } else if (fog.bluetoothEnabled === false && spec.bluetoothEnabled === true) { + await IofogService._createBluetoothMicroserviceForFog(fogData, fog, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } - const routerAfter = await RouterManager.findOne({ iofogUuid: fogUuid }, 
transaction) - const natsAfter = await NatsInstanceManager.findByFog(fogUuid, transaction) - const topologyAfter = await captureTopologySnapshot(fogUuid, transaction) + if (prep.isFirstReconcile) { + await ChangeTrackingService.create(fogUuid, transaction) + } + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) - if (topologyChanged(topologyBefore, topologyAfter)) { - await NatsService.enqueueReconcileTask({ - reason: 'cluster-routes-changed', - fogUuids: [fogUuid] - }, transaction) - } + return { networkRouter } +} - phase = 'Ready' - await FogPlatformStatusManager.setPhase(fogUuid, 'Ready', { - observedGeneration: generation, - lastError: null, - conditions: buildReadyConditions(spec, routerAfter, natsAfter) +async function reconcileFogFinalize (fogUuid, prep, platformResult, transaction) { + const { spec, generation, topologyBefore } = prep + + const routerAfter = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + const natsAfter = await NatsInstanceManager.findByFog(fogUuid, transaction) + const topologyAfter = await captureTopologySnapshot(fogUuid, transaction) + + if (topologyChanged(topologyBefore, topologyAfter)) { + await ReconcileOutboxManager.enqueueNats({ + reason: 'cluster-routes-changed', + fogUuids: [fogUuid] }, transaction) + } + + await FogPlatformStatusManager.setPhase(fogUuid, 'Ready', { + observedGeneration: generation, + lastError: null, + conditions: buildReadyConditions(spec, routerAfter, natsAfter) + }, transaction) + + await FogManager.update({ uuid: fogUuid }, { warningMessage: 'HEALTHY' }, transaction) - await FogManager.update({ uuid: fogUuid }, { warningMessage: 'HEALTHY' }, transaction) + return { + networkRouterId: platformResult.networkRouter ? 
platformResult.networkRouter.id : null + } +} +async function reconcileFog (fogUuid) { + const startedAt = Date.now() + let generation = null + let phase = 'Progressing' + + try { + const prep = await transactionRunner.runInTransaction( + (transaction) => reconcileFogPrepare(fogUuid, transaction), + { priority: PRIORITY_BACKGROUND, label: 'fogPlatform.prepare' } + ) + + if (prep.skipped) { + logger.info('fogPlatformReconcile skipped delete-owned fog', { + fogUuid, + generation: prep.generation, + phase: prep.phase, + durationMs: Date.now() - startedAt + }) + return { skipped: true, reason: prep.reason } + } + + generation = prep.generation + + await reconcileFogCertPrep(fogUuid, prep) + await reconcileFogNats(fogUuid, prep) + + const platformResult = await transactionRunner.runInTransaction( + (transaction) => reconcileFogPlatform(fogUuid, prep, transaction), + { priority: PRIORITY_BACKGROUND, label: 'fogPlatform.platform' } + ) + + const finalizeResult = await transactionRunner.runInTransaction( + (transaction) => reconcileFogFinalize(fogUuid, prep, platformResult, transaction), + { priority: PRIORITY_BACKGROUND, label: 'fogPlatform.finalize' } + ) + + phase = 'Ready' logger.info('fogPlatformReconcile completed', { fogUuid, generation, @@ -306,7 +391,7 @@ async function reconcileFog (fogUuid, transaction) { fogUuid, generation, phase, - networkRouterId: networkRouter ? networkRouter.id : null + networkRouterId: finalizeResult.networkRouterId } } catch (error) { logger.error('fogPlatformReconcile failed', { @@ -332,12 +417,15 @@ async function reconcileFogDelete (fogUuid, transaction) { return { skipped: true, reason: 'not-found' } } + logger.info('fogPlatformReconcile delete starting', { fogUuid }) + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogUuid, transaction) const fogData = parsedSpec ? 
buildFogDataFromSpecAndFog(fog, parsedSpec.spec) : { uuid: fogUuid, name: fog.name, containerEngine: fog.containerEngine } await IofogService._deleteFogRouter(fogData, transaction) + logger.info('fogPlatformReconcile delete router removed', { fogUuid }) await IofogService._processDeleteCommand(fog, transaction) logger.info('fogPlatformReconcile delete completed', { @@ -349,14 +437,17 @@ async function reconcileFogDelete (fogUuid, transaction) { return { fogUuid, deleted: true } } -const bypassOptions = { bypassQueue: true } - module.exports = { buildFogDataFromSpecAndFog, validateSystemFogInvariants, captureTopologySnapshot, topologyChanged, markReconcileFailed, - reconcileFog: TransactionDecorator.generateTransaction(reconcileFog, bypassOptions), - reconcileFogDelete: TransactionDecorator.generateTransaction(reconcileFogDelete, bypassOptions) + reconcileFogPrepare, + reconcileFogCertPrep, + reconcileFogNats, + reconcileFogPlatform, + reconcileFogFinalize, + reconcileFog, + reconcileFogDelete: TransactionDecorator.generateTransaction(reconcileFogDelete) } diff --git a/src/services/iofog-service.js b/src/services/iofog-service.js index 875d5c22..5c863d9f 100644 --- a/src/services/iofog-service.js +++ b/src/services/iofog-service.js @@ -50,7 +50,7 @@ const FogPublicKeyManager = require('../data/managers/iofog-public-key-manager') const { getServiceAnnotationTag } = require('../config/flavor') const FogPlatformSpecManager = require('../data/managers/fog-platform-spec-manager') const FogPlatformStatusManager = require('../data/managers/fog-platform-status-manager') -const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') const { buildPlatformSpecFromFogData, mergePlatformSpecPatch @@ -413,14 +413,19 @@ async function createFogEndPoint (fogData, isCLI, transaction) { let defaultRouter if (fogData.routerMode === 'none') { - const 
networkRouter = await RouterService.getNetworkRouter(fogData.networkRouter) + const networkRouter = await RouterService.getNetworkRouter(fogData.networkRouter, transaction) if (!networkRouter) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER, !fogData.networkRouter ? Constants.DEFAULT_ROUTER_NAME : fogData.networkRouter)) } createFogData.routerId = networkRouter.id } else { defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - await RouterService.validateAndReturnUpstreamRouters(fogData.upstreamRouters, fogData.isSystem, defaultRouter) + await RouterService.validateAndReturnUpstreamRouters( + fogData.upstreamRouters, + fogData.isSystem, + defaultRouter, + transaction + ) } const fog = await FogManager.create(createFogData, transaction) @@ -431,7 +436,7 @@ async function createFogEndPoint (fogData, isCLI, transaction) { const platformSpec = buildPlatformSpecFromFogData(fogData, { applyCreateDefaults: true }) const { generation } = await FogPlatformSpecManager.upsertSpec(fog.uuid, platformSpec, transaction) await FogPlatformStatusManager.ensurePending(fog.uuid, transaction) - await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + await ReconcileOutboxManager.enqueueFogPlatform({ fogUuid: fog.uuid, reason: 'spec-changed', specGeneration: generation @@ -450,7 +455,7 @@ async function _setTags (fogModel, tagsArray, transaction) { } tags.push(tagModel) } - await fogModel.setTags(tags) + await fogModel.setTags(tags, { transaction }) } } @@ -565,7 +570,7 @@ async function updateFogEndPoint (fogData, isCLI, transaction) { const mergedSpec = mergePlatformSpecPatch(parsedSpec ? 
parsedSpec.spec : {}, fogData) const { generation } = await FogPlatformSpecManager.upsertSpec(fogData.uuid, mergedSpec, transaction) await FogPlatformStatusManager.ensurePending(fogData.uuid, transaction) - await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + await ReconcileOutboxManager.enqueueFogPlatform({ fogUuid: fogData.uuid, reason: 'spec-changed', specGeneration: generation @@ -659,7 +664,7 @@ async function deleteFogEndPoint (fogData, isCLI, transaction) { } await FogPlatformStatusManager.setPhase(fogData.uuid, 'Deleting', {}, transaction) - await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + await ReconcileOutboxManager.enqueueFogPlatform({ fogUuid: fogData.uuid, reason: 'delete' }, transaction) @@ -679,7 +684,7 @@ async function reconcileFogEndpoint (fogData, transaction) { } const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogData.uuid, transaction) - await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + await ReconcileOutboxManager.enqueueFogPlatform({ fogUuid: fogData.uuid, reason: 'manual-retry', specGeneration: parsedSpec ? 
parsedSpec.generation : null @@ -1063,6 +1068,8 @@ function _filterFogs (fogs, filters) { } async function _processDeleteCommand (fog, transaction) { + await NatsService.cleanupNatsForFog(fog, transaction) + const microservices = await MicroserviceManager.findAll({ iofogUuid: fog.uuid }, transaction) for (const microservice of microservices) { await MicroserviceService.deleteMicroserviceWithRoutesAndPortMappings(microservice, transaction) @@ -1091,7 +1098,6 @@ async function _processDeleteCommand (fog, transaction) { await SecretManager.delete({ name: secretName }, transaction) } } - await NatsService.cleanupNatsForFog(fog, transaction) const fogPublicKey = await FogPublicKeyManager.findByFogUuid(fog.uuid, transaction) if (fogPublicKey) { await FogKeyService.deletePublicKey(fog.uuid, transaction) @@ -1527,13 +1533,11 @@ async function _updateImages (images, microserviceUuid, transaction) { return _createMicroserviceImages({ uuid: microserviceUuid }, images, transaction) } -const bypassOptions = { bypassQueue: true } - module.exports = { - createFogEndPoint: TransactionDecorator.generateTransaction(createFogEndPoint, bypassOptions), - updateFogEndPoint: TransactionDecorator.generateTransaction(updateFogEndPoint, bypassOptions), - deleteFogEndPoint: TransactionDecorator.generateTransaction(deleteFogEndPoint, bypassOptions), - reconcileFogEndpoint: TransactionDecorator.generateTransaction(reconcileFogEndpoint, bypassOptions), + createFogEndPoint: TransactionDecorator.generateTransaction(createFogEndPoint), + updateFogEndPoint: TransactionDecorator.generateTransaction(updateFogEndPoint), + deleteFogEndPoint: TransactionDecorator.generateTransaction(deleteFogEndPoint), + reconcileFogEndpoint: TransactionDecorator.generateTransaction(reconcileFogEndpoint), getFogEndPoint: TransactionDecorator.generateTransaction(getFogEndPoint), getFogListEndPoint: TransactionDecorator.generateTransaction(getFogListEndPoint), generateProvisioningKeyEndPoint: 
TransactionDecorator.generateTransaction(generateProvisioningKeyEndPoint), @@ -1547,21 +1551,21 @@ module.exports = { enableNodeExecEndPoint: TransactionDecorator.generateTransaction(enableNodeExecEndPoint), disableNodeExecEndPoint: TransactionDecorator.generateTransaction(disableNodeExecEndPoint), _extractServiceTags, - _findMatchingServices: TransactionDecorator.generateTransaction(_findMatchingServices), + _findMatchingServices, _buildTcpListenerForFog, - _getRouterMicroserviceConfig: TransactionDecorator.generateTransaction(_getRouterMicroserviceConfig), - _extractExistingTcpConnectors: TransactionDecorator.generateTransaction(_extractExistingTcpConnectors), + _getRouterMicroserviceConfig, + _extractExistingTcpConnectors, _mergeTcpConnector, _mergeTcpListener, checkKubernetesEnvironment, - _handleRouterCertificates: TransactionDecorator.generateTransaction(_handleRouterCertificates), - _deleteFogRouter: TransactionDecorator.generateTransaction(_deleteFogRouter), - _processDeleteCommand: TransactionDecorator.generateTransaction(_processDeleteCommand), - _reconcileNatsCertificatesOnHostChange: TransactionDecorator.generateTransaction(_reconcileNatsCertificatesOnHostChange), - _deleteNatsMicroserviceByFog: TransactionDecorator.generateTransaction(_deleteNatsMicroserviceByFog), - _createHalMicroserviceForFog: TransactionDecorator.generateTransaction(_createHalMicroserviceForFog), - _deleteHalMicroserviceByFog: TransactionDecorator.generateTransaction(_deleteHalMicroserviceByFog), - _createBluetoothMicroserviceForFog: TransactionDecorator.generateTransaction(_createBluetoothMicroserviceForFog), - _deleteBluetoothMicroserviceByFog: TransactionDecorator.generateTransaction(_deleteBluetoothMicroserviceByFog), - _updateMicroserviceExtraHosts: TransactionDecorator.generateTransaction(_updateMicroserviceExtraHosts) + _handleRouterCertificates, + _deleteFogRouter, + _processDeleteCommand, + _reconcileNatsCertificatesOnHostChange, + _deleteNatsMicroserviceByFog, + 
_createHalMicroserviceForFog, + _deleteHalMicroserviceByFog, + _createBluetoothMicroserviceForFog, + _deleteBluetoothMicroserviceByFog, + _updateMicroserviceExtraHosts } diff --git a/src/services/nats-api-service.js b/src/services/nats-api-service.js index 8d07cb98..ab7c7b50 100644 --- a/src/services/nats-api-service.js +++ b/src/services/nats-api-service.js @@ -578,30 +578,28 @@ async function deleteUserRule (ruleName, transaction) { await NatsUserRuleManager.delete({ id: rule.id }, transaction) } -const bypassOptions = { bypassQueue: true } - module.exports = { - getOperator: TransactionDecorator.generateTransaction(getOperator, bypassOptions), - rotateOperator: TransactionDecorator.generateTransaction(rotateOperator, bypassOptions), - getBootstrap: TransactionDecorator.generateTransaction(getBootstrap, bypassOptions), - getHub: TransactionDecorator.generateTransaction(getHub, bypassOptions), - upsertHub: TransactionDecorator.generateTransaction(upsertHub, bypassOptions), + getOperator: TransactionDecorator.generateTransaction(getOperator), + rotateOperator: TransactionDecorator.generateTransaction(rotateOperator), + getBootstrap: TransactionDecorator.generateTransaction(getBootstrap), + getHub: TransactionDecorator.generateTransaction(getHub), + upsertHub: TransactionDecorator.generateTransaction(upsertHub), listAccounts: TransactionDecorator.generateTransaction(listAccounts), getAccount: TransactionDecorator.generateTransaction(getAccount), - ensureAccount: TransactionDecorator.generateTransaction(ensureAccount, bypassOptions), + ensureAccount: TransactionDecorator.generateTransaction(ensureAccount), listAllUsers: TransactionDecorator.generateTransaction(listAllUsers), listUsers: TransactionDecorator.generateTransaction(listUsers), - createUser: TransactionDecorator.generateTransaction(createUser, bypassOptions), + createUser: TransactionDecorator.generateTransaction(createUser), getUserCreds: TransactionDecorator.generateTransaction(getUserCreds), - 
deleteUser: TransactionDecorator.generateTransaction(deleteUser, bypassOptions), - createMqttBearer: TransactionDecorator.generateTransaction(createMqttBearer, bypassOptions), - deleteMqttBearer: TransactionDecorator.generateTransaction(deleteMqttBearer, bypassOptions), + deleteUser: TransactionDecorator.generateTransaction(deleteUser), + createMqttBearer: TransactionDecorator.generateTransaction(createMqttBearer), + deleteMqttBearer: TransactionDecorator.generateTransaction(deleteMqttBearer), listAccountRules: TransactionDecorator.generateTransaction(listAccountRules), - createAccountRule: TransactionDecorator.generateTransaction(createAccountRule, bypassOptions), - updateAccountRule: TransactionDecorator.generateTransaction(updateAccountRule, bypassOptions), - deleteAccountRule: TransactionDecorator.generateTransaction(deleteAccountRule, bypassOptions), + createAccountRule: TransactionDecorator.generateTransaction(createAccountRule), + updateAccountRule: TransactionDecorator.generateTransaction(updateAccountRule), + deleteAccountRule: TransactionDecorator.generateTransaction(deleteAccountRule), listUserRules: TransactionDecorator.generateTransaction(listUserRules), - createUserRule: TransactionDecorator.generateTransaction(createUserRule, bypassOptions), - updateUserRule: TransactionDecorator.generateTransaction(updateUserRule, bypassOptions), - deleteUserRule: TransactionDecorator.generateTransaction(deleteUserRule, bypassOptions) + createUserRule: TransactionDecorator.generateTransaction(createUserRule), + updateUserRule: TransactionDecorator.generateTransaction(updateUserRule), + deleteUserRule: TransactionDecorator.generateTransaction(deleteUserRule) } diff --git a/src/services/nats-auth-service.js b/src/services/nats-auth-service.js index 30892991..fc4c2d97 100644 --- a/src/services/nats-auth-service.js +++ b/src/services/nats-auth-service.js @@ -12,6 +12,8 @@ const NatsAccountRuleManager = require('../data/managers/nats-account-rule-manag const 
NatsUserRuleManager = require('../data/managers/nats-user-rule-manager') const MicroserviceManager = require('../data/managers/microservice-manager') const TransactionDecorator = require('../decorators/transaction-decorator') +const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const logger = require('../logger') const NatsSystemRules = require('../config/nats-system-rules') const { slugifyName } = require('../helpers/system-naming') @@ -225,28 +227,22 @@ function _normalizeSystemUserRuleForPersistence (rule) { } /** - * NATS reconciliation is triggered in two ways: - * (A) From this module: _triggerResolverArtifactsReconcile calls NatsService.enqueueReconcileTask (fire-and-forget). - * Call sites: ensureOperator, rotateOperator, ensureSystemAccount, createUserForAccount, ensureAccountForApplication, - * createAccountForApplication, ensureUserForMicroservice, createMqttBearerUser, ensureLeafSystemAccount, - * reissueAccountForApplication, reissueUserForMicroservice, deleteAccountForApplication, revokeMicroserviceUser, - * reissueForAccountRule, reissueForUserRule, revokeUserByAccountAndName, deleteLeafSystemArtifactsForFog, etc. - * (B) From nats-service: enqueueReconcileTask(..., transaction) inside ensureNatsForFog (cluster-routes-changed) and - * cleanupNatsForFog (server-deleted). - * All API endpoints that trigger reconciliation use the transaction-queue bypass (bypassQueue: true) so requests - * do not wait behind long-running reconcile jobs. + * NATS reconciliation is scheduled via ReconcileOutbox in the same transaction as auth mutations. + * The outbox drainer upserts NatsReconcileTask rows. Cluster-route changes in nats-service also + * enqueue outbox rows in the same transaction. 
*/ -function _triggerResolverArtifactsReconcile (triggerOptions = {}) { +async function _enqueueNatsReconcileOutbox (triggerOptions = {}, transaction) { if (triggerOptions.triggerReconcile === false) { - return + return null } - const NatsService = require('./nats-service') - if (NatsService && typeof NatsService.enqueueReconcileTask === 'function') { - const options = { reason: 'auth-mutation', ...triggerOptions } - NatsService.enqueueReconcileTask(options).catch((err) => { - logger.error(`NATS reconcile enqueue failed: ${err.message}`) - }) + const payload = { reason: 'auth-mutation', ...triggerOptions } + if (transaction) { + return ReconcileOutboxManager.enqueueNats(payload, transaction) } + return runInTransaction( + (tx) => ReconcileOutboxManager.enqueueNats(payload, tx), + { priority: PRIORITY_BACKGROUND, label: 'natsAuth.outboxEnqueue' } + ) } function _runBackgroundTask (label, task) { @@ -289,9 +285,14 @@ async function _upsertOpaqueSecret (name, data, transaction) { } } +const Transaction = require('sequelize/lib/transaction') + function _triggerOptionsFromArgs (args) { const second = args[0] - return (second && typeof second === 'object' && !second.fakeTransaction) ? second : {} + if (second instanceof Transaction) { + return {} + } + return (second && typeof second === 'object') ? 
second : {} } async function ensureOperator (transaction, ...rest) { @@ -314,7 +315,7 @@ async function ensureOperator (transaction, ...rest) { jwt: operatorJwt, seedSecretName: OPERATOR_SEED_SECRET }, transaction) - _triggerResolverArtifactsReconcile(options) + await _enqueueNatsReconcileOutbox(options, transaction) return created } @@ -357,7 +358,7 @@ async function rotateOperator (transaction) { await NatsAccountManager.update({ id: account.id }, { jwt: newAccountJwt }, transaction) } - _triggerResolverArtifactsReconcile() + await _enqueueNatsReconcileOutbox({}, transaction) return NatsOperatorManager.findOne({ id: existing.id }, transaction) } @@ -391,7 +392,7 @@ async function ensureSystemAccount (transaction, ...rest) { isLeafSystem: false, applicationId: null }, transaction) - _triggerResolverArtifactsReconcile({ ...options, reason: 'system-account-created' }) + await _enqueueNatsReconcileOutbox({ ...options, reason: 'system-account-created' }, transaction) return created } @@ -425,7 +426,7 @@ async function ensureLeafSystemAccount (fog, transaction) { isLeafSystem: true, applicationId: null }, transaction) - _triggerResolverArtifactsReconcile({ fogUuids: [fog.uuid] }) + await _enqueueNatsReconcileOutbox({ fogUuids: [fog.uuid] }, transaction) return created } @@ -439,7 +440,7 @@ async function ensureSysUserForServer (options = {}, transaction) { ? 
{ user: existingUser } : await createUserForAccount(account.id, sysUserName, null, null, null, transaction) if (created && fog && fog.uuid) { - _triggerResolverArtifactsReconcile({ fogUuids: [fog.uuid] }) + await _enqueueNatsReconcileOutbox({ fogUuids: [fog.uuid] }, transaction) } return { account, user } } @@ -452,7 +453,7 @@ async function ensureLeafSystemAccountUser (fog, transaction) { return { account, user: existing } } const result = await createUserForAccount(account.id, userName, null, null, null, transaction) - _triggerResolverArtifactsReconcile({ fogUuids: [fog.uuid] }) + await _enqueueNatsReconcileOutbox({ fogUuids: [fog.uuid] }, transaction) return result } @@ -506,7 +507,7 @@ async function deleteLeafSystemArtifactsForFog (fog, transaction) { } } await NatsAccountManager.delete({ id: account.id }, transaction) - _triggerResolverArtifactsReconcile({ fogUuids: [fog.uuid] }) + await _enqueueNatsReconcileOutbox({ fogUuids: [fog.uuid] }, transaction) } /** @@ -530,7 +531,7 @@ async function deleteServerSysUserForFog (fog, isHub, transaction) { } } await NatsUserManager.delete({ id: user.id }, transaction) - _triggerResolverArtifactsReconcile({ fogUuids: [fog.uuid] }) + await _enqueueNatsReconcileOutbox({ fogUuids: [fog.uuid] }, transaction) } async function ensureControllerNatsAccount (transaction, ...rest) { @@ -559,7 +560,7 @@ async function ensureControllerNatsAccount (transaction, ...rest) { null, transaction ) - _triggerResolverArtifactsReconcile(options) + await _enqueueNatsReconcileOutbox(options, transaction) return result } @@ -601,7 +602,7 @@ async function ensureControllerNatsAccount (transaction, ...rest) { null, transaction ) - _triggerResolverArtifactsReconcile(options) + await _enqueueNatsReconcileOutbox(options, transaction) return result } @@ -642,7 +643,7 @@ async function ensureAccountForApplication (applicationId, transaction) { isSystem: false, isLeafSystem: false }, transaction) - _triggerResolverArtifactsReconcile({ reason: 
'account-created', applicationId: application.id }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: application.id }, transaction) return created } @@ -692,7 +693,7 @@ async function ensureUserForMicroservice (microservice, transaction) { natsUserRuleId: userRule ? userRule.id : null }, transaction) - _triggerResolverArtifactsReconcile({ fogUuids: [microservice.iofogUuid] }) + await _enqueueNatsReconcileOutbox({ fogUuids: [microservice.iofogUuid] }, transaction) return { account, user: natsUser } } @@ -869,7 +870,7 @@ async function reissueAccountForApplication (applicationId, transaction) { account.jwt ) await NatsAccountManager.update({ id: account.id }, { jwt: accountJwt }, transaction) - _triggerResolverArtifactsReconcile({ reason: 'account-created', applicationId }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId }, transaction) return NatsAccountManager.findOne({ id: account.id }, transaction) } @@ -913,7 +914,7 @@ async function reissueUserForMicroservice (microserviceUuid, transaction, ...res microserviceUuid: microservice.uuid, natsUserRuleId: currentRuleId }, transaction) - _triggerResolverArtifactsReconcile({ reason: 'account-created', applicationId: microservice.applicationId, ...options }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } @@ -922,7 +923,7 @@ async function reissueUserForMicroservice (microserviceUuid, transaction, ...res if (oldAccount) { await _addRevocationToAccount(oldAccount, existingUser.publicKey, transaction) if (options.triggerReconcile !== false && oldAccount.applicationId != null) { - _triggerResolverArtifactsReconcile({ reason: 'account-created', applicationId: oldAccount.applicationId, ...options }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: 
oldAccount.applicationId, ...options }, transaction) } } const accountSeed = await _loadSeedFromSecret(account.seedSecretName, transaction) @@ -944,7 +945,7 @@ async function reissueUserForMicroservice (microserviceUuid, transaction, ...res accountId: account.id, natsUserRuleId: currentRuleId }, transaction) - _triggerResolverArtifactsReconcile({ reason: 'account-created', applicationId: microservice.applicationId, ...options }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } @@ -954,11 +955,11 @@ async function reissueUserForMicroservice (microserviceUuid, transaction, ...res const operatorSeed = await _loadSeedFromSecret(operator.seedSecretName, transaction) const operatorKp = fromSeed(new TextEncoder().encode(operatorSeed)) await _reissueOneUserForRule(existingUser, userRule.id, operatorKp, transaction) - _triggerResolverArtifactsReconcile({ reason: 'account-created', applicationId: microservice.applicationId, ...options }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } - _triggerResolverArtifactsReconcile({ reason: 'account-created', applicationId: microservice.applicationId, ...options }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } @@ -1034,7 +1035,7 @@ async function reissueForAccountRule (accountRuleId, transaction) { await NatsAccountManager.update({ id: relayAccount.id }, { jwt: accountJwt }, transaction) } } - _triggerResolverArtifactsReconcile({ reason: 'account-rule-updated', accountRuleId }) + await _enqueueNatsReconcileOutbox({ 
reason: 'account-rule-updated', accountRuleId }, transaction) } /** @@ -1110,7 +1111,7 @@ async function reissueForUserRule (userRuleId, transaction) { await _reissueOneUserForRule(user, userRuleId, operatorKp, transaction) processedUserIds.add(user.id) } - _triggerResolverArtifactsReconcile({ reason: 'user-rule-updated', userRuleId }) + await _enqueueNatsReconcileOutbox({ reason: 'user-rule-updated', userRuleId }, transaction) } async function revokeMicroserviceUser (microserviceUuid, transaction) { @@ -1151,7 +1152,7 @@ async function revokeMicroserviceUser (microserviceUuid, transaction) { // best-effort secret cleanup } await NatsUserManager.delete({ id: user.id }, transaction) - _triggerResolverArtifactsReconcile({ reason: 'account-created', applicationId: account.applicationId }) + await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: account.applicationId }, transaction) } async function deleteAccountForApplication (applicationId, transaction) { @@ -1176,7 +1177,7 @@ async function deleteAccountForApplication (applicationId, transaction) { // best-effort cleanup } await NatsAccountManager.delete({ id: account.id }, transaction) - _triggerResolverArtifactsReconcile({ reason: 'account-deleted', applicationId }) + await _enqueueNatsReconcileOutbox({ reason: 'account-deleted', applicationId }, transaction) } async function revokeUserByAccountAndName (accountId, userName, transaction) { @@ -1233,8 +1234,9 @@ async function revokeUserByAccountAndName (accountId, userName, transaction) { // best-effort secret cleanup } await NatsUserManager.delete({ id: user.id }, transaction) - _triggerResolverArtifactsReconcile( - account.applicationId != null ? { reason: 'account-created', applicationId: account.applicationId } : {} + await _enqueueNatsReconcileOutbox( + account.applicationId != null ? 
{ reason: 'account-created', applicationId: account.applicationId } : {}, + transaction ) } @@ -1249,7 +1251,9 @@ function scheduleReissueForAccountRule (accountRuleId) { _runBackgroundTask(`reissue-account-rule-${accountRuleId}`, async () => { await module.exports.reissueForAccountRule(accountRuleId) }) - _triggerResolverArtifactsReconcile({ reason: 'account-rule-updated', accountRuleId }) + _enqueueNatsReconcileOutbox({ reason: 'account-rule-updated', accountRuleId }).catch((err) => { + logger.error(`NATS reconcile outbox enqueue failed: ${err.message}`) + }) return { scheduled: true } } @@ -1257,7 +1261,9 @@ function scheduleReissueForUserRule (userRuleId) { _runBackgroundTask(`reissue-user-rule-${userRuleId}`, async () => { await module.exports.reissueForUserRule(userRuleId) }) - _triggerResolverArtifactsReconcile({ reason: 'user-rule-updated', userRuleId }) + _enqueueNatsReconcileOutbox({ reason: 'user-rule-updated', userRuleId }).catch((err) => { + logger.error(`NATS reconcile outbox enqueue failed: ${err.message}`) + }) return { scheduled: true } } diff --git a/src/services/nats-service.js b/src/services/nats-service.js index c33d88dd..2147bee8 100644 --- a/src/services/nats-service.js +++ b/src/services/nats-service.js @@ -2,6 +2,7 @@ const fs = require('fs') const path = require('path') const crypto = require('crypto') const AppHelper = require('../helpers/app-helper') +const { isTest } = AppHelper const Errors = require('../helpers/errors') const ErrorMessages = require('../helpers/error-messages') const ConfigMapManager = require('../data/managers/config-map-manager') @@ -22,17 +23,24 @@ const NatsInstanceManager = require('../data/managers/nats-instance-manager') const NatsConnectionManager = require('../data/managers/nats-connection-manager') const NatsAccountManager = require('../data/managers/nats-account-manager') const NatsReconcileTaskManager = require('../data/managers/nats-reconcile-task-manager') +const ReconcileOutboxManager = 
require('../data/managers/reconcile-outbox-manager') const NatsUserManager = require('../data/managers/nats-user-manager') const ApplicationManager = require('../data/managers/application-manager') const NatsAuthService = require('./nats-auth-service') const ChangeTrackingService = require('./change-tracking-service') const MicroservicesService = require('./microservices-service') const FogManager = require('../data/managers/iofog-manager') -const databaseProvider = require('../data/providers/database-factory') const config = require('../config') const Constants = require('../helpers/constants') const { ensureSystemApplication, getSystemMicroserviceName, slugifyName } = require('../helpers/system-naming') const TransactionDecorator = require('../decorators/transaction-decorator') +const { isSequelizeTransaction } = require('../helpers/sequelize-transaction') +const { + runInTransaction, + PRIORITY_INTERACTIVE, + PRIORITY_BACKGROUND, + getActiveTransaction +} = require('../helpers/transaction-runner') const { buildNatsServerCertificateHostList, buildNatsMqttCertificateHostList @@ -584,7 +592,7 @@ async function _computeClusterRoutesForInstance (natsInstance, transaction) { return routes } -async function _patchK8sHubConfigMapClusterRoutes (desiredControllerRoutes, transaction) { +async function _patchK8sHubConfigMapClusterRoutesExternal (desiredControllerRoutes) { const configMap = await K8sClient.getConfigMap(K8S_NATS_SERVER_CONFIG_MAP, { ignoreNotFound: true }) if (!configMap || !configMap.data) { logger.debug(`Hub ConfigMap ${K8S_NATS_SERVER_CONFIG_MAP} not found or empty (expected before operator creates it)`) @@ -614,6 +622,89 @@ async function _patchK8sHubConfigMapClusterRoutes (desiredControllerRoutes, tran await K8sClient.patchConfigMap(K8S_NATS_SERVER_CONFIG_MAP, { data: { [configKey]: newContent } }, { ignoreNotFound: true }) } +async function _patchK8sJwtBundleExternal (fullServerJwtBundle) { + const existing = await 
K8sClient.getConfigMap(K8S_NATS_JWT_BUNDLE_CONFIG_MAP, { ignoreNotFound: true }) + const existingData = existing && existing.data ? existing.data : null + const newHash = _configMapDataHash(fullServerJwtBundle) + const unchanged = existingData && _configMapDataHash(existingData) === newHash + if (!unchanged) { + await K8sClient.patchConfigMap(K8S_NATS_JWT_BUNDLE_CONFIG_MAP, { data: fullServerJwtBundle }, { ignoreNotFound: true }) + } +} + +async function _rolloutNatsStatefulSetExternal () { + await K8sClient.rolloutStatefulSet('nats') +} + +function _resolveParentTransaction (maybeTransaction, explicitlyPassed) { + if (explicitlyPassed && isSequelizeTransaction(maybeTransaction)) { + return maybeTransaction + } + const active = getActiveTransaction() + if (active) { + return active + } + if (isTest() && explicitlyPassed && maybeTransaction != null && typeof maybeTransaction === 'object') { + return maybeTransaction + } + return null +} + +function _scheduleK8sAfterCommit (transaction, fn) { + const run = () => Promise.resolve(fn()).catch((err) => { + logger.warn(`Deferred NATS K8s work failed: ${err.message}`) + }) + + if (transaction && typeof transaction.afterCommit === 'function') { + transaction.afterCommit(run) + return + } + + if (isTest()) { + return run() + } +} + +async function _applyEnsureNatsK8sExternal (k8sHubPatch) { + if (!k8sHubPatch) { + return + } + try { + await _patchK8sHubConfigMapClusterRoutesExternal(k8sHubPatch) + } catch (err) { + logger.warn(`Failed to patch Kubernetes NATS hub ConfigMap cluster routes: ${err.message}`) + } +} + +async function _applyCleanupNatsK8sExternal (k8sCleanup) { + if (!k8sCleanup) { + return + } + try { + await _patchK8sHubConfigMapClusterRoutesExternal(k8sCleanup.desiredControllerRoutes) + if (k8sCleanup.rollout) { + try { + await _rolloutNatsStatefulSetExternal() + } catch (rolloutErr) { + logger.warn(`Failed to rollout NATS StatefulSet after hub ConfigMap patch: ${rolloutErr.message}`) + } + } + } catch (err) { 
+ logger.warn(`Failed to patch Kubernetes NATS hub ConfigMap cluster routes after cleanup: ${err.message}`) + } +} + +async function _applyK8sJwtBundleExternal (fullServerJwtBundle) { + if (!fullServerJwtBundle) { + return + } + try { + await _patchK8sJwtBundleExternal(fullServerJwtBundle) + } catch (err) { + logger.warn(`Failed to patch Kubernetes NATS hub JWT bundle ConfigMap: ${err.message}`) + } +} + function _clusterConfigRequiresRebuild (oldRoutes, newRoutes) { const oldLen = (oldRoutes && oldRoutes.length) || 0 const newLen = (newRoutes && newRoutes.length) || 0 @@ -889,7 +980,17 @@ async function _removeLeafOnlyArtifactsForFog (fog, microservice, transaction) { await NatsAuthService.deleteLeafSystemArtifactsForFog(fog, transaction) } -async function ensureNatsForFog (fog, natsConfig, transaction) { +async function ensureNatsForFogCertPrepDb (fog, natsConfig, transaction) { + const mode = (natsConfig && natsConfig.mode) || 'leaf' + if (mode === 'none') { + return null + } + const { serverCertName, mqttCertName } = await _ensureNatsCertificates(fog, transaction) + const jetstreamKey = await _ensureJetstreamKey(fog, transaction) + return { serverCertName, mqttCertName, jetstreamKey } +} + +async function _resolveNatsEnsureContext (fog, natsConfig, transaction) { const mode = (natsConfig && natsConfig.mode) || 'leaf' if (mode === 'none') { return null @@ -912,13 +1013,33 @@ async function ensureNatsForFog (fog, natsConfig, transaction) { const mqttPort = (natsConfig && natsConfig.mqttPort) || DEFAULT_MQTT_PORT const httpPort = (natsConfig && natsConfig.httpPort) || DEFAULT_HTTP_PORT - const { serverCertName, mqttCertName } = await _ensureNatsCertificates(fog, transaction) - const configMapName = natsConfigMapName(fog) - const configKey = NATS_CONFIG_KEY - const template = !isLeaf ? 
readTemplate('server.conf') : readTemplate('leaf.conf') - const certName = serverCertName + return { + mode, + isHub, + isLeaf, + serverPort, + leafPort, + clusterPort, + mqttPort, + httpPort, + configMapName: natsConfigMapName(fog), + configKey: NATS_CONFIG_KEY, + template: !isLeaf ? readTemplate('server.conf') : readTemplate('leaf.conf'), + jwtBundleConfigMapName: isLeaf ? natsJwtBundleConfigMap(fog) : K8S_NATS_JWT_BUNDLE_CONFIG_MAP + } +} + +async function ensureNatsForFogAuthPrepDb (fog, natsConfig, prep, transaction) { + if (!prep) { + return null + } + const ctx = await _resolveNatsEnsureContext(fog, natsConfig, transaction) + if (!ctx) { + return null + } + + const { isHub, isLeaf, jwtBundleConfigMapName } = ctx - const jwtBundleConfigMapName = isLeaf ? natsJwtBundleConfigMap(fog) : K8S_NATS_JWT_BUNDLE_CONFIG_MAP if (isLeaf) { const jwtBundle = await _buildJwtBundle(fog, true, transaction) await _ensureConfigMap(natsJwtBundleConfigMap(fog), jwtBundle, transaction) @@ -927,11 +1048,44 @@ async function ensureNatsForFog (fog, natsConfig, transaction) { await _ensureConfigMap(K8S_NATS_JWT_BUNDLE_CONFIG_MAP, fullJwtBundle, transaction) } + let sysCredsSecretName = null + if (isHub) { + const { user: hubSysUser } = await NatsAuthService.ensureSysUserForServer({ isHub: true }, transaction) + sysCredsSecretName = hubSysUser.credsSecretName + } else if (!isLeaf) { + const { user: serverSysUser } = await NatsAuthService.ensureSysUserForServer({ isHub: false, fog }, transaction) + sysCredsSecretName = serverSysUser.credsSecretName + } + + return { ...ctx, sysCredsSecretName, jwtBundleConfigMapName } +} + +async function ensureNatsForFogTopologyDb (fog, natsConfig, prep, authCtx, transaction) { + if (!prep || !authCtx) { + return null + } + + const { serverCertName, mqttCertName, jetstreamKey } = prep + const { + mode, + isHub, + isLeaf, + serverPort, + leafPort, + clusterPort, + mqttPort, + httpPort, + configMapName, + configKey, + template, + jwtBundleConfigMapName, 
+ sysCredsSecretName + } = authCtx + const certName = serverCertName + const microserviceResult = await _ensureNatsMicroservice(fog, mode, transaction) const microservice = microserviceResult let anyVolumeMappingCreated = !!(microservice && microservice._volumeMappingCreated) - - const jetstreamKey = await _ensureJetstreamKey(fog, transaction) const sysAccountName = isLeaf ? NatsAuthService.leafSystemAccountName(fog) : NatsAuthService.SYSTEM_ACCOUNT_NAME const sysUserName = isLeaf ? NatsAuthService.leafSystemAccountUserName(fog) : NatsAuthService.sysUserNameForServer(isHub, fog) const sysCredPath = `${NATS_CREDS_DIR}/${slugifyName(sysAccountName)}/${slugifyName(sysUserName)}.creds` @@ -1050,13 +1204,9 @@ async function ensureNatsForFog (fog, natsConfig, transaction) { transaction ) + let k8sHubPatch = null if (_isKubernetesControlPlane() && !savedInstance.isLeaf && savedInstance.host) { - try { - const desiredControllerRoutes = await _getControllerManagedClusterRoutes(transaction) - await _patchK8sHubConfigMapClusterRoutes(desiredControllerRoutes, transaction) - } catch (err) { - logger.warn(`Failed to patch Kubernetes NATS hub ConfigMap cluster routes: ${err.message}`) - } + k8sHubPatch = await _getControllerManagedClusterRoutes(transaction) } if (!savedInstance.isLeaf) { @@ -1072,7 +1222,7 @@ async function ensureNatsForFog (fog, natsConfig, transaction) { const otherInstances = (allServerInstancesNow || []).filter(i => i.id !== savedInstance.id) if (otherInstances.length > 0) { const otherFogUuids = otherInstances.map((i) => i.iofogUuid).filter(Boolean) - await enqueueReconcileTask({ reason: 'cluster-routes-changed', fogUuids: otherFogUuids }, transaction) + await ReconcileOutboxManager.enqueueNats({ reason: 'cluster-routes-changed', fogUuids: otherFogUuids }, transaction) } } @@ -1091,28 +1241,20 @@ async function ensureNatsForFog (fog, natsConfig, transaction) { anyVolumeMappingCreated = (await _ensureVolumeMapping(microservice.uuid, certName, 
`${NATS_CERTS_DIR}/${certName}`, 'ro', 'volumeMount', transaction)) || anyVolumeMappingCreated anyVolumeMappingCreated = (await _ensureVolumeMapping(microservice.uuid, mqttCertName, `${NATS_CERTS_DIR}/${mqttCertName}`, 'ro', 'volumeMount', transaction)) || anyVolumeMappingCreated - if (isHub) { - const { user: hubSysUser } = await NatsAuthService.ensureSysUserForServer({ isHub: true }, transaction) - const credsSecretName = hubSysUser.credsSecretName - await _ensureVolumeMount(credsSecretName, { secretName: credsSecretName }, transaction) - await VolumeMountService.linkVolumeMountEndpoint(credsSecretName, [fog.uuid], transaction) - anyVolumeMappingCreated = (await _ensureVolumeMapping(microservice.uuid, credsSecretName, NATS_CREDS_DIR, 'ro', 'volumeMount', transaction)) || anyVolumeMappingCreated - } else if (!savedInstance.isLeaf) { - const { user: serverSysUser } = await NatsAuthService.ensureSysUserForServer({ isHub: false, fog }, transaction) - const credsSecretName = serverSysUser.credsSecretName - await _ensureVolumeMount(credsSecretName, { secretName: credsSecretName }, transaction) - await VolumeMountService.linkVolumeMountEndpoint(credsSecretName, [fog.uuid], transaction) - anyVolumeMappingCreated = (await _ensureVolumeMapping(microservice.uuid, credsSecretName, NATS_CREDS_DIR, 'ro', 'volumeMount', transaction)) || anyVolumeMappingCreated - } else { - const sysCredsSecretName = await _getSysUserCredsSecretNameForFog(fog, false, transaction) - if (sysCredsSecretName) { + if (sysCredsSecretName) { + await _ensureVolumeMount(sysCredsSecretName, { secretName: sysCredsSecretName }, transaction) + await VolumeMountService.linkVolumeMountEndpoint(sysCredsSecretName, [fog.uuid], transaction) + anyVolumeMappingCreated = (await _ensureVolumeMapping(microservice.uuid, sysCredsSecretName, NATS_CREDS_DIR, 'ro', 'volumeMount', transaction)) || anyVolumeMappingCreated + } else if (isLeaf) { + const leafSysCredsSecretName = await _getSysUserCredsSecretNameForFog(fog, 
false, transaction) + if (leafSysCredsSecretName) { await VolumeMappingManager.delete({ microserviceUuid: microservice.uuid, - hostDestination: sysCredsSecretName, + hostDestination: leafSysCredsSecretName, type: 'volumeMount' }, transaction) try { - await VolumeMountService.unlinkVolumeMountEndpoint(sysCredsSecretName, [fog.uuid], transaction) + await VolumeMountService.unlinkVolumeMountEndpoint(leafSysCredsSecretName, [fog.uuid], transaction) } catch (err) { if (err.name !== 'NotFoundError') { throw err @@ -1130,10 +1272,69 @@ async function ensureNatsForFog (fog, natsConfig, transaction) { } await ChangeTrackingService.update(fog.uuid, ChangeTrackingService.events.microserviceList, transaction) - return microservice + return { microservice, k8sHubPatch } +} + +/** @deprecated Use phased ensure; kept for grep gates and direct unit tests */ +async function ensureNatsForFogDbMutation (fog, natsConfig, prep, transaction) { + const authCtx = await ensureNatsForFogAuthPrepDb(fog, natsConfig, prep, transaction) + if (!authCtx) { + return null + } + return ensureNatsForFogTopologyDb(fog, natsConfig, prep, authCtx, transaction) +} + +/** @deprecated Use phased ensure; kept for grep gates and direct unit tests */ +async function ensureNatsForFogDb (fog, natsConfig, transaction) { + const prep = await ensureNatsForFogCertPrepDb(fog, natsConfig, transaction) + if (!prep) { + return null + } + return ensureNatsForFogDbMutation(fog, natsConfig, prep, transaction) } -async function cleanupNatsForFog (fog, transaction) { +async function _ensureNatsForFogPhased (fog, natsConfig, { priority = PRIORITY_INTERACTIVE } = {}) { + const mode = (natsConfig && natsConfig.mode) || 'leaf' + if (mode === 'none') { + return null + } + logger.info(`NATS ensure certPrep starting for fog ${fog.uuid}`) + const prep = await runInTransaction( + (transaction) => ensureNatsForFogCertPrepDb(fog, natsConfig, transaction), + { priority, label: 'nats.ensure.certPrep' } + ) + if (!prep) { + return null + 
} + logger.info(`NATS ensure authPrep starting for fog ${fog.uuid}`) + const authCtx = await runInTransaction( + (transaction) => ensureNatsForFogAuthPrepDb(fog, natsConfig, prep, transaction), + { priority, label: 'nats.ensure.authPrep' } + ) + if (!authCtx) { + return null + } + logger.info(`NATS ensure topology starting for fog ${fog.uuid}`) + const result = await runInTransaction( + (transaction) => ensureNatsForFogTopologyDb(fog, natsConfig, prep, authCtx, transaction), + { priority, label: 'nats.ensure.topology' } + ) + await _applyEnsureNatsK8sExternal(result && result.k8sHubPatch) + return result && result.microservice +} + +async function ensureNatsForFogPhased (fog, natsConfig) { + return _ensureNatsForFogPhased(fog, natsConfig, { priority: PRIORITY_BACKGROUND }) +} + +async function ensureNatsForFog (...args) { + const fog = args[0] + const natsConfig = args[1] + // Parent transaction arg ignored — phased short txs (Plan 19-I-B / R131) + return _ensureNatsForFogPhased(fog, natsConfig, { priority: PRIORITY_INTERACTIVE }) +} + +async function cleanupNatsForFogDb (fog, transaction) { const natsInstance = await NatsInstanceManager.findByFog(fog.uuid, transaction) const mountNames = [ natsConfigMapName(fog), @@ -1181,28 +1382,22 @@ async function cleanupNatsForFog (fog, transaction) { const wasLeaf = !!(natsInstance && natsInstance.isLeaf) const wasServer = !!(natsInstance && !natsInstance.isLeaf) const wasHub = !!(natsInstance && natsInstance.isHub) + let k8sCleanup = null if (natsInstance) { await NatsConnectionManager.delete({ sourceNats: natsInstance.id }, transaction) await NatsConnectionManager.delete({ destNats: natsInstance.id }, transaction) await NatsInstanceManager.delete({ id: natsInstance.id }, transaction) if (_isKubernetesControlPlane() && !natsInstance.isLeaf) { - try { - const desiredControllerRoutes = await _getControllerManagedClusterRoutes(transaction) - await _patchK8sHubConfigMapClusterRoutes(desiredControllerRoutes, transaction) - try { 
- await K8sClient.rolloutStatefulSet('nats') - } catch (rolloutErr) { - logger.warn(`Failed to rollout NATS StatefulSet after hub ConfigMap patch: ${rolloutErr.message}`) - } - } catch (err) { - logger.warn(`Failed to patch Kubernetes NATS hub ConfigMap cluster routes after cleanup: ${err.message}`) + k8sCleanup = { + desiredControllerRoutes: await _getControllerManagedClusterRoutes(transaction), + rollout: true } } if (!natsInstance.isLeaf) { const remainingServers = await NatsInstanceManager.findAll({ isLeaf: false }, transaction) if (remainingServers && remainingServers.length > 0) { const remainingFogUuids = remainingServers.map((s) => s.iofogUuid).filter(Boolean) - await enqueueReconcileTask({ reason: 'server-deleted', fogUuids: remainingFogUuids }, transaction) + await ReconcileOutboxManager.enqueueNats({ reason: 'server-deleted', fogUuids: remainingFogUuids }, transaction) } } } @@ -1265,6 +1460,32 @@ async function cleanupNatsForFog (fog, transaction) { if (wasServer) { await NatsAuthService.deleteServerSysUserForFog(fog, wasHub, transaction) } + + return { k8sCleanup } +} + +async function _cleanupNatsForFogPhased (fog, { priority = PRIORITY_INTERACTIVE } = {}) { + const result = await runInTransaction( + (transaction) => cleanupNatsForFogDb(fog, transaction), + { priority, label: 'nats.cleanupForFog' } + ) + await _applyCleanupNatsK8sExternal(result.k8sCleanup) +} + +async function cleanupNatsForFogPhased (fog) { + return _cleanupNatsForFogPhased(fog, { priority: PRIORITY_BACKGROUND }) +} + +async function cleanupNatsForFog (...args) { + const fog = args[0] + const parentTx = _resolveParentTransaction(args[1], args.length > 1) + if (parentTx) { + const result = await cleanupNatsForFogDb(fog, parentTx) + _scheduleK8sAfterCommit(parentTx, () => _applyCleanupNatsK8sExternal(result && result.k8sCleanup)) + return result + } + // No parent transaction — short tx + post-tx K8s (Plan 19-I-B / R131) + return _cleanupNatsForFogPhased(fog, { priority: 
PRIORITY_INTERACTIVE }) } function _getAffectedFogUuidsForApplication (applicationId, natsInstanceByFog, microservicesByFog) { @@ -1315,7 +1536,7 @@ async function _getAffectedFogUuidsForUserRule (userRuleId, natsInstanceByFog, t return out } -async function _reconcileResolverArtifactsOnce (options = {}, transaction) { +async function _reconcileResolverArtifactsOnceDb (options = {}, transaction) { const NatsAuthServiceRuntime = require('./nats-auth-service') const fogs = await FogManager.findAll({}, transaction) @@ -1474,18 +1695,8 @@ async function _reconcileResolverArtifactsOnce (options = {}, transaction) { } } - if (_isKubernetesControlPlane()) { - try { - const existing = await K8sClient.getConfigMap(K8S_NATS_JWT_BUNDLE_CONFIG_MAP, { ignoreNotFound: true }) - const existingData = existing && existing.data ? existing.data : null - const newHash = _configMapDataHash(fullServerJwtBundle) - const unchanged = existingData && _configMapDataHash(existingData) === newHash - if (!unchanged) { - await K8sClient.patchConfigMap(K8S_NATS_JWT_BUNDLE_CONFIG_MAP, { data: fullServerJwtBundle }, { ignoreNotFound: true }) - } - } catch (err) { - logger.warn(`Failed to patch Kubernetes NATS hub JWT bundle ConfigMap: ${err.message}`) - } + return { + fullServerJwtBundle: _isKubernetesControlPlane() ? fullServerJwtBundle : null } } @@ -1550,9 +1761,6 @@ function _chunkFogUuids (fogUuids, chunkSize) { } async function enqueueReconcileTask (options = {}, transaction) { - if (transaction.fakeTransaction) { - return databaseProvider.sequelize.transaction((t) => enqueueReconcileTask(options, t)) - } const reason = REASON_VALUES.includes(options.reason) ? options.reason : 'auth-mutation' const applicationId = options.applicationId != null ? options.applicationId : null const accountRuleId = options.accountRuleId != null ? 
options.accountRuleId : null @@ -1596,18 +1804,43 @@ async function claimNextTask (controllerUuid, stalenessSeconds) { return NatsReconcileTaskManager.claimNext(controllerUuid, stalenessSeconds) } -async function reconcileResolverArtifacts (options = {}, transaction) { +async function _reconcileResolverArtifactsDbLoop (options, transaction) { + let fullServerJwtBundle = null + do { + natsReconcilePending = false + const result = await _reconcileResolverArtifactsOnceDb(options, transaction) + if (result && result.fullServerJwtBundle) { + fullServerJwtBundle = result.fullServerJwtBundle + } + } while (natsReconcilePending) + return { fullServerJwtBundle } +} + +async function reconcileResolverArtifacts (...args) { if (natsReconcileRunning) { natsReconcilePending = true return { scheduled: true } } + const options = args[0] || {} + const maybeTransaction = args.length > 1 ? args[args.length - 1] : undefined + const parentTx = _resolveParentTransaction(maybeTransaction, args.length > 1) + natsReconcileRunning = true try { - do { - natsReconcilePending = false - await _reconcileResolverArtifactsOnce(options, transaction) - } while (natsReconcilePending) + if (parentTx) { + const result = await _reconcileResolverArtifactsDbLoop(options, parentTx) + if (result.fullServerJwtBundle) { + _scheduleK8sAfterCommit(parentTx, () => _applyK8sJwtBundleExternal(result.fullServerJwtBundle)) + } + return { scheduled: false } + } + + const result = await runInTransaction( + (transaction) => _reconcileResolverArtifactsDbLoop(options, transaction), + { priority: PRIORITY_BACKGROUND, label: 'nats.reconcileResolverArtifacts' } + ) + await _applyK8sJwtBundleExternal(result.fullServerJwtBundle) return { scheduled: false } } finally { natsReconcileRunning = false @@ -1643,17 +1876,30 @@ function scheduleResolverArtifactsReconcile (options = {}) { return { scheduled: true } } +function scheduleEnsureNatsK8sAfterCommit (transaction, k8sHubPatch) { + _scheduleK8sAfterCommit(transaction, () => 
_applyEnsureNatsK8sExternal(k8sHubPatch)) +} + function normalizeJetstreamSize (value, defaultValue) { return _normalizeJetstreamSize(value, defaultValue) } module.exports = { - ensureNatsForFog: TransactionDecorator.generateTransaction(ensureNatsForFog), - reconcileResolverArtifacts: TransactionDecorator.generateTransaction(reconcileResolverArtifacts), + ensureNatsForFog, + ensureNatsForFogPhased, + ensureNatsForFogDb, + ensureNatsForFogCertPrepDb, + ensureNatsForFogAuthPrepDb, + ensureNatsForFogTopologyDb, + ensureNatsForFogDbMutation, + scheduleEnsureNatsK8sAfterCommit, + reconcileResolverArtifacts, scheduleResolverArtifactsReconcile, enqueueReconcileTask: TransactionDecorator.generateTransaction(enqueueReconcileTask), claimNextTask, - cleanupNatsForFog: TransactionDecorator.generateTransaction(cleanupNatsForFog), + cleanupNatsForFog, + cleanupNatsForFogPhased, + cleanupNatsForFogDb, ensureLeafCredsForFog: TransactionDecorator.generateTransaction(ensureLeafCredsForFog), isReconcileRunning, setReconcilePending, diff --git a/src/services/registry-service.js b/src/services/registry-service.js index 41985fdc..8a20f281 100644 --- a/src/services/registry-service.js +++ b/src/services/registry-service.js @@ -1,5 +1,9 @@ const RegistryManager = require('../data/managers/registry-manager') const SecretHelper = require('../helpers/secret-helper') +const { + scheduleVaultDeleteAfterCommit, + scheduleVaultPromoteAfterCommit +} = require('../helpers/vault-transaction-helper') const Validator = require('../schemas') const Errors = require('../helpers/errors') const ErrorMessages = require('../helpers/error-messages') @@ -31,16 +35,22 @@ const createRegistry = async function (registry, transaction) { const createdRegistry = await RegistryManager.create(registryCreate, transaction) if (!isPasswordEmpty(registryCreate.password)) { - const encryptedPassword = await SecretHelper.encryptSecret( - { value: registryCreate.password }, - 'registry-' + createdRegistry.id, - 'registry' - 
) + const secretName = 'registry-' + createdRegistry.id + const secretData = { value: registryCreate.password } + const internalEncrypted = await SecretHelper.encryptSecretInternal(secretData, secretName) await RegistryManager.update( { id: createdRegistry.id }, - { password: encryptedPassword }, + { password: internalEncrypted }, transaction ) + scheduleVaultPromoteAfterCommit(transaction, { + secretData, + secretName, + secretType: 'registry', + model: () => require('../data/models').Registry, + where: { id: createdRegistry.id }, + field: 'password' + }) } await _updateChangeTracking(transaction) @@ -110,7 +120,7 @@ const updateRegistry = async function (registry, registryId, isCLI, transaction) registryUpdate = AppHelper.deleteUndefinedFields(registryUpdate) if (registryUpdate.password !== undefined && isPasswordEmpty(registryUpdate.password) && SecretHelper.isVaultReference(existingRegistry.password)) { - await SecretHelper.deleteSecret('registry-' + existingRegistry.id, 'registry') + scheduleVaultDeleteAfterCommit(transaction, 'registry-' + existingRegistry.id, 'registry') } const where = isCLI diff --git a/src/services/router-service.js b/src/services/router-service.js index 7663eb45..8241dc1e 100644 --- a/src/services/router-service.js +++ b/src/services/router-service.js @@ -251,6 +251,11 @@ async function updateConfig (routerID, containerEngine, transaction) { newConfig.connectors[connectorConfig.name] = connectorConfig } + // Service platform owns bridges.tcpConnectors/tcpListeners; fog recompute rebuilds listeners. 
+ if (currentConfig.bridges) { + newConfig.bridges = JSON.parse(JSON.stringify(currentConfig.bridges)) + } + await _ensureRouterTlsVolumeMountsAndMappings(router.iofogUuid, routerMicroservice.uuid, transaction, true) await ChangeTrackingService.update(router.iofogUuid, ChangeTrackingService.events.microserviceConfig, transaction) diff --git a/src/services/secret-service.js b/src/services/secret-service.js index 59500b20..e1b73c8e 100644 --- a/src/services/secret-service.js +++ b/src/services/secret-service.js @@ -11,8 +11,7 @@ const Validator = require('../schemas/index') const VolumeMountService = require('./volume-mount-service') const VolumeMountingManager = require('../data/managers/volume-mounting-manager') const CertificateManager = require('../data/managers/certificate-manager') -const SecretHelper = require('../helpers/secret-helper') -const vaultManager = require('../vault/vault-manager') +const { scheduleVaultDeleteAfterCommit } = require('../helpers/vault-transaction-helper') function validateBase64 (value) { try { @@ -186,37 +185,22 @@ async function deleteSecretEndpoint (secretName, transaction) { } await CertificateManager.deleteCertificate(certificate.name, transaction) await SecretManager.deleteSecret(secretName, transaction) - // Remove secret from external vault if configured - if (vaultManager.isEnabled()) { - await SecretHelper.deleteSecret(secretName, existingSecret.type) - } await _deleteVolumeMountsUsingSecret(secretName, transaction) } else { await CertificateManager.deleteCertificate(certificate.name, transaction) await _deleteVolumeMountsUsingSecret(secretName, transaction) await SecretManager.deleteSecret(secretName, transaction) - // Remove secret from external vault if configured - if (vaultManager.isEnabled()) { - await SecretHelper.deleteSecret(secretName, existingSecret.type) - } } } else { - // Delete secret from database and external vault await SecretManager.deleteSecret(secretName, transaction) await 
_deleteVolumeMountsUsingSecret(secretName, transaction) - // Remove secret from external vault if configured - if (vaultManager.isEnabled()) { - await SecretHelper.deleteSecret(secretName, existingSecret.type) - } } } else { await SecretManager.deleteSecret(secretName, transaction) await _deleteVolumeMountsUsingSecret(secretName, transaction) - // Remove secret from external vault if configured - if (vaultManager.isEnabled()) { - await SecretHelper.deleteSecret(secretName, existingSecret.type) - } } + + scheduleVaultDeleteAfterCommit(transaction, secretName, existingSecret.type) return {} } diff --git a/src/services/service-bridge-config.js b/src/services/service-bridge-config.js index e9c50e3d..07daf7e1 100644 --- a/src/services/service-bridge-config.js +++ b/src/services/service-bridge-config.js @@ -11,7 +11,7 @@ const ErrorMessages = require('../helpers/error-messages') const AppHelper = require('../helpers/app-helper') function isServiceDerivedBridgeKey (name) { - return typeof name === 'string' && (name.endsWith('-listener') || name.endsWith('-connector')) + return typeof name === 'string' && name.endsWith('-listener') } function stripServiceDerivedBridges (config) { @@ -27,13 +27,6 @@ function stripServiceDerivedBridges (config) { } } } - if (result.bridges.tcpConnectors) { - for (const key of Object.keys(result.bridges.tcpConnectors)) { - if (isServiceDerivedBridgeKey(key)) { - delete result.bridges.tcpConnectors[key] - } - } - } return result } diff --git a/src/services/service-platform-service.js b/src/services/service-platform-service.js index 14d97bfe..9cda3cf9 100644 --- a/src/services/service-platform-service.js +++ b/src/services/service-platform-service.js @@ -1,10 +1,9 @@ -const TransactionDecorator = require('../decorators/transaction-decorator') const config = require('../config') const Errors = require('../helpers/errors') const ErrorMessages = require('../helpers/error-messages') const AppHelper = require('../helpers/app-helper') const 
ServiceManager = require('../data/managers/service-manager') -const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') const HubRouterConfigLockManager = require('../data/managers/hub-router-config-lock-manager') const RouterManager = require('../data/managers/router-manager') @@ -13,6 +12,7 @@ const FogManager = require('../data/managers/iofog-manager') const ChangeTrackingService = require('./change-tracking-service') const ServicesService = require('./services-service') const K8sClient = require('../utils/k8s-client') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const { ensureSystemApplication, getSystemMicroserviceName @@ -80,10 +80,6 @@ async function _updateRouterMicroserviceConfig (fogNodeUuid, routerConfig, trans } async function _patchK8sRouterConfig (routerConfig) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } await K8sClient.patchConfigMap(K8S_ROUTER_CONFIG_MAP, { data: { 'skrouterd.json': JSON.stringify(routerConfig) @@ -102,28 +98,86 @@ async function _resolveHubListenerFogUuid (serviceConfig, transaction) { return serviceConfig.defaultBridge } -async function upsertHubTcpListener (serviceConfig, transaction) { - const isK8s = await ServicesService.checkKubernetesEnvironment() - const listener = ServicesService._buildTcpListener(serviceConfig) +function emptyK8sHubRouterPlan () { + return { + upsertListeners: [], + upsertConnectors: [], + deleteListenerNames: [], + deleteConnectorNames: [] + } +} - if (isK8s) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new 
Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) +function mergeK8sHubRouterPlans (...plans) { + const merged = emptyK8sHubRouterPlan() + for (const plan of plans) { + if (!plan) { + continue + } + merged.upsertListeners.push(...plan.upsertListeners) + merged.upsertConnectors.push(...plan.upsertConnectors) + merged.deleteListenerNames.push(...plan.deleteListenerNames) + merged.deleteConnectorNames.push(...plan.deleteConnectorNames) + } + return merged +} + +function applyK8sHubRouterPlanToConfig (routerConfig, plan) { + let updatedConfig = routerConfig + + for (const connectorName of plan.deleteConnectorNames) { + updatedConfig = updatedConfig.filter((item) => + !(item[0] === 'tcpConnector' && item[1].name === connectorName) + ) + } + for (const listenerName of plan.deleteListenerNames) { + updatedConfig = updatedConfig.filter((item) => + !(item[0] === 'tcpListener' && item[1].name === listenerName) + ) + } + for (const connector of plan.upsertConnectors) { + const connectorIndex = updatedConfig.findIndex((item) => + item[0] === 'tcpConnector' && item[1].name === connector.name + ) + if (connectorIndex !== -1) { + updatedConfig[connectorIndex] = ['tcpConnector', connector] + } else { + updatedConfig.push(['tcpConnector', connector]) } - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - const listenerIndex = routerConfig.findIndex((item) => + } + for (const listener of plan.upsertListeners) { + const listenerIndex = updatedConfig.findIndex((item) => item[0] === 'tcpListener' && item[1].name === listener.name ) if (listenerIndex !== -1) { - routerConfig[listenerIndex] = ['tcpListener', listener] + updatedConfig[listenerIndex] = ['tcpListener', listener] } else { - routerConfig.push(['tcpListener', listener]) + updatedConfig.push(['tcpListener', listener]) } - await _patchK8sRouterConfig(routerConfig) + } + + return updatedConfig +} + +async function applyK8sHubRouterPlan (plan) { + const hasChanges = plan.upsertListeners.length 
> 0 || + plan.upsertConnectors.length > 0 || + plan.deleteListenerNames.length > 0 || + plan.deleteConnectorNames.length > 0 + if (!hasChanges) { return } + const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) + if (!configMap) { + throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) + } + const routerConfig = JSON.parse(configMap.data['skrouterd.json']) + const updatedConfig = applyK8sHubRouterPlanToConfig(routerConfig, plan) + await _patchK8sRouterConfig(updatedConfig) +} + +async function upsertHubTcpListenerDb (serviceConfig, transaction) { + const listener = ServicesService._buildTcpListener(serviceConfig) const fogNodeUuid = await _resolveHubListenerFogUuid(serviceConfig, transaction) const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) const currentConfig = JSON.parse(routerMicroservice.config || '{}') @@ -137,30 +191,11 @@ async function upsertHubTcpListener (serviceConfig, transaction) { await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) } -async function upsertHubTcpConnector (serviceConfig, transaction) { - const isK8s = await ServicesService.checkKubernetesEnvironment() +async function upsertHubTcpConnectorDb (serviceConfig, transaction) { const targetRouterNode = await ServicesService._determineConnectorSiteId(serviceConfig, transaction) const connector = await ServicesService._buildTcpConnector(serviceConfig, transaction) if (targetRouterNode === 'default-router') { - if (isK8s) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - const connectorIndex = routerConfig.findIndex((item) => - item[0] === 'tcpConnector' && item[1].name === connector.name - ) - if (connectorIndex !== -1) { - routerConfig[connectorIndex] = ['tcpConnector', connector] - } else { - 
routerConfig.push(['tcpConnector', connector]) - } - await _patchK8sRouterConfig(routerConfig) - return - } - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) if (!defaultRouter) { throw new Errors.NotFoundError('Default router not found') @@ -192,25 +227,11 @@ async function upsertHubTcpConnector (serviceConfig, transaction) { await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) } -async function deleteHubTcpConnector (serviceConfig, transaction) { - const isK8s = await ServicesService.checkKubernetesEnvironment() +async function deleteHubTcpConnectorDb (serviceConfig, transaction) { const connectorName = `${serviceConfig.name}-connector` const targetRouterNode = await ServicesService._determineConnectorSiteId(serviceConfig, transaction) if (targetRouterNode === 'default-router') { - if (isK8s) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - const updatedConfig = routerConfig.filter((item) => - !(item[0] === 'tcpConnector' && item[1].name === connectorName) - ) - await _patchK8sRouterConfig(updatedConfig) - return - } - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) if (!defaultRouter) { throw new Errors.NotFoundError('Default router not found') @@ -234,23 +255,8 @@ async function deleteHubTcpConnector (serviceConfig, transaction) { await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) } -async function deleteHubTcpListener (serviceConfig, transaction) { - const isK8s = await ServicesService.checkKubernetesEnvironment() +async function deleteHubTcpListenerDb (serviceConfig, transaction) { const listenerName = `${serviceConfig.name}-listener` - - if (isK8s) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if 
(!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - const updatedConfig = routerConfig.filter((item) => - !(item[0] === 'tcpListener' && item[1].name === listenerName) - ) - await _patchK8sRouterConfig(updatedConfig) - return - } - const fogNodeUuid = await _resolveHubListenerFogUuid(serviceConfig, transaction) const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) const currentConfig = JSON.parse(routerMicroservice.config || '{}') @@ -260,15 +266,111 @@ async function deleteHubTcpListener (serviceConfig, transaction) { await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) } -async function acquireHubLockWithTimeout (controllerUuid, transaction) { +async function planHubTcpConnectorUpsert (serviceConfig, transaction) { + const targetRouterNode = await ServicesService._determineConnectorSiteId(serviceConfig, transaction) + const connector = await ServicesService._buildTcpConnector(serviceConfig, transaction) + + if (targetRouterNode === 'default-router') { + return { + ...emptyK8sHubRouterPlan(), + upsertConnectors: [connector] + } + } + + await upsertHubTcpConnectorDb(serviceConfig, transaction) + return emptyK8sHubRouterPlan() +} + +async function planHubTcpConnectorDelete (serviceConfig, transaction) { + const connectorName = `${serviceConfig.name}-connector` + const targetRouterNode = await ServicesService._determineConnectorSiteId(serviceConfig, transaction) + + if (targetRouterNode === 'default-router') { + return { + ...emptyK8sHubRouterPlan(), + deleteConnectorNames: [connectorName] + } + } + + await deleteHubTcpConnectorDb(serviceConfig, transaction) + return emptyK8sHubRouterPlan() +} + +async function planHubTcpListenerUpsert (serviceConfig) { + const listener = ServicesService._buildTcpListener(serviceConfig) + return { + ...emptyK8sHubRouterPlan(), + upsertListeners: [listener] + } +} 
+ +async function planHubTcpListenerDelete (serviceConfig) { + return { + ...emptyK8sHubRouterPlan(), + deleteListenerNames: [`${serviceConfig.name}-listener`] + } +} + +async function upsertHubTcpListener (serviceConfig, transaction) { + const isK8s = await ServicesService.checkKubernetesEnvironment() + + if (isK8s) { + const plan = await planHubTcpListenerUpsert(serviceConfig) + await applyK8sHubRouterPlan(plan) + return + } + + await upsertHubTcpListenerDb(serviceConfig, transaction) +} + +async function upsertHubTcpConnector (serviceConfig, transaction) { + const isK8s = await ServicesService.checkKubernetesEnvironment() + + if (isK8s) { + const plan = await planHubTcpConnectorUpsert(serviceConfig, transaction) + await applyK8sHubRouterPlan(plan) + return + } + + await upsertHubTcpConnectorDb(serviceConfig, transaction) +} + +async function deleteHubTcpConnector (serviceConfig, transaction) { + const isK8s = await ServicesService.checkKubernetesEnvironment() + + if (isK8s) { + const plan = await planHubTcpConnectorDelete(serviceConfig, transaction) + await applyK8sHubRouterPlan(plan) + return + } + + await deleteHubTcpConnectorDb(serviceConfig, transaction) +} + +async function deleteHubTcpListener (serviceConfig, transaction) { + const isK8s = await ServicesService.checkKubernetesEnvironment() + + if (isK8s) { + const plan = await planHubTcpListenerDelete(serviceConfig) + await applyK8sHubRouterPlan(plan) + return + } + + await deleteHubTcpListenerDb(serviceConfig, transaction) +} + +async function acquireHubLockWithTimeout (controllerUuid) { const timeoutSeconds = config.get('settings.hubRouterConfigLockTimeoutSeconds', 120) const deadline = Date.now() + timeoutSeconds * 1000 while (Date.now() < deadline) { - const acquired = await HubRouterConfigLockManager.tryAcquire( - controllerUuid, - timeoutSeconds, - transaction + const acquired = await runInTransaction( + (transaction) => HubRouterConfigLockManager.tryAcquire( + controllerUuid, + timeoutSeconds, + 
transaction + ), + { priority: PRIORITY_BACKGROUND, label: 'servicePlatform.hubLockAcquire' } ) if (acquired) { return true @@ -279,6 +381,13 @@ async function acquireHubLockWithTimeout (controllerUuid, transaction) { throw new Error(`Timed out waiting for hub router ConfigMap lock after ${timeoutSeconds}s`) } +async function releaseHubLock (controllerUuid) { + await runInTransaction( + (transaction) => HubRouterConfigLockManager.release(controllerUuid, transaction), + { priority: PRIORITY_BACKGROUND, label: 'servicePlatform.hubLockRelease' } + ) +} + async function watchLoadBalancerWithTimeout (serviceName) { const timeoutSeconds = config.get('settings.serviceLoadBalancerWatchTimeoutSeconds', 300) const retryInterval = 2000 @@ -302,19 +411,28 @@ function needsK8sService (serviceConfig, isK8s) { serviceType === 'external' } -async function reconcileK8sService (serviceConfig, isK8s, transaction) { +async function reconcileK8sServiceExternal (serviceConfig, isK8s) { if (!needsK8sService(serviceConfig, isK8s)) { return } - await ServicesService._updateK8sService(serviceConfig, transaction) + const loadBalancerIP = await ServicesService._syncK8sServiceResource(serviceConfig) - if ((serviceConfig.k8sType || '').toLowerCase() === 'loadbalancer') { - const loadBalancerIP = await watchLoadBalancerWithTimeout(serviceConfig.name) - await ServiceManager.update( - { name: serviceConfig.name }, - { serviceEndpoint: loadBalancerIP }, - transaction + if ((serviceConfig.k8sType || '').toLowerCase() === 'loadbalancer' && !loadBalancerIP) { + const timeoutSeconds = config.get('settings.serviceLoadBalancerWatchTimeoutSeconds', 300) + throw new Error( + `LoadBalancer IP not assigned for service ${serviceConfig.name} within ${timeoutSeconds}s` + ) + } + + if (loadBalancerIP) { + await runInTransaction( + (transaction) => ServiceManager.update( + { name: serviceConfig.name }, + { serviceEndpoint: loadBalancerIP }, + transaction + ), + { priority: PRIORITY_BACKGROUND, label: 
'servicePlatform.k8sLoadBalancerEndpoint' } ) } } @@ -322,7 +440,7 @@ async function reconcileK8sService (serviceConfig, isK8s, transaction) { async function fanOutFogReconcile (serviceTags, transaction) { const fogUuids = await ServicesService.handleServiceDistribution(serviceTags, transaction) for (const fogUuid of fogUuids) { - await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + await ReconcileOutboxManager.enqueueFogPlatform({ fogUuid, reason: 'service-changed' }, transaction) @@ -331,34 +449,53 @@ async function fanOutFogReconcile (serviceTags, transaction) { } async function reconcileServiceHub (serviceConfig, snapshot, transaction) { + const plans = [] + if (snapshot && snapshot.resource != null && serviceConfig.resource != null && snapshot.resource !== serviceConfig.resource) { - await deleteHubTcpConnector(buildServiceConfigFromRow(snapshot), transaction) + plans.push(await planHubTcpConnectorDelete(buildServiceConfigFromRow(snapshot), transaction)) } - await upsertHubTcpConnector(serviceConfig, transaction) - await upsertHubTcpListener(serviceConfig, transaction) + plans.push(await planHubTcpConnectorUpsert(serviceConfig, transaction)) + plans.push(await planHubTcpListenerUpsert(serviceConfig)) + + return mergeK8sHubRouterPlans(...plans) } -async function reconcileServiceDeleteHub (serviceConfig, isK8s, transaction) { - await deleteHubTcpConnector(serviceConfig, transaction) - await deleteHubTcpListener(serviceConfig, transaction) +async function reconcileServiceDeleteHub (serviceConfig, transaction) { + const plans = [ + await planHubTcpConnectorDelete(serviceConfig, transaction), + await planHubTcpListenerDelete(serviceConfig) + ] + return mergeK8sHubRouterPlans(...plans) +} - if (isK8s && (serviceConfig.type || '').toLowerCase() !== 'k8s') { - await ServicesService._deleteK8sService(serviceConfig.name) +async function reconcileServiceHubDb (serviceConfig, snapshot, transaction) { + if (snapshot && + snapshot.resource != null && + 
serviceConfig.resource != null && + snapshot.resource !== serviceConfig.resource) { + await deleteHubTcpConnectorDb(buildServiceConfigFromRow(snapshot), transaction) } + + await upsertHubTcpConnectorDb(serviceConfig, transaction) + await upsertHubTcpListenerDb(serviceConfig, transaction) } -async function reconcileService (serviceName, task, transaction) { +async function reconcileServiceDeleteHubDb (serviceConfig, transaction) { + await deleteHubTcpConnectorDb(serviceConfig, transaction) + await deleteHubTcpListenerDb(serviceConfig, transaction) +} + +async function reconcileService (serviceName, task) { const startedAt = Date.now() const isDelete = task && task.reason === 'delete' const snapshot = task ? ServicePlatformReconcileTaskManager.getParsedSpecSnapshot(task) : null const controllerUuid = getControllerUuid() - let hubLockHeld = false - try { + const prep = await runInTransaction(async (transaction) => { let serviceConfig = null let fanOutTags = [] @@ -383,36 +520,57 @@ async function reconcileService (serviceName, task, transaction) { ) } - const isK8s = await ServicesService.checkKubernetesEnvironment() + return { serviceConfig, fanOutTags } + }, { priority: PRIORITY_BACKGROUND, label: 'servicePlatform.prepare' }) - if (isK8s) { - await acquireHubLockWithTimeout(controllerUuid, transaction) - hubLockHeld = true - } + const isK8s = await ServicesService.checkKubernetesEnvironment() - if (isDelete) { - await reconcileServiceDeleteHub(serviceConfig, isK8s, transaction) + try { + if (isK8s) { + await acquireHubLockWithTimeout(controllerUuid) + try { + const hubPlan = await runInTransaction(async (transaction) => { + if (isDelete) { + return reconcileServiceDeleteHub(prep.serviceConfig, transaction) + } + return reconcileServiceHub(prep.serviceConfig, snapshot, transaction) + }, { priority: PRIORITY_BACKGROUND, label: 'servicePlatform.hubReconcile' }) + + await applyK8sHubRouterPlan(hubPlan) + + if (isDelete) { + if ((prep.serviceConfig.type || 
'').toLowerCase() !== 'k8s') { + await ServicesService._deleteK8sService(prep.serviceConfig.name) + } + } else { + await reconcileK8sServiceExternal(prep.serviceConfig, isK8s) + } + } finally { + await releaseHubLock(controllerUuid) + } } else { - await reconcileServiceHub(serviceConfig, snapshot, transaction) - await reconcileK8sService(serviceConfig, isK8s, transaction) - } - - if (hubLockHeld) { - await HubRouterConfigLockManager.release(controllerUuid, transaction) - hubLockHeld = false + await runInTransaction(async (transaction) => { + if (isDelete) { + await reconcileServiceDeleteHubDb(prep.serviceConfig, transaction) + } else { + await reconcileServiceHubDb(prep.serviceConfig, snapshot, transaction) + } + }, { priority: PRIORITY_BACKGROUND, label: 'servicePlatform.hubDb' }) } - await fanOutFogReconcile(fanOutTags, transaction) - - if (!isDelete) { - await ServiceManager.update( - { name: serviceName }, - { provisioningStatus: 'ready', provisioningError: null }, - transaction - ) - } else if (task && task.id != null) { - await ServicePlatformReconcileTaskManager.delete({ id: task.id }, transaction) - } + await runInTransaction(async (transaction) => { + await fanOutFogReconcile(prep.fanOutTags, transaction) + + if (!isDelete) { + await ServiceManager.update( + { name: serviceName }, + { provisioningStatus: 'ready', provisioningError: null }, + transaction + ) + } else if (task && task.id != null) { + await ServicePlatformReconcileTaskManager.delete({ id: task.id }, transaction) + } + }, { priority: PRIORITY_BACKGROUND, label: 'servicePlatform.finalize' }) logger.info('servicePlatformReconcile completed', { serviceName, @@ -427,17 +585,6 @@ async function reconcileService (serviceName, task, transaction) { provisioningStatus: isDelete ? 
null : 'ready' } } catch (error) { - if (hubLockHeld) { - try { - await HubRouterConfigLockManager.release(controllerUuid, transaction) - } catch (releaseError) { - logger.warn('servicePlatformReconcile failed to release hub lock', { - serviceName, - error: releaseError.message - }) - } - } - logger.error('servicePlatformReconcile failed', { serviceName, reason: task ? task.reason : null, @@ -448,8 +595,6 @@ async function reconcileService (serviceName, task, transaction) { } } -const bypassOptions = { bypassQueue: true } - module.exports = { normalizeTags, unionTags, @@ -459,7 +604,10 @@ module.exports = { deleteHubTcpConnector, deleteHubTcpListener, acquireHubLockWithTimeout, + releaseHubLock, watchLoadBalancerWithTimeout, fanOutFogReconcile, - reconcileService: TransactionDecorator.generateTransaction(reconcileService, bypassOptions) + applyK8sHubRouterPlan, + applyK8sHubRouterPlanToConfig, + reconcileService } diff --git a/src/services/services-service.js b/src/services/services-service.js index a04bcbee..bd5cf1c8 100644 --- a/src/services/services-service.js +++ b/src/services/services-service.js @@ -13,7 +13,7 @@ const logger = require('../logger') const FogManager = require('../data/managers/iofog-manager') const TagsManager = require('../data/managers/tags-manager') const ChangeTrackingService = require('./change-tracking-service') -const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') +const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') const ApplicationManager = require('../data/managers/application-manager') const { ensureSystemApplication, @@ -22,7 +22,6 @@ const { const { getServiceAnnotationTag, getComponentLabelKey, getAppLabelKey } = require('../config/flavor') // const { Op } = require('sequelize') -const K8S_ROUTER_CONFIG_MAP = 'iofog-router' const EDGELET_BRIDGE_CONNECTOR_HOST = 'edgelet.default.svc.bridge.local' const INTERIOR_BRIDGE_CONNECTOR_HOST = 
'127.0.0.1' @@ -42,7 +41,7 @@ async function _setTags (serviceModel, tagsArray, transaction) { } tags.push(tagModel) } - await serviceModel.setTags(tags) + await serviceModel.setTags(tags, { transaction }) } } @@ -83,8 +82,23 @@ function _mergeServiceFieldsForSnapshot (base, patch, snapshotTags) { }) } +function _serviceToSpecSnapshotFields (service) { + return { + name: service.name, + type: service.type, + resource: service.resource, + defaultBridge: service.defaultBridge, + bridgePort: service.bridgePort, + targetPort: service.targetPort, + servicePort: service.servicePort, + k8sType: service.k8sType, + serviceEndpoint: service.serviceEndpoint, + tags: _mapTags(service) + } +} + async function _enqueueServiceReconcileTask (serviceName, reason, specSnapshot, transaction) { - await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask({ + await ReconcileOutboxManager.enqueueServicePlatform({ serviceName, reason, specSnapshot @@ -442,22 +456,6 @@ async function _getRouterMicroservice (fogNodeUuid, transaction) { return routerMicroservice } -// Helper function to update router config in Kubernetes environment -async function _updateK8sRouterConfig (config) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - - const patchData = { - data: { - 'skrouterd.json': JSON.stringify(config) - } - } - - await K8sClient.patchConfigMap(K8S_ROUTER_CONFIG_MAP, patchData) -} - // Helper function to update router microservice config async function _updateRouterMicroserviceConfig (fogNodeUuid, config, transaction) { const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) @@ -473,455 +471,6 @@ async function _updateRouterMicroserviceConfig (fogNodeUuid, config, transaction await ChangeTrackingService.update(fogNodeUuid, ChangeTrackingService.events.microserviceConfig, transaction) } -// Helper function to add 
tcpConnector to router config -async function _addTcpConnector (serviceConfig, transaction) { - const isK8s = await checkKubernetesEnvironment() - const targetRouterNode = await _determineConnectorSiteId(serviceConfig, transaction) - const connector = await _buildTcpConnector(serviceConfig, transaction) - - if (targetRouterNode === 'default-router') { - if (isK8s) { - // Update K8s router config - logger.debug('Updating K8s router config') - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - logger.error('ConfigMap not found:' + K8S_ROUTER_CONFIG_MAP) - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - // Add new connector to the array - routerConfig.push(['tcpConnector', connector]) - - await _updateK8sRouterConfig(routerConfig) - } else { - // Update default router microservice config - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - if (!defaultRouter) { - logger.error('Default router not found') - throw new Errors.NotFoundError('Default router not found') - } - const fogNodeUuid = defaultRouter.iofogUuid - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - - if (!currentConfig.bridges) { - currentConfig.bridges = {} - } - if (!currentConfig.bridges.tcpConnectors) { - currentConfig.bridges.tcpConnectors = {} - } - currentConfig.bridges.tcpConnectors[connector.name] = connector - - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - } - } else { - // Update specific router microservice config - const fogNodeUuid = targetRouterNode - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - - if (!currentConfig.bridges) { - currentConfig.bridges = {} - } - if 
(!currentConfig.bridges.tcpConnectors) { - currentConfig.bridges.tcpConnectors = {} - } - currentConfig.bridges.tcpConnectors[connector.name] = connector - - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - } -} - -// Helper function to add tcpListener to router config -async function _addTcpListener (serviceConfig, transaction) { - const isK8s = await checkKubernetesEnvironment() - - // First handle K8s case if we're in K8s environment - if (isK8s) { - const k8sListener = _buildTcpListener(serviceConfig) - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - logger.error('ConfigMap not found:' + K8S_ROUTER_CONFIG_MAP) - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - // Add new listener to the array - routerConfig.push(['tcpListener', k8sListener]) - - await _updateK8sRouterConfig(routerConfig) - } - - // Handle distributed router microservice cases - // Get list of fog nodes that need this listener - const fogNodeUuids = await handleServiceDistribution(serviceConfig.tags, transaction) - - // If not in K8s environment, always include default router - if (!isK8s) { - if (serviceConfig.defaultBridge === 'default-router') { - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - if (!defaultRouter) { - logger.error('Default router not found') - throw new Errors.NotFoundError('Default router not found') - } - // Add default router if not already in the list - if (!fogNodeUuids.includes(defaultRouter.iofogUuid)) { - fogNodeUuids.push(defaultRouter.iofogUuid) - } - } else { - if (!fogNodeUuids.includes(serviceConfig.defaultBridge)) { - fogNodeUuids.push(serviceConfig.defaultBridge) - } - } - } - // else if (!fogNodeUuids || fogNodeUuids.length === 0) { - // // If in K8s and no fog nodes found, add default router - // const defaultRouter = await 
RouterManager.findOne({ isDefault: true }, transaction) - // if (!defaultRouter) { - // logger.error('Default router not found') - // throw new Errors.NotFoundError('Default router not found') - // } - // fogNodeUuids.push(defaultRouter.iofogUuid) - // } - - // Add listener to each router microservice - for (const fogNodeUuid of fogNodeUuids) { - try { - const listener = _buildTcpListener(serviceConfig) - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - if (!currentConfig.bridges) currentConfig.bridges = {} - if (!currentConfig.bridges.tcpListeners) currentConfig.bridges.tcpListeners = {} - currentConfig.bridges.tcpListeners[listener.name] = listener - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - } catch (err) { - if (err instanceof Errors.NotFoundError) { - logger.info(`Router microservice not found for fogNodeUuid ${fogNodeUuid}, skipping.`) - continue - } - throw err - } - } -} - -// Helper function to update tcpConnector in router config -async function _updateTcpConnector (serviceConfig, transaction) { - const isK8s = await checkKubernetesEnvironment() - const targetRouterNode = await _determineConnectorSiteId(serviceConfig, transaction) - const connector = await _buildTcpConnector(serviceConfig, transaction) - - if (targetRouterNode === 'default-router') { - if (isK8s) { - // Update K8s router config - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - // Find and update the existing connector - const connectorIndex = routerConfig.findIndex(item => - item[0] === 'tcpConnector' && item[1].name === connector.name - ) - if (connectorIndex !== -1) { - routerConfig[connectorIndex] = ['tcpConnector', connector] - } - - await 
_updateK8sRouterConfig(routerConfig) - } else { - // Update default router microservice config - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - if (!defaultRouter) { - throw new Errors.NotFoundError('Default router not found') - } - const fogNodeUuid = defaultRouter.iofogUuid - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - - if (!currentConfig.bridges) { - currentConfig.bridges = {} - } - if (!currentConfig.bridges.tcpConnectors) { - currentConfig.bridges.tcpConnectors = {} - } - // Update the connector with the same name - currentConfig.bridges.tcpConnectors[connector.name] = connector - - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - } - } else { - // Update specific router microservice config - const fogNodeUuid = targetRouterNode - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - - if (!currentConfig.bridges) { - currentConfig.bridges = {} - } - if (!currentConfig.bridges.tcpConnectors) { - currentConfig.bridges.tcpConnectors = {} - } - // Update the connector with the same name - currentConfig.bridges.tcpConnectors[connector.name] = connector - - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - } -} - -// // Helper function to update tcpListener in router config -// async function _updateTcpListener (serviceConfig, transaction) { -// const isK8s = await checkKubernetesEnvironment() - -// // First handle K8s case if we're in K8s environment -// if (isK8s) { -// const k8sListener = await _buildTcpListener(serviceConfig, null) // null for K8s case -// const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) -// if (!configMap) { -// throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) -// } - -// const 
routerConfig = JSON.parse(configMap.data['skrouterd.json']) -// // Update the listener in the array -// const listenerIndex = routerConfig.findIndex(item => -// item[0] === 'tcpListener' && item[1].name === k8sListener.name -// ) -// if (listenerIndex !== -1) { -// routerConfig[listenerIndex] = ['tcpListener', k8sListener] -// } else { -// routerConfig.push(['tcpListener', k8sListener]) -// } - -// await _updateK8sRouterConfig(routerConfig) -// } - -// // Handle distributed router microservice cases -// // Get list of fog nodes that need this listener -// const fogNodeUuids = await handleServiceDistribution(serviceConfig.tags, transaction) -// // If not in K8s environment, always include default router -// if (!isK8s) { -// const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) -// if (!defaultRouter) { -// throw new Errors.NotFoundError('Default router not found') -// } -// // Add default router if not already in the list -// if (!fogNodeUuids.includes(defaultRouter.iofogUuid)) { -// fogNodeUuids.push(defaultRouter.iofogUuid) -// } -// } -// // else if (!fogNodeUuids || fogNodeUuids.length === 0) { -// // // If in K8s and no fog nodes found, add default router -// // const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) -// // if (!defaultRouter) { -// // throw new Errors.NotFoundError('Default router not found') -// // } -// // fogNodeUuids.push(defaultRouter.iofogUuid) -// // } - -// // Update listener in each router microservice -// for (const fogNodeUuid of fogNodeUuids) { -// try { -// const listener = await _buildTcpListener(serviceConfig, fogNodeUuid) -// const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) -// const currentConfig = JSON.parse(routerMicroservice.config || '{}') - -// if (!currentConfig.bridges) { -// currentConfig.bridges = {} -// } -// if (!currentConfig.bridges.tcpListeners) { -// currentConfig.bridges.tcpListeners = {} -// } -// // Update listener 
with its name as key -// currentConfig.bridges.tcpListeners[listener.name] = listener - -// await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) -// } catch (err) { -// if (err instanceof Errors.NotFoundError) { -// logger.info(`Router microservice not found for fogNodeUuid ${fogNodeUuid}, skipping.`) -// continue -// } -// throw err -// } -// } -// } - -// Helper function to delete tcpConnector from router config -async function _deleteTcpConnector (serviceName, transaction) { - logger.debug('_deleteTcpConnector: start', { serviceName }) - const isK8s = await checkKubernetesEnvironment() - const connectorName = `${serviceName}-connector` - - // Get service to determine if it's using default router - const service = await ServiceManager.findOne({ name: serviceName }, transaction) - if (!service) { - throw new Errors.NotFoundError(`Service not found: ${serviceName}`) - } - logger.debug('_deleteTcpConnector: service', { type: service.type, resource: service.resource, defaultBridge: service.defaultBridge }) - - const isDefaultRouter = service.defaultBridge === 'default-router' - let microserviceSource = null - if (service.type === 'microservice') { - microserviceSource = await MicroserviceManager.findOne({ uuid: service.resource }, transaction) - } - let fogSource = null - if (service.type === 'agent') { - fogSource = await FogManager.findOne({ uuid: service.resource }, transaction) - if (!fogSource) { - fogSource = await FogManager.findOne({ name: service.resource }, transaction) - } - } - - if (isDefaultRouter && (!microserviceSource || !fogSource)) { - logger.debug('_deleteTcpConnector: updating default router config') - if (isK8s) { - // Update K8s router config - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - // Remove the connector from 
the array - const updatedConfig = routerConfig.filter(item => - !(item[0] === 'tcpConnector' && item[1].name === connectorName) - ) - - await _updateK8sRouterConfig(updatedConfig) - } else { - // Update default router microservice config - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - if (!defaultRouter) { - throw new Errors.NotFoundError('Default router not found') - } - const fogNodeUuid = defaultRouter.iofogUuid - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - - if (currentConfig.bridges && currentConfig.bridges.tcpConnectors) { - delete currentConfig.bridges.tcpConnectors[connectorName] - } - - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - } - logger.debug('_deleteTcpConnector: done (default router updated)') - return - } - - let fogNodeUuid = null - if (!isDefaultRouter && (!microserviceSource || !fogSource)) { - fogNodeUuid = service.defaultBridge - } - if (microserviceSource) { - fogNodeUuid = microserviceSource.iofogUuid - } - if (fogSource) { - fogNodeUuid = fogSource.uuid - } - logger.debug('_deleteTcpConnector: fogNodeUuid for non-default', { fogNodeUuid }) - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - - if (currentConfig.bridges && currentConfig.bridges.tcpConnectors) { - delete currentConfig.bridges.tcpConnectors[connectorName] - } - - logger.debug('_deleteTcpConnector: updating router config', { fogNodeUuid }) - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - logger.debug('_deleteTcpConnector: done') -} - -// Helper function to delete tcpListener from router config -async function _deleteTcpListener (serviceName, transaction) { - logger.debug('_deleteTcpListener: start', { serviceName }) - const isK8s = await 
checkKubernetesEnvironment() - const listenerName = `${serviceName}-listener` - - // First handle K8s case if we're in K8s environment - if (isK8s) { - const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) - if (!configMap) { - throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`) - } - - const routerConfig = JSON.parse(configMap.data['skrouterd.json']) - // Remove the listener from the array - const updatedConfig = routerConfig.filter(item => - !(item[0] === 'tcpListener' && item[1].name === listenerName) - ) - - await _updateK8sRouterConfig(updatedConfig) - } - - // Get service to determine its tags for distribution - const service = await ServiceManager.findOneWithTags({ name: serviceName }, transaction) - if (!service) { - throw new Errors.NotFoundError(`Service not found: ${serviceName}`) - } - logger.debug('_deleteTcpListener: service', { type: service.type, hasTags: !!service.tags, tagsIsArray: Array.isArray(service.tags) }) - - let microserviceSource = null - if (service.type === 'microservice') { - microserviceSource = await MicroserviceManager.findOne({ uuid: service.resource }, transaction) - } - // Handle distributed router microservice cases - // Get list of fog nodes that need this listener removed - const serviceTags = (service.tags && Array.isArray(service.tags)) ? service.tags.map(tag => tag.value) : [] - logger.debug('_deleteTcpListener: calling handleServiceDistribution', { serviceTagsLength: serviceTags.length, serviceTagsSample: serviceTags.slice(0, 3) }) - const fogNodeUuids = await handleServiceDistribution(serviceTags, transaction) - logger.debug('_deleteTcpListener: handleServiceDistribution returned', { fogNodeUuidsLength: fogNodeUuids ? 
fogNodeUuids.length : 'null/undefined', isArray: Array.isArray(fogNodeUuids) }) - - if (microserviceSource) { - if (!fogNodeUuids.includes(microserviceSource.iofogUuid)) { - fogNodeUuids.push(microserviceSource.iofogUuid) - } - } - // If not in K8s environment, always include default router - if (!isK8s) { - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - if (!defaultRouter) { - throw new Errors.NotFoundError('Default router not found') - } - // Add default router if not already in the list - if (!fogNodeUuids.includes(defaultRouter.iofogUuid)) { - fogNodeUuids.push(defaultRouter.iofogUuid) - } - } - // else if (!fogNodeUuids || fogNodeUuids.length === 0) { - // // If in K8s and no fog nodes found, add default router - // const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - // if (!defaultRouter) { - // throw new Errors.NotFoundError('Default router not found') - // } - // fogNodeUuids.push(defaultRouter.iofogUuid) - // } - - // Remove listener from each router microservice - const fogList = Array.isArray(fogNodeUuids) ? 
fogNodeUuids : [] - logger.debug('_deleteTcpListener: iterating router configs', { count: fogList.length }) - for (const fogNodeUuid of fogList) { - try { - const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction) - const currentConfig = JSON.parse(routerMicroservice.config || '{}') - if (currentConfig.bridges && currentConfig.bridges.tcpListeners) { - delete currentConfig.bridges.tcpListeners[listenerName] - } - await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction) - } catch (err) { - if (err instanceof Errors.NotFoundError) { - logger.info('_deleteTcpListener: router microservice not found, skipping', { fogNodeUuid }) - continue - } - logger.error({ - err, - msg: '_deleteTcpListener: error updating router config', - fogNodeUuid - }) - throw err - } - } - logger.debug('_deleteTcpListener: done') -} - // Common labels for Kubernetes services created by the controller function _getK8sServiceLabels () { const componentLabelKey = getComponentLabelKey() @@ -935,60 +484,41 @@ function _getK8sServiceLabels () { } } -// Helper function to create Kubernetes service -async function _createK8sService (serviceConfig, transaction) { - const normalizedTags = serviceConfig.tags.map(tag => tag.includes(':') ? tag : `${tag}:`) - const componentLabelKey = getComponentLabelKey() - const serviceSpec = { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: serviceConfig.name, - labels: _getK8sServiceLabels(), - annotations: normalizedTags.reduce((acc, tag) => { - const [key, value] = tag.split(':') - acc[key] = (value || '').trim() - return acc - }, {}) - }, - spec: { - type: serviceConfig.k8sType, - selector: { - [componentLabelKey]: 'router' +// Helper function to create or update a Kubernetes service resource (I/O only; no DB). +// Returns LoadBalancer IP when assigned, otherwise null. 
+async function _syncK8sServiceResource (serviceConfig) { + const existingService = await K8sClient.getService(serviceConfig.name, { ignoreNotFound: true }) + if (!existingService) { + logger.debug(`Service not found: ${serviceConfig.name}, creating new service`) + const normalizedTags = serviceConfig.tags.map(tag => tag.includes(':') ? tag : `${tag}:`) + const componentLabelKey = getComponentLabelKey() + const serviceSpec = { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: serviceConfig.name, + labels: _getK8sServiceLabels(), + annotations: normalizedTags.reduce((acc, tag) => { + const [key, value] = tag.split(':') + acc[key] = (value || '').trim() + return acc + }, {}) }, - ports: [{ - name: 'iofog-service', - targetPort: parseInt(serviceConfig.bridgePort), - port: parseInt(serviceConfig.servicePort), - protocol: 'TCP' - }] - } - } - - const service = await K8sClient.createService(serviceSpec) - - // If LoadBalancer type, wait for and set the external IP - if (serviceConfig.k8sType === 'LoadBalancer') { - const loadBalancerIP = await K8sClient.watchLoadBalancerIP(serviceConfig.name) - if (loadBalancerIP) { - await ServiceManager.update( - { name: serviceConfig.name }, - { serviceEndpoint: loadBalancerIP }, - transaction - ) + spec: { + type: serviceConfig.k8sType, + selector: { + [componentLabelKey]: 'router' + }, + ports: [{ + name: 'iofog-service', + targetPort: parseInt(serviceConfig.bridgePort), + port: parseInt(serviceConfig.servicePort), + protocol: 'TCP' + }] + } } - } - return service -} - -// Helper function to update Kubernetes service -async function _updateK8sService (serviceConfig, transaction) { - const existingService = await K8sClient.getService(serviceConfig.name) - if (!existingService) { - logger.debug(`Service not found: ${serviceConfig.name}, creating new service`) - const service = await _createK8sService(serviceConfig, transaction) - return service + await K8sClient.createService(serviceSpec) } else { const normalizedTags = 
serviceConfig.tags.map(tag => tag.includes(':') ? tag : `${tag}:`) const componentLabelKey = getComponentLabelKey() @@ -1016,20 +546,38 @@ async function _updateK8sService (serviceConfig, transaction) { } logger.debug(`Updating service: ${serviceConfig.name}`) - const updatedService = await K8sClient.updateService(serviceConfig.name, patchData) - - // If LoadBalancer type, wait for and set the external IP - if (serviceConfig.k8sType === 'LoadBalancer') { - const loadBalancerIP = await K8sClient.watchLoadBalancerIP(serviceConfig.name) - if (loadBalancerIP) { - await ServiceManager.update( - { name: serviceConfig.name }, - { serviceEndpoint: loadBalancerIP }, - transaction - ) - } - } - return updatedService + await K8sClient.updateService(serviceConfig.name, patchData) + } + + if (serviceConfig.k8sType === 'LoadBalancer') { + return K8sClient.watchLoadBalancerIP(serviceConfig.name) + } + + return null +} + +// Helper function to create Kubernetes service +async function _createK8sService (serviceConfig, transaction) { + const loadBalancerIP = await _syncK8sServiceResource(serviceConfig) + if (loadBalancerIP) { + await ServiceManager.update( + { name: serviceConfig.name }, + { serviceEndpoint: loadBalancerIP }, + transaction + ) + } + return loadBalancerIP +} + +// Helper function to update Kubernetes service +async function _updateK8sService (serviceConfig, transaction) { + const loadBalancerIP = await _syncK8sServiceResource(serviceConfig) + if (loadBalancerIP) { + await ServiceManager.update( + { name: serviceConfig.name }, + { serviceEndpoint: loadBalancerIP }, + transaction + ) } } @@ -1226,10 +774,7 @@ async function deleteServiceEndpoint (serviceName, transaction) { } logger.debug('deleteServiceEndpoint: existingService', { type: existingService.type, defaultBridge: existingService.defaultBridge }) - const specSnapshot = _buildServiceSpecSnapshot({ - ...existingService, - tags: _mapTags(existingService) - }) + const specSnapshot = 
_buildServiceSpecSnapshot(_serviceToSpecSnapshotFields(existingService)) await _enqueueServiceReconcileTask(serviceName, 'delete', specSnapshot, transaction) logger.debug('deleteServiceEndpoint: deleting service from DB') @@ -1255,10 +800,7 @@ async function reconcileServiceEndpoint (serviceName, transaction) { service.provisioningError = null } - const specSnapshot = _buildServiceSpecSnapshot({ - ...service, - tags: _mapTags(service) - }) + const specSnapshot = _buildServiceSpecSnapshot(_serviceToSpecSnapshotFields(service)) await _enqueueServiceReconcileTask(serviceName, 'manual-retry', specSnapshot, transaction) return { @@ -1365,6 +907,7 @@ module.exports = { _mapTags, _setTags: TransactionDecorator.generateTransaction(_setTags), _createK8sService, + _syncK8sServiceResource, _updateK8sService, _deleteK8sService, createServiceEndpoint: TransactionDecorator.generateTransaction(createServiceEndpoint), @@ -1378,10 +921,5 @@ module.exports = { _determineConnectorSiteId, _buildTcpConnector, _buildTcpListener, - _addTcpConnector, - _addTcpListener, - _updateTcpConnector, - _deleteTcpConnector, - _deleteTcpListener, _resolveFogRouterMode } diff --git a/src/utils/cert.js b/src/utils/cert.js index 04b0083a..ee267131 100644 --- a/src/utils/cert.js +++ b/src/utils/cert.js @@ -2,6 +2,7 @@ const forge = require('node-forge') const k8sClient = require('./k8s-client') const BigNumber = require('bignumber.js') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') // Types for CA input const CA_TYPES = { @@ -81,7 +82,7 @@ async function validateCA (cert, key) { * @param {string} name - Name of the secret * @returns {Promise} */ -async function storeCA (ca, name) { +async function storeCA (ca, name, transaction) { try { // Ensure data is in base64 format for TLS secrets const secretData = { @@ -98,7 +99,7 @@ async function storeCA (ca, name) { // Use the secret service to store the CA const SecretService = 
require('../services/secret-service') - await SecretService.createSecretEndpoint(secret) + await SecretService.createSecretEndpoint(secret, transaction) } catch (error) { throw new Error(`Failed to store CA: ${error.message}`) } @@ -107,15 +108,20 @@ async function storeCA (ca, name) { /** * Loads CA certificate and key from internal secret storage * @param {string} name - Name of the secret + * @param {import('sequelize').Transaction} [transaction] * @returns {Promise} */ -async function loadCA (name) { +async function loadCA (name, transaction) { try { // Use SecretManager to get the secret with decryption handling const SecretManager = require('../data/managers/secret-manager') - const fakeTransaction = { fakeTransaction: true } - const secret = await SecretManager.getSecret(name, fakeTransaction) + const secret = transaction + ? await SecretManager.getSecret(name, transaction) + : await runInTransaction( + (tx) => SecretManager.getSecret(name, tx), + { priority: PRIORITY_BACKGROUND, label: 'cert-load-ca' } + ) if (!secret) { throw new Error(`TLS secret with name ${name} not found`) } @@ -233,7 +239,7 @@ async function generateSelfSignedCA (subject, expiration = 5 * 365 * 24 * 60 * 6 } // CA handling functions -async function getCAFromK8sSecret (secretName) { +async function getCAFromK8sSecret (secretName, transaction) { try { // Check that k8sClient is properly required and available if (!k8sClient) { @@ -257,28 +263,37 @@ async function getCAFromK8sSecret (secretName) { try { // Use SecretManager to check if there's a local secret const SecretManager = require('../data/managers/secret-manager') - const localSecret = await SecretManager.findOne({ name: secretName }, { fakeTransaction: true }) + const localSecret = transaction + ? 
await SecretManager.findOne({ name: secretName }, transaction) + : await runInTransaction( + (tx) => SecretManager.findOne({ name: secretName }, tx), + { priority: PRIORITY_BACKGROUND, label: 'cert-k8s-local-secret' } + ) - // If no local secret, we need to create one if (!localSecret) { - // Store the CA in local secret storage - await storeCA({ cert, key }, secretName) - // Also create a certificate record + await storeCA({ cert, key }, secretName, transaction) const CertificateManager = require('../data/managers/certificate-manager') const forge = require('node-forge') const forgeCert = forge.pki.certificateFromPem(cert) - // Extract subject const subject = forgeCert.subject.getField('CN') ? forgeCert.subject.getField('CN').value : secretName - // Create CA record - await CertificateManager.createCertificateRecord({ + const caRecord = { name: secretName, subject, isCA: true, validFrom: forgeCert.validity.notBefore, validTo: forgeCert.validity.notAfter, serialNumber: forgeCert.serialNumber - }, { fakeTransaction: true }) + } + + if (transaction) { + await CertificateManager.createCertificateRecord(caRecord, transaction) + } else { + await runInTransaction( + (tx) => CertificateManager.createCertificateRecord(caRecord, tx), + { priority: PRIORITY_BACKGROUND, label: 'cert-k8s-create-ca-record' } + ) + } } } catch (dbError) { // Continue anyway - we at least have the cert/key @@ -309,7 +324,7 @@ async function getCAFromDirect (ca) { } } -async function getCAFromInput (ca) { +async function getCAFromInput (ca, transaction) { if (!ca) { return null } @@ -319,11 +334,11 @@ async function getCAFromInput (ca) { switch (caType) { case CA_TYPES.K8S_SECRET.toLowerCase(): - return getCAFromK8sSecret(ca.secretName) + return getCAFromK8sSecret(ca.secretName, transaction) case CA_TYPES.DIRECT.toLowerCase(): if (ca.secretName) { // If secretName is provided, load from internal secret storage - const caData = await loadCA(ca.secretName) + const caData = await 
loadCA(ca.secretName, transaction) return getCAFromDirect(caData) } return getCAFromDirect(ca) @@ -345,7 +360,8 @@ async function generateCertificate ({ hosts, expiration = 5 * 365 * 24 * 60 * 60 * 1000, ca, - isRenewal = false + isRenewal = false, + transaction }) { try { return await _generateCertificateBody({ @@ -354,7 +370,8 @@ async function generateCertificate ({ hosts, expiration, ca, - isRenewal + isRenewal, + transaction }) } catch (error) { logger.error(`Certificate generation failed for ${name}:`, error.message) @@ -368,9 +385,10 @@ async function _generateCertificateBody ({ hosts, expiration, ca, - isRenewal + isRenewal, + transaction }) { - const caCert = await getCAFromInput(ca) + const caCert = await getCAFromInput(ca, transaction) // Generate RSA key pair const keys = forge.pki.rsa.generateKeyPair(2048) @@ -512,7 +530,7 @@ async function _generateCertificateBody ({ if (isRenewal) { // For renewals, delete the existing secret first try { - await SecretService.deleteSecretEndpoint(name) + await SecretService.deleteSecretEndpoint(name, transaction) } catch (error) { // If the secret doesn't exist, that's okay, just continue if (error.name !== 'NotFoundError') { @@ -522,7 +540,7 @@ async function _generateCertificateBody ({ } // Create new secret with certificate data - await SecretService.createSecretEndpoint(secret) + await SecretService.createSecretEndpoint(secret, transaction) return { cert: certPem, From 81a2bcde1991949a6dc462076b04a73d6c325abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:40:42 +0300 Subject: [PATCH 08/32] Fix SQLite deadlocks in auth, WebSocket sessions, and background jobs. Isolate background transactions from committed ALS context, defer relay setup until after DB commit, propagate fog-token lastActive in parent tx, and route remaining services through runInTransaction. 
--- src/decorators/authorization-decorator.js | 28 +-- src/helpers/app-helper.js | 5 +- src/jobs/controller-cleanup-job.js | 36 ++-- src/jobs/controller-heartbeat-job.js | 9 +- src/jobs/event-cleanup-job.js | 12 +- src/jobs/fog-status-job.js | 6 +- src/jobs/fog-token-cleanup-job.js | 10 +- src/jobs/nats-reconcile-worker-job.js | 6 +- src/jobs/stopped-app-status-job.js | 11 +- src/jobs/ws-session-reconcile-job.js | 196 +++++++++--------- src/lib/rbac/middleware.js | 64 +++--- src/middlewares/event-audit-middleware.js | 4 +- src/services/agent-service.js | 2 +- src/services/application-service.js | 12 +- src/services/auth-bootstrap-service.js | 153 +++++++------- src/services/auth-interaction-service.js | 166 ++++++++------- src/services/auth-oauth-service.js | 24 ++- src/services/controller-ms-service.js | 12 +- src/services/event-service.js | 31 +-- .../microservice-ports/microservice-port.js | 6 +- src/services/microservices-service.js | 36 ++-- src/services/nats-relay-connection-manager.js | 85 ++++---- src/services/rbac-service.js | 19 +- src/services/router-connection-manager.js | 79 +++---- src/services/user-service.js | 185 ++++++++++------- src/services/volume-mount-service.js | 6 +- src/websocket/exec-session-manager.js | 12 +- src/websocket/log-session-manager.js | 12 +- src/websocket/server.js | 95 ++++++--- 29 files changed, 718 insertions(+), 604 deletions(-) diff --git a/src/decorators/authorization-decorator.js b/src/decorators/authorization-decorator.js index dc10a692..35e93e03 100644 --- a/src/decorators/authorization-decorator.js +++ b/src/decorators/authorization-decorator.js @@ -3,6 +3,7 @@ const FogManager = require('../data/managers/iofog-manager') const FogKeyService = require('../services/iofog-key-service') const Errors = require('../helpers/errors') const { isTest } = require('../helpers/app-helper') +const { runInTransaction } = require('../helpers/transaction-runner') function checkFogToken (f) { return async function (...fArgs) { @@ 
-18,7 +19,6 @@ function checkFogToken (f) { throw new Errors.AuthenticationError('authorization failed') } - // Extract token from Bearer scheme const [scheme, token] = authHeader.split(' ') if (scheme.toLowerCase() !== 'bearer' || !token) { logger.error('Invalid authorization scheme') @@ -26,10 +26,8 @@ function checkFogToken (f) { } try { - // Debug log for JWT logger.debug({ token }, 'Received JWT') - // First, decode the JWT without verification to get the fog UUID const tokenParts = token.split('.') if (tokenParts.length !== 3) { logger.error('Invalid JWT format') @@ -46,23 +44,25 @@ function checkFogToken (f) { throw new Errors.AuthenticationError('authorization failed') } - // Get the fog with transaction - const fog = await FogManager.findOne({ - uuid: fogUuid - }, { fakeTransaction: true }) + const fog = await runInTransaction(async (transaction) => { + const foundFog = await FogManager.findOne({ uuid: fogUuid }, transaction) + if (!foundFog) { + return null + } + + await FogKeyService.verifyJWT(token, fogUuid, transaction) + + const timestamp = Date.now() + await FogManager.updateLastActive(foundFog.uuid, timestamp, transaction) + + return foundFog + }, { label: 'checkFogToken' }) if (!fog) { logger.error(`Fog with UUID ${fogUuid} not found`) throw new Errors.AuthenticationError('authorization failed') } - // Verify the JWT with transaction - await FogKeyService.verifyJWT(token, fogUuid, { fakeTransaction: true }) - - // Update last active timestamp with transaction - const timestamp = Date.now() - await FogManager.updateLastActive(fog.uuid, timestamp, { fakeTransaction: true }) - fArgs.push(fog) return f.apply(this, fArgs) diff --git a/src/helpers/app-helper.js b/src/helpers/app-helper.js index 6df81381..b4b420ec 100644 --- a/src/helpers/app-helper.js +++ b/src/helpers/app-helper.js @@ -112,14 +112,13 @@ function checkTransaction (transaction) { if (isTest()) { return } - // TODO [when transactions concurrency issue fixed]: Remove 
'!transaction.fakeTransaction' - if (!transaction || (!(transaction instanceof Transaction) && !transaction.fakeTransaction)) { + if (!transaction || !(transaction instanceof Transaction)) { throw new Errors.TransactionError() } } function withTransaction (transaction, options = {}) { - if (transaction && !transaction.fakeTransaction) { + if (transaction) { options.transaction = transaction } return options diff --git a/src/jobs/controller-cleanup-job.js b/src/jobs/controller-cleanup-job.js index 732693b3..0d365a76 100644 --- a/src/jobs/controller-cleanup-job.js +++ b/src/jobs/controller-cleanup-job.js @@ -3,6 +3,7 @@ const Config = require('../config') const logger = require('../logger') const Sequelize = require('sequelize') const Op = Sequelize.Op +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') async function run () { try { @@ -10,7 +11,6 @@ async function run () { } catch (error) { logger.error('Error during controller cleanup:', error) } finally { - // Schedule next run with current interval (may have changed via env var) const currentInterval = process.env.CONTROLLER_CLEANUP_INTERVAL || Config.get('settings.controllerCleanupInterval', 600) setTimeout(run, currentInterval * 1000) } @@ -23,22 +23,24 @@ async function cleanupInactiveControllers () { logger.debug(`Starting cleanup of controllers inactive for more than ${thresholdSeconds} seconds`) - const fakeTransaction = { fakeTransaction: true } - const inactive = await ClusterControllerManager.findAll({ - isActive: true, - lastHeartbeat: { [Op.lt]: threshold } - }, fakeTransaction) - - let cleanedCount = 0 - for (const controller of inactive) { - await ClusterControllerManager.update( - { uuid: controller.uuid }, - { isActive: false }, - fakeTransaction - ) - logger.info(`Marked controller ${controller.uuid} on host ${controller.host} as inactive (last heartbeat: ${controller.lastHeartbeat})`) - cleanedCount++ - } + const cleanedCount = await runInTransaction(async 
(transaction) => { + const inactive = await ClusterControllerManager.findAll({ + isActive: true, + lastHeartbeat: { [Op.lt]: threshold } + }, transaction) + + let count = 0 + for (const controller of inactive) { + await ClusterControllerManager.update( + { uuid: controller.uuid }, + { isActive: false }, + transaction + ) + logger.info(`Marked controller ${controller.uuid} on host ${controller.host} as inactive (last heartbeat: ${controller.lastHeartbeat})`) + count++ + } + return count + }, { priority: PRIORITY_BACKGROUND, label: 'controller-cleanup' }) if (cleanedCount > 0) { logger.info(`Cleaned up ${cleanedCount} inactive controller(s)`) diff --git a/src/jobs/controller-heartbeat-job.js b/src/jobs/controller-heartbeat-job.js index 85517ad8..4d5f605c 100644 --- a/src/jobs/controller-heartbeat-job.js +++ b/src/jobs/controller-heartbeat-job.js @@ -1,6 +1,8 @@ const ClusterControllerService = require('../services/cluster-controller-service') const Config = require('../config') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') +const { checkSqliteFogCountWarning } = require('../helpers/sqlite-fog-warning') const scheduleTime = (Config.get('settings.controllerHeartbeatInterval', 30)) * 1000 @@ -22,8 +24,11 @@ async function updateControllerHeartbeat () { return } - const fakeTransaction = { fakeTransaction: true } - await ClusterControllerService.updateHeartbeat(uuid, fakeTransaction) + await runInTransaction( + (transaction) => ClusterControllerService.updateHeartbeat(uuid, transaction), + { priority: PRIORITY_BACKGROUND, label: 'controller-heartbeat' } + ) + await checkSqliteFogCountWarning() logger.debug(`Updated heartbeat for controller: ${uuid}`) } catch (error) { logger.error(`Failed to update controller heartbeat: ${error.message}`) diff --git a/src/jobs/event-cleanup-job.js b/src/jobs/event-cleanup-job.js index 64dc8d61..81878580 100644 --- a/src/jobs/event-cleanup-job.js +++ 
b/src/jobs/event-cleanup-job.js @@ -2,6 +2,7 @@ const EventManager = require('../data/managers/event-manager') const EventService = require('../services/event-service') const Config = require('../config') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') async function run () { try { @@ -9,7 +10,6 @@ async function run () { } catch (error) { logger.error('Error during event cleanup:', error) } finally { - // Schedule next run with current interval (may have changed via env var) const currentInterval = process.env.EVENT_CLEANUP_INTERVAL || Config.get('settings.eventCleanupInterval', 86400) setTimeout(run, currentInterval * 1000) } @@ -17,15 +17,15 @@ async function run () { async function cleanupOldEvents () { try { - // Read retention days from config const retentionDays = process.env.EVENT_RETENTION_DAYS || Config.get('settings.eventRetentionDays', 7) logger.debug(`Starting cleanup of events older than ${retentionDays} days`) - const count = await EventManager.deleteEventsOlderThanDays(retentionDays, { fakeTransaction: true }) + const count = await runInTransaction( + (transaction) => EventManager.deleteEventsOlderThanDays(retentionDays, transaction), + { priority: PRIORITY_BACKGROUND, label: 'event-cleanup' } + ) logger.info(`Cleaned up ${count} events older than ${retentionDays} days`) - // Create audit trail for automated cleanup (non-blocking) - // This allows admins to distinguish between manual deletions and automated cleanup if (count > 0) { setImmediate(async () => { try { @@ -43,7 +43,7 @@ async function cleanupOldEvents () { statusCode: 200, statusMessage: `Automated cleanup: Deleted ${count} events older than ${retentionDays} days`, requestId: null - }, { fakeTransaction: true }).catch(err => { + }).catch(err => { logger.error('Failed to create cleanup job audit record (non-blocking):', err) }) } catch (error) { diff --git a/src/jobs/fog-status-job.js 
b/src/jobs/fog-status-job.js index 11fd62ac..118c0143 100644 --- a/src/jobs/fog-status-job.js +++ b/src/jobs/fog-status-job.js @@ -1,4 +1,5 @@ const TransactionDecorator = require('../decorators/transaction-decorator') +const { PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const FogManager = require('../data/managers/iofog-manager') const MicroserviceManager = require('../data/managers/microservice-manager') @@ -14,7 +15,10 @@ const scheduleTime = Config.get('settings.fogStatusUpdateInterval') * 1000 async function run () { try { - const _updateFogsConnectionStatus = TransactionDecorator.generateTransaction(updateFogsConnectionStatus) + const _updateFogsConnectionStatus = TransactionDecorator.generateTransaction( + updateFogsConnectionStatus, + { priority: PRIORITY_BACKGROUND, label: 'fogStatus.updateConnection' } + ) await _updateFogsConnectionStatus() } catch (error) { logger.error('Error during fog status update:', error) diff --git a/src/jobs/fog-token-cleanup-job.js b/src/jobs/fog-token-cleanup-job.js index cb433b4a..4fcb448c 100644 --- a/src/jobs/fog-token-cleanup-job.js +++ b/src/jobs/fog-token-cleanup-job.js @@ -1,6 +1,7 @@ const FogUsedTokenManager = require('../data/managers/fog-used-token-manager') const Config = require('../config') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const scheduleTime = Config.get('settings.fogExpiredTokenCleanupInterval') * 1000 @@ -8,7 +9,7 @@ async function run () { try { await cleanupExpiredTokens() } catch (error) { - logger.error('Error during JTI cleanup:', error) + logger.error({ err: error }, 'Error during JTI cleanup') } finally { setTimeout(run, scheduleTime) } @@ -17,10 +18,13 @@ async function run () { async function cleanupExpiredTokens () { try { logger.debug('Starting cleanup of expired JTIs') - const count = await FogUsedTokenManager.cleanupExpiredJtis() + const count = await runInTransaction( + (transaction) 
=> FogUsedTokenManager.cleanupExpiredJtis(transaction), + { priority: PRIORITY_BACKGROUND, label: 'fogToken.cleanupExpiredJtis' } + ) logger.debug(`Cleaned up ${count} expired JTIs`) } catch (error) { - logger.error('Error during JTI cleanup:', error) + logger.error({ err: error }, 'Error during JTI cleanup') } } diff --git a/src/jobs/nats-reconcile-worker-job.js b/src/jobs/nats-reconcile-worker-job.js index f785a45f..b7fabf58 100644 --- a/src/jobs/nats-reconcile-worker-job.js +++ b/src/jobs/nats-reconcile-worker-job.js @@ -1,9 +1,9 @@ const ClusterControllerService = require('../services/cluster-controller-service') const NatsService = require('../services/nats-service') const NatsReconcileTaskManager = require('../data/managers/nats-reconcile-task-manager') -const databaseProvider = require('../data/providers/database-factory') const Config = require('../config') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const scheduleTime = (Config.get('settings.natsReconcileWorkerIntervalSeconds', 3)) * 1000 @@ -50,12 +50,12 @@ async function processNextTask () { logger.info(`NATS reconcile task ${task.id} started`) await NatsService.reconcileResolverArtifacts(options) logger.info(`NATS reconcile task ${task.id} completed`) - await databaseProvider.sequelize.transaction(async (transaction) => { + await runInTransaction(async (transaction) => { await NatsReconcileTaskManager.getEntity().destroy({ where: { id: task.id }, transaction }) - }) + }, { priority: PRIORITY_BACKGROUND, label: 'natsReconcile.taskComplete' }) } catch (error) { logger.error({ err: error, diff --git a/src/jobs/stopped-app-status-job.js b/src/jobs/stopped-app-status-job.js index 36057e86..00d752c7 100644 --- a/src/jobs/stopped-app-status-job.js +++ b/src/jobs/stopped-app-status-job.js @@ -1,4 +1,5 @@ const TransactionDecorator = require('../decorators/transaction-decorator') +const { PRIORITY_BACKGROUND } = 
require('../helpers/transaction-runner') const MicroserviceManager = require('../data/managers/microservice-manager') const MicroserviceStatusManager = require('../data/managers/microservice-status-manager') @@ -13,8 +14,14 @@ const scheduleTime = Config.get('settings.fogStatusUpdateInterval') * 1000 async function run () { try { - const _updateStoppedApplicationMicroserviceStatus = TransactionDecorator.generateTransaction(updateStoppedApplicationMicroserviceStatus) - const _updateStoppedMicroserviceStatus = TransactionDecorator.generateTransaction(updateStoppedMicroserviceStatus) + const _updateStoppedApplicationMicroserviceStatus = TransactionDecorator.generateTransaction( + updateStoppedApplicationMicroserviceStatus, + { priority: PRIORITY_BACKGROUND, label: 'stoppedAppStatus.application' } + ) + const _updateStoppedMicroserviceStatus = TransactionDecorator.generateTransaction( + updateStoppedMicroserviceStatus, + { priority: PRIORITY_BACKGROUND, label: 'stoppedAppStatus.microservice' } + ) // Handle microservices from deactivated applications await _updateStoppedApplicationMicroserviceStatus() diff --git a/src/jobs/ws-session-reconcile-job.js b/src/jobs/ws-session-reconcile-job.js index b019d6a7..c5a0a07a 100644 --- a/src/jobs/ws-session-reconcile-job.js +++ b/src/jobs/ws-session-reconcile-job.js @@ -10,6 +10,7 @@ const MicroserviceManager = require('../data/managers/microservice-manager') const ChangeTrackingService = require('../services/change-tracking-service') const FogManager = require('../data/managers/iofog-manager') const TransactionDecorator = require('../decorators/transaction-decorator') +const { PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') function getIntervalMs () { const seconds = process.env.WS_SESSION_RECONCILE_INTERVAL_SECONDS || @@ -31,7 +32,7 @@ async function run () { } } -async function reconcileStaleSessions () { +async function reconcileStaleSessionsInTransaction (transaction) { const wsServer = 
WebSocketServer.getInstance() const execSessionManager = wsServer.execSessionManager const logSessionManager = wsServer.logSessionManager @@ -45,110 +46,117 @@ async function reconcileStaleSessions () { let execCleaned = 0 let logCleaned = 0 - await TransactionDecorator.generateTransaction(async (transaction) => { - const execRows = await MicroserviceExecSessionManager.findAll({ - status: { [Op.in]: ['PENDING', 'ACTIVE'] } - }, transaction) - - for (const row of execRows) { - const sessionId = row.sessionId - const microserviceUuid = row.microserviceUuid - if (!sessionId || !microserviceUuid) continue - - if (execSessionManager.getExecSession(sessionId)) continue - - const age = now - new Date(row.updatedAt).getTime() - const threshold = row.status === 'PENDING' ? execPendingTimeout : execMaxDuration - if (age < threshold) continue - - await MicroserviceExecSessionManager.deleteBySessionId(sessionId, transaction) - - const microservice = await MicroserviceManager.findOne({ uuid: microserviceUuid }, transaction) - if (microservice) { - await ChangeTrackingService.update( - microservice.iofogUuid, - ChangeTrackingService.events.microserviceExecSessions, - transaction - ) - } - - execCleaned++ - logger.info('Reconciled stale exec session row:' + JSON.stringify({ - sessionId, - microserviceUuid, - status: row.status, - ageMs: age - })) + const execRows = await MicroserviceExecSessionManager.findAll({ + status: { [Op.in]: ['PENDING', 'ACTIVE'] } + }, transaction) + + for (const row of execRows) { + const sessionId = row.sessionId + const microserviceUuid = row.microserviceUuid + if (!sessionId || !microserviceUuid) continue + + if (execSessionManager.getExecSession(sessionId)) continue + + const age = now - new Date(row.updatedAt).getTime() + const threshold = row.status === 'PENDING' ? 
execPendingTimeout : execMaxDuration + if (age < threshold) continue + + await MicroserviceExecSessionManager.deleteBySessionId(sessionId, transaction) + + const microservice = await MicroserviceManager.findOne({ uuid: microserviceUuid }, transaction) + if (microservice) { + await ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceExecSessions, + transaction + ) } - const msLogRows = await MicroserviceLogStatusManager.findAll({ - status: { [Op.in]: ['PENDING', 'ACTIVE'] } - }, transaction) - - for (const row of msLogRows) { - if (logSessionManager.getLogSession(row.sessionId)) continue - - const age = now - new Date(row.updatedAt).getTime() - const threshold = row.status === 'PENDING' ? logPendingTimeout : logIdleTimeout - if (age < threshold) continue - - await MicroserviceLogStatusManager.delete({ sessionId: row.sessionId }, transaction) - logCleaned++ - - const microservice = await MicroserviceManager.findOne({ uuid: row.microserviceUuid }, transaction) - if (microservice) { - await ChangeTrackingService.update( - microservice.iofogUuid, - ChangeTrackingService.events.microserviceLogs, - transaction - ) - } - - logger.info('Reconciled stale microservice log row:' + JSON.stringify({ - sessionId: row.sessionId, - microserviceUuid: row.microserviceUuid, - status: row.status, - ageMs: age - })) + execCleaned++ + logger.info('Reconciled stale exec session row:' + JSON.stringify({ + sessionId, + microserviceUuid, + status: row.status, + ageMs: age + })) + } + + const msLogRows = await MicroserviceLogStatusManager.findAll({ + status: { [Op.in]: ['PENDING', 'ACTIVE'] } + }, transaction) + + for (const row of msLogRows) { + if (logSessionManager.getLogSession(row.sessionId)) continue + + const age = now - new Date(row.updatedAt).getTime() + const threshold = row.status === 'PENDING' ? 
logPendingTimeout : logIdleTimeout + if (age < threshold) continue + + await MicroserviceLogStatusManager.delete({ sessionId: row.sessionId }, transaction) + logCleaned++ + + const microservice = await MicroserviceManager.findOne({ uuid: row.microserviceUuid }, transaction) + if (microservice) { + await ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceLogs, + transaction + ) } - const fogLogRows = await FogLogStatusManager.findAll({ - status: { [Op.in]: ['PENDING', 'ACTIVE'] } - }, transaction) - - for (const row of fogLogRows) { - if (logSessionManager.getLogSession(row.sessionId)) continue - - const age = now - new Date(row.updatedAt).getTime() - const threshold = row.status === 'PENDING' ? logPendingTimeout : logIdleTimeout - if (age < threshold) continue - - await FogLogStatusManager.delete({ sessionId: row.sessionId }, transaction) - logCleaned++ - - const fog = await FogManager.findOne({ uuid: row.iofogUuid }, transaction) - if (fog) { - await ChangeTrackingService.update( - fog.uuid, - ChangeTrackingService.events.fogLogs, - transaction - ) - } - - logger.info('Reconciled stale fog log row:' + JSON.stringify({ - sessionId: row.sessionId, - iofogUuid: row.iofogUuid, - status: row.status, - ageMs: age - })) + logger.info('Reconciled stale microservice log row:' + JSON.stringify({ + sessionId: row.sessionId, + microserviceUuid: row.microserviceUuid, + status: row.status, + ageMs: age + })) + } + + const fogLogRows = await FogLogStatusManager.findAll({ + status: { [Op.in]: ['PENDING', 'ACTIVE'] } + }, transaction) + + for (const row of fogLogRows) { + if (logSessionManager.getLogSession(row.sessionId)) continue + + const age = now - new Date(row.updatedAt).getTime() + const threshold = row.status === 'PENDING' ? 
logPendingTimeout : logIdleTimeout + if (age < threshold) continue + + await FogLogStatusManager.delete({ sessionId: row.sessionId }, transaction) + logCleaned++ + + const fog = await FogManager.findOne({ uuid: row.iofogUuid }, transaction) + if (fog) { + await ChangeTrackingService.update( + fog.uuid, + ChangeTrackingService.events.fogLogs, + transaction + ) } - })() + + logger.info('Reconciled stale fog log row:' + JSON.stringify({ + sessionId: row.sessionId, + iofogUuid: row.iofogUuid, + status: row.status, + ageMs: age + })) + } if (execCleaned > 0 || logCleaned > 0) { logger.info(`WS session reconcile completed: ${execCleaned} exec, ${logCleaned} log rows cleaned`) } } +const _reconcileStaleSessions = TransactionDecorator.generateTransaction( + reconcileStaleSessionsInTransaction, + { priority: PRIORITY_BACKGROUND, label: 'ws.sessionReconcile' } +) + +async function reconcileStaleSessions () { + await _reconcileStaleSessions() +} + module.exports = { run } diff --git a/src/lib/rbac/middleware.js b/src/lib/rbac/middleware.js index e1cbf344..505b52a0 100644 --- a/src/lib/rbac/middleware.js +++ b/src/lib/rbac/middleware.js @@ -7,6 +7,7 @@ const config = require('../../config') const db = require('../../data/models') const { getOidcSettings } = require('../../config/oidc') const { PASSWORD_CHANGE_REQUIRED_CLAIM } = require('../../services/auth-token-service') +const { runInTransaction } = require('../../helpers/transaction-runner') const PASSWORD_CHANGE_ALLOWLIST = [ { method: 'GET', path: '/api/v3/user/profile' }, @@ -318,17 +319,16 @@ function requirePermission (resource, verb) { const finalVerb = verb || routeDef.verb const resourceName = routeDef.resourceName - // Get database transaction (create a fake transaction for read-only operations) - const transaction = { fakeTransaction: true } - - // Authorize - const authResult = await authorizer.authorize( - subjects, - routeDef.apiGroup || '', - finalResource, - finalVerb, - resourceName, - transaction + const 
authResult = await runInTransaction( + (transaction) => authorizer.authorize( + subjects, + routeDef.apiGroup || '', + finalResource, + finalVerb, + resourceName, + transaction + ), + { label: 'rbac-authorize' } ) if (!authResult.allowed) { @@ -410,17 +410,16 @@ async function authorizeWebSocket (req, token) { subjects: subjects }) - // Get database transaction - const transaction = { fakeTransaction: true } - - // Authorize - const authResult = await authorizer.authorize( - subjects, - routeDef.apiGroup || '', - routeDef.resource, - routeDef.verb, - routeDef.resourceName, - transaction + const authResult = await runInTransaction( + (transaction) => authorizer.authorize( + subjects, + routeDef.apiGroup || '', + routeDef.resource, + routeDef.verb, + routeDef.resourceName, + transaction + ), + { label: 'rbac-authorize-ws' } ) logger.debug(`WebSocket authorization result:`, { @@ -572,17 +571,16 @@ function protect (_roles) { return callback() } - // Get database transaction - const transaction = { fakeTransaction: true } - - // Authorize - const authResult = await authorizer.authorize( - subjects, - routeDef.apiGroup || '', - routeDef.resource, - routeDef.verb, - routeDef.resourceName, - transaction + const authResult = await runInTransaction( + (transaction) => authorizer.authorize( + subjects, + routeDef.apiGroup || '', + routeDef.resource, + routeDef.verb, + routeDef.resourceName, + transaction + ), + { label: 'rbac-protect' } ) if (!authResult.allowed) { diff --git a/src/middlewares/event-audit-middleware.js b/src/middlewares/event-audit-middleware.js index 4aee3f6f..e5494545 100644 --- a/src/middlewares/event-audit-middleware.js +++ b/src/middlewares/event-audit-middleware.js @@ -44,11 +44,11 @@ function eventAuditMiddleware (req, res, next) { // Fire and forget - never await EventService.createHttpEvent(req, res, startTime).catch(err => { // Silent error handling - never throw - logger.error('Event logging failed (non-blocking):', err) + logger.error({ err }, 
'Event logging failed (non-blocking)') }) } catch (error) { // Catch any synchronous errors - logger.error('Event logging setup failed (non-blocking):', error) + logger.error({ err: error }, 'Event logging setup failed (non-blocking)') // Don't throw - request already completed } }) diff --git a/src/services/agent-service.js b/src/services/agent-service.js index 3a8ee3f0..e30a76ca 100644 --- a/src/services/agent-service.js +++ b/src/services/agent-service.js @@ -790,7 +790,7 @@ module.exports = { updateHalUsbInfo: TransactionDecorator.generateTransaction(updateHalUsbInfo), deleteNode: TransactionDecorator.generateTransaction(deleteNode), getAgentLinkedVolumeMounts: TransactionDecorator.generateTransaction(getAgentLinkedVolumeMounts), - getControllerCA: TransactionDecorator.generateTransaction(getControllerCA), + getControllerCA, getAgentLogSessions: TransactionDecorator.generateTransaction(getAgentLogSessions), getAgentExecSessions: TransactionDecorator.generateTransaction(getAgentExecSessions) } diff --git a/src/services/application-service.js b/src/services/application-service.js index 74f9af52..4a2af20e 100644 --- a/src/services/application-service.js +++ b/src/services/application-service.js @@ -446,14 +446,12 @@ async function _updateChangeTrackingsAndDeleteMicroservicesByApplicationId (cond } } -const bypassOptions = { bypassQueue: true } - module.exports = { - createApplicationEndPoint: TransactionDecorator.generateTransaction(createApplicationEndPoint, bypassOptions), - deleteApplicationEndPoint: TransactionDecorator.generateTransaction(deleteApplicationEndPoint, bypassOptions), - deleteSystemApplicationEndPoint: TransactionDecorator.generateTransaction(deleteSystemApplicationEndPoint, bypassOptions), - updateApplicationEndPoint: TransactionDecorator.generateTransaction(updateApplicationEndPoint, bypassOptions), - patchApplicationEndPoint: TransactionDecorator.generateTransaction(patchApplicationEndPoint, bypassOptions), + createApplicationEndPoint: 
TransactionDecorator.generateTransaction(createApplicationEndPoint), + deleteApplicationEndPoint: TransactionDecorator.generateTransaction(deleteApplicationEndPoint), + deleteSystemApplicationEndPoint: TransactionDecorator.generateTransaction(deleteSystemApplicationEndPoint), + updateApplicationEndPoint: TransactionDecorator.generateTransaction(updateApplicationEndPoint), + patchApplicationEndPoint: TransactionDecorator.generateTransaction(patchApplicationEndPoint), getUserApplicationsEndPoint: TransactionDecorator.generateTransaction(getUserApplicationsEndPoint), getSystemApplicationsEndPoint: TransactionDecorator.generateTransaction(getSystemApplicationsEndPoint), getAllApplicationsEndPoint: TransactionDecorator.generateTransaction(getAllApplicationsEndPoint), diff --git a/src/services/auth-bootstrap-service.js b/src/services/auth-bootstrap-service.js index 65345eac..fcf1f68f 100644 --- a/src/services/auth-bootstrap-service.js +++ b/src/services/auth-bootstrap-service.js @@ -8,6 +8,7 @@ const secretHelper = require('../helpers/secret-helper') const AuthPasswordService = require('./auth-password-service') const AuthPolicyService = require('./auth-policy-service') const AuthTokenService = require('./auth-token-service') +const { runInTransaction } = require('../helpers/transaction-runner') const { ADMIN_GROUP } = require('./auth-mfa-service') const SYSTEM_GROUPS = ['admin', 'sre', 'developer', 'viewer'] @@ -111,102 +112,88 @@ async function createBootstrapUser (normalizedUsername, plainPassword, transacti return user } -async function runBootstrap (outerTransaction) { - const transaction = outerTransaction || await db.sequelize.transaction() - const ownTransaction = !outerTransaction +async function runBootstrapInternal (transaction) { + await ensureSystemGroups(transaction) - try { - await ensureSystemGroups(transaction) + let meta = await db.AuthBootstrapMeta.findByPk(1, { + transaction, + lock: transaction.LOCK.UPDATE + }) + if (!meta) { + meta = await 
db.AuthBootstrapMeta.create({ id: 1 }, { transaction }) + } - let meta = await db.AuthBootstrapMeta.findByPk(1, { - transaction, - lock: transaction.LOCK.UPDATE - }) - if (!meta) { - meta = await db.AuthBootstrapMeta.create({ id: 1 }, { transaction }) - } + const existingBootstrap = await findBootstrapUser(transaction) + const { username, passwordRef, allowBootstrapLog } = getBootstrapConfig() - const existingBootstrap = await findBootstrapUser(transaction) - const { username, passwordRef, allowBootstrapLog } = getBootstrapConfig() - - if (!username || !passwordRef) { - if (existingBootstrap) { - logger.warn('Embedded auth bootstrap env missing; keeping existing bootstrap admin') - } else { - logger.warn('Embedded auth bootstrap skipped: OIDC_BOOTSTRAP_ADMIN_USERNAME and OIDC_BOOTSTRAP_ADMIN_PASSWORD are required for first boot') - } - if (ownTransaction) { - await transaction.commit() - } - return { skipped: true, reason: existingBootstrap ? 'env_missing_keep_existing' : 'missing_credentials' } + if (!username || !passwordRef) { + if (existingBootstrap) { + logger.warn('Embedded auth bootstrap env missing; keeping existing bootstrap admin') + } else { + logger.warn('Embedded auth bootstrap skipped: OIDC_BOOTSTRAP_ADMIN_USERNAME and OIDC_BOOTSTRAP_ADMIN_PASSWORD are required for first boot') } + return { skipped: true, reason: existingBootstrap ? 'env_missing_keep_existing' : 'missing_credentials' } + } - const plainPassword = await resolveBootstrapPassword(passwordRef) - if (!plainPassword) { - if (existingBootstrap) { - logger.warn('Embedded auth bootstrap password could not be resolved; keeping existing bootstrap admin') - } else { - logger.warn('Embedded auth bootstrap skipped: bootstrap admin password could not be resolved') - } - if (ownTransaction) { - await transaction.commit() - } - return { skipped: true, reason: existingBootstrap ? 
'env_missing_keep_existing' : 'missing_credentials' } + const plainPassword = await resolveBootstrapPassword(passwordRef) + if (!plainPassword) { + if (existingBootstrap) { + logger.warn('Embedded auth bootstrap password could not be resolved; keeping existing bootstrap admin') + } else { + logger.warn('Embedded auth bootstrap skipped: bootstrap admin password could not be resolved') } + return { skipped: true, reason: existingBootstrap ? 'env_missing_keep_existing' : 'missing_credentials' } + } - const policy = await AuthPolicyService.getPolicy(transaction) - AuthPasswordService.validatePasswordComplexity(plainPassword, policy) - const normalizedUsername = normalizeBootstrapUsername(username) + const policy = await AuthPolicyService.getPolicy(transaction) + AuthPasswordService.validatePasswordComplexity(plainPassword, policy) + const normalizedUsername = normalizeBootstrapUsername(username) + + if (existingBootstrap) { + if (await bootstrapMatchesEnv(existingBootstrap, normalizedUsername, plainPassword)) { + await meta.update({ + completedAt: new Date(), + bootstrapAdminUserId: existingBootstrap.id + }, { transaction }) + return { skipped: true, reason: 'unchanged', userId: existingBootstrap.id, username: normalizedUsername } + } - if (existingBootstrap) { - if (await bootstrapMatchesEnv(existingBootstrap, normalizedUsername, plainPassword)) { - await meta.update({ - completedAt: new Date(), - bootstrapAdminUserId: existingBootstrap.id - }, { transaction }) - if (ownTransaction) { - await transaction.commit() - } - return { skipped: true, reason: 'unchanged', userId: existingBootstrap.id, username: normalizedUsername } - } - - logger.info(`Embedded auth bootstrap admin rotation: replacing ${existingBootstrap.email}`) - await hardDeleteBootstrapUser(existingBootstrap, transaction) - } else { - const conflictingUser = await db.AuthUser.findOne({ - where: { email: normalizedUsername, deletedAt: null }, - transaction - }) - if (conflictingUser) { - 
logger.warn(`Embedded auth bootstrap skipped: user ${normalizedUsername} already exists and is not bootstrap`) - await meta.update({ - completedAt: new Date(), - bootstrapAdminUserId: conflictingUser.id - }, { transaction }) - if (ownTransaction) { - await transaction.commit() - } - return { skipped: true, reason: 'user_exists', userId: conflictingUser.id } - } + logger.info(`Embedded auth bootstrap admin rotation: replacing ${existingBootstrap.email}`) + await hardDeleteBootstrapUser(existingBootstrap, transaction) + } else { + const conflictingUser = await db.AuthUser.findOne({ + where: { email: normalizedUsername, deletedAt: null }, + transaction + }) + if (conflictingUser) { + logger.warn(`Embedded auth bootstrap skipped: user ${normalizedUsername} already exists and is not bootstrap`) + await meta.update({ + completedAt: new Date(), + bootstrapAdminUserId: conflictingUser.id + }, { transaction }) + return { skipped: true, reason: 'user_exists', userId: conflictingUser.id } } + } - const user = await createBootstrapUser(normalizedUsername, plainPassword, transaction, allowBootstrapLog) + const user = await createBootstrapUser(normalizedUsername, plainPassword, transaction, allowBootstrapLog) - await meta.update({ - completedAt: new Date(), - bootstrapAdminUserId: user.id - }, { transaction }) + await meta.update({ + completedAt: new Date(), + bootstrapAdminUserId: user.id + }, { transaction }) - if (ownTransaction) { - await transaction.commit() - } - return { skipped: false, userId: user.id, username: normalizedUsername } - } catch (error) { - if (ownTransaction) { - await transaction.rollback() - } - throw error + return { skipped: false, userId: user.id, username: normalizedUsername } +} + +async function runBootstrap (outerTransaction) { + if (outerTransaction) { + return runBootstrapInternal(outerTransaction) } + + return runInTransaction( + (transaction) => runBootstrapInternal(transaction), + { label: 'auth.bootstrap' } + ) } module.exports = { diff 
--git a/src/services/auth-interaction-service.js b/src/services/auth-interaction-service.js index 82ff8263..2f6d9401 100644 --- a/src/services/auth-interaction-service.js +++ b/src/services/auth-interaction-service.js @@ -10,6 +10,7 @@ const AuthPasswordService = require('./auth-password-service') const AuthMfaService = require('./auth-mfa-service') const AuthUserService = require('./auth-user-service') const InteractionStateStore = require('./auth-interaction-state-store') +const { runInTransaction, PRIORITY_INTERACTIVE } = require('../helpers/transaction-runner') function ensureEmbeddedMode () { if (getAuthMode() !== 'embedded') { @@ -129,7 +130,7 @@ async function verifyLoginCredentials (credentials, transaction) { return authContext } -async function getStatus (uid, transaction) { +async function getStatus (uid) { ensureEmbeddedMode() await findInteraction(uid) @@ -138,32 +139,36 @@ async function getStatus (uid, transaction) { return { step: 'login' } } - const authContext = await loadAuthContextByUserId(state.userId, transaction) - if (!authContext) { - await clearInteractionState(uid) - throw new Errors.AuthenticationError('Interaction session not found or expired') - } + return runInTransaction(async (transaction) => { + const authContext = await loadAuthContextByUserId(state.userId, transaction) + if (!authContext) { + await clearInteractionState(uid) + throw new Errors.AuthenticationError('Interaction session not found or expired') + } - return { step: resolveNextStep(authContext, state) } + return { step: resolveNextStep(authContext, state) } + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.status' }) } -async function submitLogin (uid, credentials, transaction) { +async function submitLogin (uid, credentials) { ensureEmbeddedMode() await findInteraction(uid) - const authContext = await verifyLoginCredentials(credentials, transaction) - const state = await setInteractionState(uid, { - userId: authContext.user.id, - mfaVerified: false, - 
enrollmentStarted: false, - enrollmentConfirmed: false, - passwordChanged: false - }) - - return { step: resolveNextStep(authContext, state) } + return runInTransaction(async (transaction) => { + const authContext = await verifyLoginCredentials(credentials, transaction) + const state = await setInteractionState(uid, { + userId: authContext.user.id, + mfaVerified: false, + enrollmentStarted: false, + enrollmentConfirmed: false, + passwordChanged: false + }) + + return { step: resolveNextStep(authContext, state) } + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.login' }) } -async function submitMfa (uid, code, transaction) { +async function submitMfa (uid, code) { ensureEmbeddedMode() await findInteraction(uid) @@ -172,14 +177,16 @@ async function submitMfa (uid, code, transaction) { throw new Errors.InvalidCredentialsError() } - await AuthMfaService.verifyMfaCode(state.userId, code, transaction) - const nextState = await setInteractionState(uid, { mfaVerified: true }) - const authContext = await loadAuthContextByUserId(state.userId, transaction) + return runInTransaction(async (transaction) => { + await AuthMfaService.verifyMfaCode(state.userId, code, transaction) + const nextState = await setInteractionState(uid, { mfaVerified: true }) + const authContext = await loadAuthContextByUserId(state.userId, transaction) - return { step: resolveNextStep(authContext, nextState) } + return { step: resolveNextStep(authContext, nextState) } + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.mfa' }) } -async function submitEnroll (uid, transaction) { +async function submitEnroll (uid) { ensureEmbeddedMode() await findInteraction(uid) @@ -188,17 +195,19 @@ async function submitEnroll (uid, transaction) { throw new Errors.InvalidCredentialsError() } - const enrollment = await AuthMfaService.enrollMfa(state.userId, transaction) - const nextState = await setInteractionState(uid, { enrollmentStarted: true }) + return runInTransaction(async (transaction) 
=> { + const enrollment = await AuthMfaService.enrollMfa(state.userId, transaction) + const nextState = await setInteractionState(uid, { enrollmentStarted: true }) - return { - step: resolveNextStep(await loadAuthContextByUserId(state.userId, transaction), nextState), - secret: enrollment.secret, - otpauthUrl: enrollment.otpauthUrl - } + return { + step: resolveNextStep(await loadAuthContextByUserId(state.userId, transaction), nextState), + secret: enrollment.secret, + otpauthUrl: enrollment.otpauthUrl + } + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.enroll' }) } -async function submitConfirmEnroll (uid, code, transaction) { +async function submitConfirmEnroll (uid, code) { ensureEmbeddedMode() await findInteraction(uid) @@ -207,20 +216,22 @@ async function submitConfirmEnroll (uid, code, transaction) { throw new Errors.InvalidCredentialsError() } - const result = await AuthMfaService.confirmMfa(state.userId, code, transaction) - const nextState = await setInteractionState(uid, { - enrollmentConfirmed: true, - mfaVerified: true - }) - const authContext = await loadAuthContextByUserId(state.userId, transaction) - - return { - step: resolveNextStep(authContext, nextState), - recoveryCodes: result.recoveryCodes - } + return runInTransaction(async (transaction) => { + const result = await AuthMfaService.confirmMfa(state.userId, code, transaction) + const nextState = await setInteractionState(uid, { + enrollmentConfirmed: true, + mfaVerified: true + }) + const authContext = await loadAuthContextByUserId(state.userId, transaction) + + return { + step: resolveNextStep(authContext, nextState), + recoveryCodes: result.recoveryCodes + } + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.confirmEnroll' }) } -async function submitChangePassword (uid, credentials, transaction) { +async function submitChangePassword (uid, credentials) { ensureEmbeddedMode() await findInteraction(uid) @@ -229,28 +240,30 @@ async function submitChangePassword (uid, 
credentials, transaction) { throw new Errors.InvalidCredentialsError() } - const authContext = await loadAuthContextByUserId(state.userId, transaction) - if (!authContext) { - await clearInteractionState(uid) - throw new Errors.AuthenticationError('Interaction session not found or expired') - } + return runInTransaction(async (transaction) => { + const authContext = await loadAuthContextByUserId(state.userId, transaction) + if (!authContext) { + await clearInteractionState(uid) + throw new Errors.AuthenticationError('Interaction session not found or expired') + } - const step = resolveNextStep(authContext, state) - if (step !== 'change-password') { - throw new Errors.ValidationError(`Interaction step "${step}" is required before password change`) - } + const step = resolveNextStep(authContext, state) + if (step !== 'change-password') { + throw new Errors.ValidationError(`Interaction step "${step}" is required before password change`) + } - await AuthUserService.changePasswordWithCurrent( - state.userId, - credentials.currentPassword, - credentials.newPassword, - transaction - ) + await AuthUserService.changePasswordWithCurrent( + state.userId, + credentials.currentPassword, + credentials.newPassword, + transaction + ) - const nextState = await setInteractionState(uid, { passwordChanged: true }) - const updatedContext = await loadAuthContextByUserId(state.userId, transaction) + const nextState = await setInteractionState(uid, { passwordChanged: true }) + const updatedContext = await loadAuthContextByUserId(state.userId, transaction) - return { step: resolveNextStep(updatedContext, nextState) } + return { step: resolveNextStep(updatedContext, nextState) } + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.changePassword' }) } async function buildConsentGrant (provider, interaction, accountId) { @@ -264,7 +277,7 @@ async function buildConsentGrant (provider, interaction, accountId) { return grantId } -async function complete (uid, req, res, transaction) { 
+async function complete (uid, req, res) { ensureEmbeddedMode() const interaction = await findInteraction(uid) @@ -273,16 +286,20 @@ async function complete (uid, req, res, transaction) { throw new Errors.InvalidCredentialsError() } - const authContext = await loadAuthContextByUserId(state.userId, transaction) - if (!authContext) { - await clearInteractionState(uid) - throw new Errors.AuthenticationError('Interaction session not found or expired') - } + const authContext = await runInTransaction(async (transaction) => { + const context = await loadAuthContextByUserId(state.userId, transaction) + if (!context) { + await clearInteractionState(uid) + throw new Errors.AuthenticationError('Interaction session not found or expired') + } - const step = resolveNextStep(authContext, state) - if (step !== 'complete') { - throw new Errors.ValidationError(`Interaction step "${step}" is required before completion`) - } + const step = resolveNextStep(context, state) + if (step !== 'complete') { + throw new Errors.ValidationError(`Interaction step "${step}" is required before completion`) + } + + return context + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.complete.validate' }) const provider = getProvider() const grantId = await buildConsentGrant(provider, interaction, state.userId) @@ -291,7 +308,10 @@ async function complete (uid, req, res, transaction) { consent: { grantId } }) - await AuthPolicyService.resetFailedLogin(authContext.user, transaction) + await runInTransaction(async (transaction) => { + await AuthPolicyService.resetFailedLogin(authContext.user, transaction) + }, { priority: PRIORITY_INTERACTIVE, label: 'auth.interaction.complete.reset-login' }) + await clearInteractionState(uid) return { redirectTo, step: 'complete' } diff --git a/src/services/auth-oauth-service.js b/src/services/auth-oauth-service.js index 85570347..fbf72fec 100644 --- a/src/services/auth-oauth-service.js +++ b/src/services/auth-oauth-service.js @@ -18,6 +18,8 @@ const { } = 
require('../config/oidc') const { getPublicUrl, getConsoleUrl } = require('../config/auth-urls') const { getSessionStoreTtlMs } = require('../config/auth-session-store') +const { withTransaction } = require('../helpers/app-helper') +const { runInTransaction, PRIORITY_INTERACTIVE } = require('../helpers/transaction-runner') const AuthTokenService = require('./auth-token-service') const OAUTH_SESSION_KEY = 'controllerOauth' @@ -93,7 +95,7 @@ function linkExternalUserByEmail (tokenResponse) { return email } -async function resolveEmbeddedUserFromTokenResponse (tokenResponse) { +async function resolveEmbeddedUserFromTokenResponse (tokenResponse, transaction) { if (!tokenResponse.id_token) { throw new Errors.AuthenticationError('OAuth response missing id_token') } @@ -104,13 +106,13 @@ async function resolveEmbeddedUserFromTokenResponse (tokenResponse) { throw new Errors.AuthenticationError('OAuth response missing subject') } - const user = await db.AuthUser.findByPk(userId, { + const user = await db.AuthUser.findByPk(userId, withTransaction(transaction, { include: [{ model: db.AuthGroup, as: 'groups', through: { attributes: [] } }] - }) + })) if (!user || user.deletedAt) { throw new Errors.AuthenticationError('OAuth user not found') @@ -173,13 +175,15 @@ async function callback (req) { const consoleUrl = getConsoleUrl() if (getAuthMode() === 'embedded') { - const user = await resolveEmbeddedUserFromTokenResponse(tokenResponse) - const groupNames = (user.groups || []).map((group) => group.name) - const tokens = await AuthTokenService.issueTokenPair(user, groupNames) - return { - tokens, - consoleUrl - } + return runInTransaction(async (transaction) => { + const user = await resolveEmbeddedUserFromTokenResponse(tokenResponse, transaction) + const groupNames = (user.groups || []).map((group) => group.name) + const tokens = await AuthTokenService.issueTokenPair(user, groupNames, transaction) + return { + tokens, + consoleUrl + } + }, { priority: PRIORITY_INTERACTIVE, 
label: 'auth.oauth.callback.embedded' }) } linkExternalUserByEmail(tokenResponse) diff --git a/src/services/controller-ms-service.js b/src/services/controller-ms-service.js index 9d2e92d5..af414d84 100644 --- a/src/services/controller-ms-service.js +++ b/src/services/controller-ms-service.js @@ -227,8 +227,11 @@ async function _updateImages (images, microserviceUuid, transaction) { await _createMicroserviceImages(microserviceUuid, images, transaction) } -async function _updatePorts (ports, microservice, transaction) { +async function _updatePorts (ports, microservice, fog, transaction) { await MicroservicePortService.deletePortMappings(microservice, transaction) + if (ports && ports.length) { + await MicroservicePortService.validatePortMappings({ ports, iofogUuid: fog.uuid }, transaction) + } for (const mapping of ports) { await MicroservicePortService.createPortMapping(microservice, mapping, transaction) } @@ -321,8 +324,7 @@ async function _updateControllerMicroservice (existing, registerData, fog, valid ) if (registerData.ports) { - await MicroservicePortService.validatePortMappings({ ports: registerData.ports, iofogUuid: fog.uuid }, transaction) - await _updatePorts(registerData.ports, updatedMicroservice, transaction) + await _updatePorts(registerData.ports, updatedMicroservice, fog, transaction) } if (registerData.volumeMappings) { @@ -382,8 +384,6 @@ async function registerControllerMicroservice (registerData, fog, transaction) { return { uuid: registerData.uuid } } -const bypassOptions = { bypassQueue: true } - module.exports = { - registerControllerMicroservice: TransactionDecorator.generateTransaction(registerControllerMicroservice, bypassOptions) + registerControllerMicroservice: TransactionDecorator.generateTransaction(registerControllerMicroservice) } diff --git a/src/services/event-service.js b/src/services/event-service.js index a6c31a1e..cdf26950 100644 --- a/src/services/event-service.js +++ b/src/services/event-service.js @@ -4,6 +4,7 @@ const 
logger = require('../logger') const Errors = require('../helpers/errors') const Validator = require('../schemas') const TransactionDecorator = require('../decorators/transaction-decorator') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') /** * Extract resource type from URL path @@ -342,6 +343,13 @@ async function createEvent (eventData, transaction) { return EventManager.create(eventRecord, transaction) } +async function persistAuditEvent (eventData) { + return runInTransaction( + (transaction) => createEvent(eventData, transaction), + { priority: PRIORITY_BACKGROUND, label: 'event.audit' } + ) +} + /** * Create event from HTTP request/response * @param {object} req - Express request object @@ -390,9 +398,8 @@ async function createHttpEvent (req, res, startTime) { requestId: req.id || null } - // Use fake transaction for non-blocking event creation - await createEvent(eventData, { fakeTransaction: true }).catch(err => { - logger.error('Event logging failed (non-blocking):', err) + await persistAuditEvent(eventData).catch(err => { + logger.error({ err }, 'Event logging failed (non-blocking)') }) } @@ -431,9 +438,8 @@ async function createWsConnectEvent (connectionData) { requestId: null } - // Use fake transaction for non-blocking event creation - await createEvent(eventData, { fakeTransaction: true }).catch(err => { - logger.error('WebSocket connect event logging failed (non-blocking):', err) + await persistAuditEvent(eventData).catch(err => { + logger.error({ err }, 'WebSocket connect event logging failed (non-blocking)') }) } @@ -473,9 +479,8 @@ async function createWsDisconnectEvent (connectionData) { requestId: null } - // Use fake transaction for non-blocking event creation - await createEvent(eventData, { fakeTransaction: true }).catch(err => { - logger.error('WebSocket disconnect event logging failed (non-blocking):', err) + await persistAuditEvent(eventData).catch(err => { + logger.error({ err }, 'WebSocket 
disconnect event logging failed (non-blocking)') }) } @@ -635,7 +640,7 @@ async function deleteEvents (params = {}, context = {}, transaction) { const endpointType = request.path && request.path.startsWith('/api/v3/agent/') ? 'agent' : 'user' const actorId = extractActorId(request) - await createEvent({ + await persistAuditEvent({ timestamp: Date.now(), eventType: 'HTTP', endpointType, @@ -649,11 +654,11 @@ async function deleteEvents (params = {}, context = {}, transaction) { statusCode: 200, statusMessage: days === 0 ? `Deleted all ${deletedCount} events` : `Deleted ${deletedCount} events older than ${days} days`, requestId: request.id || null - }, { fakeTransaction: true }).catch(err => { - logger.error('Failed to create DELETE events audit record (non-blocking):', err) + }).catch(err => { + logger.error({ err }, 'Failed to create DELETE events audit record (non-blocking)') }) } catch (error) { - logger.error('Error creating DELETE events audit record (non-blocking):', error) + logger.error({ err: error }, 'Error creating DELETE events audit record (non-blocking)') } }) diff --git a/src/services/microservice-ports/microservice-port.js b/src/services/microservice-ports/microservice-port.js index 9e541633..27ecedc6 100644 --- a/src/services/microservice-ports/microservice-port.js +++ b/src/services/microservice-ports/microservice-port.js @@ -14,9 +14,11 @@ async function _checkForDuplicatePorts (agent, localPort, transaction) { throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.PORT_RESERVED, localPort)) } - const microservices = await agent.getMicroservice() + // Read within the caller transaction so uncommitted deletes are visible (Plan 19). + const assocOptions = transaction != null ? 
{ transaction } : undefined + const microservices = await agent.getMicroservice(assocOptions) for (const microservice of microservices) { - const ports = await microservice.getPorts() + const ports = await microservice.getPorts(assocOptions) if (ports.find(port => port.portExternal === localPort)) { throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.PORT_NOT_AVAILABLE, localPort)) } diff --git a/src/services/microservices-service.js b/src/services/microservices-service.js index db10325f..fa35524a 100644 --- a/src/services/microservices-service.js +++ b/src/services/microservices-service.js @@ -35,7 +35,6 @@ const SecretManager = require('../data/managers/secret-manager') const VolumeMountService = require('./volume-mount-service') const RbacServiceAccountManager = require('../data/managers/rbac-service-account-manager') const RbacRoleManager = require('../data/managers/rbac-role-manager') -const RbacCacheVersionManager = require('../data/managers/rbac-cache-version-manager') const NatsAuthService = require('./nats-auth-service') const NatsUserRuleManager = require('../data/managers/nats-user-rule-manager') const NatsRuleJwtValidation = require('../helpers/nats-rule-jwt-validation') @@ -67,28 +66,19 @@ async function _createOrUpdateServiceAccountForMicroservice (microserviceUuid, m throw new Errors.ValidationError(`Referenced role '${roleName}' does not exist`) } - const roleRef = { - kind: 'Role', - name: roleName - } - - const existingServiceAccount = await RbacServiceAccountManager.findOneByMicroserviceUuid(microserviceUuid, transaction) - - if (existingServiceAccount) { - await RbacServiceAccountManager.update({ id: existingServiceAccount.id }, { roleRef, name: microserviceName }, transaction) - await RbacCacheVersionManager.incrementVersion(transaction) - return RbacServiceAccountManager.findOne({ id: existingServiceAccount.id }, transaction) - } - const microservice = await MicroserviceManager.findOne({ uuid: microserviceUuid }, 
transaction) if (!microservice || microservice.applicationId == null) { throw new Errors.ValidationError('Microservice or application not found for service account creation') } + return RbacServiceAccountManager.createServiceAccount({ microserviceUuid, applicationId: microservice.applicationId, name: microserviceName, - roleRef + roleRef: { + kind: 'Role', + name: roleName + } }, transaction) } @@ -1717,7 +1707,7 @@ async function createPortMappingEndPoint (microserviceUuid, portMappingData, isC if (!agent) { throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, microservice.iofogUuid)) } - await MicroservicePortService.validatePortMapping(agent, portMappingData, {}, transaction) + await MicroservicePortService.validatePortMapping(agent, portMappingData, transaction) return MicroservicePortService.createPortMapping(microservice, portMappingData, transaction) } @@ -1738,7 +1728,7 @@ async function createSystemPortMappingEndPoint (microserviceUuid, portMappingDat if (!agent) { throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, microservice.iofogUuid)) } - await MicroservicePortService.validatePortMapping(agent, portMappingData, {}, transaction) + await MicroservicePortService.validatePortMapping(agent, portMappingData, transaction) return MicroservicePortService.createPortMapping(microservice, portMappingData, transaction) } @@ -2728,15 +2718,13 @@ async function reconcileNatsForApplication (applicationId, transaction) { } } -const bypassOptions = { bypassQueue: true } - module.exports = { - createMicroserviceEndPoint: TransactionDecorator.generateTransaction(createMicroserviceEndPoint, bypassOptions), + createMicroserviceEndPoint: TransactionDecorator.generateTransaction(createMicroserviceEndPoint), createPortMappingEndPoint: TransactionDecorator.generateTransaction(createPortMappingEndPoint), createSystemPortMappingEndPoint: 
TransactionDecorator.generateTransaction(createSystemPortMappingEndPoint), createVolumeMappingEndPoint: TransactionDecorator.generateTransaction(createVolumeMappingEndPoint), createSystemVolumeMappingEndPoint: TransactionDecorator.generateTransaction(createSystemVolumeMappingEndPoint), - deleteMicroserviceEndPoint: TransactionDecorator.generateTransaction(deleteMicroserviceEndPoint, bypassOptions), + deleteMicroserviceEndPoint: TransactionDecorator.generateTransaction(deleteMicroserviceEndPoint), deleteMicroserviceWithRoutesAndPortMappings, deleteNotRunningMicroservices, deletePortMappingEndPoint: TransactionDecorator.generateTransaction(deletePortMappingEndPoint), @@ -2751,8 +2739,8 @@ module.exports = { listMicroservicesEndPoint: TransactionDecorator.generateTransaction(listMicroservicesEndPoint), listSystemMicroservicesEndPoint: TransactionDecorator.generateTransaction(listSystemMicroservicesEndPoint), listVolumeMappingsEndPoint: TransactionDecorator.generateTransaction(listVolumeMappingsEndPoint), - updateMicroserviceEndPoint: TransactionDecorator.generateTransaction(updateMicroserviceEndPoint, bypassOptions), - updateSystemMicroserviceEndPoint: TransactionDecorator.generateTransaction(updateSystemMicroserviceEndPoint, bypassOptions), + updateMicroserviceEndPoint: TransactionDecorator.generateTransaction(updateMicroserviceEndPoint), + updateSystemMicroserviceEndPoint: TransactionDecorator.generateTransaction(updateSystemMicroserviceEndPoint), updateMicroserviceConfigEndPoint: TransactionDecorator.generateTransaction(updateMicroserviceConfigEndPoint), getMicroserviceConfigEndPoint: TransactionDecorator.generateTransaction(getMicroserviceConfigEndPoint), getSystemMicroserviceConfigEndPoint: TransactionDecorator.generateTransaction(getSystemMicroserviceConfigEndPoint), @@ -2765,7 +2753,7 @@ module.exports = { updateChangeTracking: _updateChangeTracking, startMicroserviceEndPoint: TransactionDecorator.generateTransaction(startMicroserviceEndPoint), 
stopMicroserviceEndPoint: TransactionDecorator.generateTransaction(stopMicroserviceEndPoint), - reconcileNatsForApplication: TransactionDecorator.generateTransaction(reconcileNatsForApplication, bypassOptions), + reconcileNatsForApplication: TransactionDecorator.generateTransaction(reconcileNatsForApplication), injectServiceAccountVolume: _injectServiceAccountVolume, stripUserServiceAccountVolumeMappings: _stripUserServiceAccountVolumeMappings, createOrUpdateServiceAccountForMicroservice: _createOrUpdateServiceAccountForMicroservice diff --git a/src/services/nats-relay-connection-manager.js b/src/services/nats-relay-connection-manager.js index a85a0d25..2b2d139c 100644 --- a/src/services/nats-relay-connection-manager.js +++ b/src/services/nats-relay-connection-manager.js @@ -8,6 +8,7 @@ const NatsAccountManager = require('../data/managers/nats-account-manager') const NatsUserManager = require('../data/managers/nats-user-manager') const NatsAuthService = require('./nats-auth-service') const SecretService = require('./secret-service') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const NATS_DEFAULT_PORT = 4222 @@ -16,7 +17,6 @@ class NatsRelayConnectionManager { this._connectFn = deps.connectFn || connect this._config = deps.config || config this.maxReconnectAttempts = deps.maxReconnectAttempts ?? -1 - this.fakeTransaction = { fakeTransaction: true } this.connection = null this.connectionPromise = null this.cachedHubRecord = null @@ -210,7 +210,10 @@ class NatsRelayConnectionManager { if (this.cachedHubRecord) { return this.cachedHubRecord } - const hub = await NatsInstanceManager.findOne({ isHub: true }, this.fakeTransaction) + const hub = await runInTransaction( + (transaction) => NatsInstanceManager.findOne({ isHub: true }, transaction), + { priority: PRIORITY_BACKGROUND, label: 'nats-relay-hub-record' } + ) if (!hub) { throw new Error('NATS hub not found. 
Ensure a hub NatsInstances row with isHub=true exists.') } @@ -240,53 +243,59 @@ class NatsRelayConnectionManager { } async _ensureControllerNatsAccount () { - const hub = await NatsInstanceManager.findOne({ isHub: true }, this.fakeTransaction) - if (!hub) { - return - } - await NatsAuthService.ensureControllerNatsAccount() + await runInTransaction(async (transaction) => { + const hub = await NatsInstanceManager.findOne({ isHub: true }, transaction) + if (!hub) { + return + } + await NatsAuthService.ensureControllerNatsAccount(transaction, { triggerReconcile: false }) + }, { priority: PRIORITY_BACKGROUND, label: 'nats-relay-ensure-controller-account' }) } async _fetchControllerRelayCreds () { - const account = await NatsAccountManager.findOne({ - name: NatsAuthService.CONTROLLER_NATS_ACCOUNT_NAME, - applicationId: null, - isSystem: false, - isLeafSystem: false - }, this.fakeTransaction) - - let credsSecretName = NatsAuthService.controllerNatsCredsSecretName() - if (account) { - const user = await NatsUserManager.findOne({ - accountId: account.id, - name: NatsAuthService.CONTROLLER_NATS_USER_NAME - }, this.fakeTransaction) - if (user && user.credsSecretName) { - credsSecretName = user.credsSecretName + return runInTransaction(async (transaction) => { + const foundAccount = await NatsAccountManager.findOne({ + name: NatsAuthService.CONTROLLER_NATS_ACCOUNT_NAME, + applicationId: null, + isSystem: false, + isLeafSystem: false + }, transaction) + + let foundUser = null + if (foundAccount) { + foundUser = await NatsUserManager.findOne({ + accountId: foundAccount.id, + name: NatsAuthService.CONTROLLER_NATS_USER_NAME + }, transaction) } - } - const secret = await this._safeGetSecret(credsSecretName) - if (!secret || !secret.data) { - throw new Error(`Controller relay NATS creds secret not found: ${credsSecretName}`) - } + let credsSecretName = NatsAuthService.controllerNatsCredsSecretName() + if (foundUser && foundUser.credsSecretName) { + credsSecretName = 
foundUser.credsSecretName + } - const credsKey = Object.keys(secret.data).find((key) => key.endsWith('.creds')) || 'creds' - const raw = secret.data[credsKey] - if (!raw) { - throw new Error(`Missing creds payload in secret ${credsSecretName}`) - } + const secret = await this._safeGetSecret(credsSecretName, transaction) + if (!secret || !secret.data) { + throw new Error(`Controller relay NATS creds secret not found: ${credsSecretName}`) + } + + const credsKey = Object.keys(secret.data).find((key) => key.endsWith('.creds')) || 'creds' + const raw = secret.data[credsKey] + if (!raw) { + throw new Error(`Missing creds payload in secret ${credsSecretName}`) + } - const credsText = typeof raw === 'string' - ? raw - : Buffer.from(raw, 'base64').toString('utf8') + const credsText = typeof raw === 'string' + ? raw + : Buffer.from(raw, 'base64').toString('utf8') - return new TextEncoder().encode(credsText) + return new TextEncoder().encode(credsText) + }, { priority: PRIORITY_BACKGROUND, label: 'nats-relay-fetch-creds-db' }) } - async _safeGetSecret (name) { + async _safeGetSecret (name, transaction) { try { - return await SecretService.getSecretEndpoint(name) + return await SecretService.getSecretEndpoint(name, transaction) } catch (error) { if (error.name === 'NotFoundError') { logger.debug({ secret: name }, '[NATS][RELAY] Secret not found') diff --git a/src/services/rbac-service.js b/src/services/rbac-service.js index f6c94413..5263eb63 100644 --- a/src/services/rbac-service.js +++ b/src/services/rbac-service.js @@ -87,6 +87,19 @@ async function createRoleEndpoint (roleData, transaction) { } } +/** + * Build roleRef with the canonical role name after a role update (handles rename). 
+ * @param {Object|null|undefined} existingRef - Existing roleRef from binding or service account + * @param {string} updatedRoleName - Canonical role name post-update + * @returns {Object} roleRef object + */ +function refreshedRoleRef (existingRef, updatedRoleName) { + return { + kind: (existingRef && existingRef.kind) || 'Role', + name: updatedRoleName + } +} + async function updateRoleEndpoint (name, roleData, transaction) { // Validate schema await Validator.validate(roleData, Validator.schemas.roleUpdate) @@ -123,9 +136,9 @@ async function updateRoleEndpoint (name, roleData, transaction) { // Find all role bindings that reference this role using roleId for efficient querying const bindings = await RbacRoleBindingManager.findAll({ roleId }, transaction) for (const binding of bindings) { - // Trigger update to refresh cache and ensure roleId is set + // Trigger update to refresh cache, roleId, and roleRef.name (including on rename) await RbacRoleBindingManager.updateRoleBinding(binding.name, { - roleRef: binding.roleRef + roleRef: refreshedRoleRef(binding.roleRef, updatedRoleName) }, transaction) } @@ -136,7 +149,7 @@ async function updateRoleEndpoint (name, roleData, transaction) { const appName = application ? 
application.name : null if (appName) { await RbacServiceAccountManager.updateServiceAccount(appName, sa.name, { - roleRef: sa.roleRef + roleRef: refreshedRoleRef(sa.roleRef, updatedRoleName) }, transaction) } } diff --git a/src/services/router-connection-manager.js b/src/services/router-connection-manager.js index 22d7aaf6..edff2696 100644 --- a/src/services/router-connection-manager.js +++ b/src/services/router-connection-manager.js @@ -7,6 +7,7 @@ const RouterManager = require('../data/managers/router-manager') const CertificateService = require('./certificate-service') const SecretService = require('./secret-service') const os = require('os') +const { runInTransaction } = require('../helpers/transaction-runner') const CONTROLLER_CERT_PREFIX = 'controller-exec-session-client' const hostname = process.env.HOSTNAME || os.hostname() @@ -59,7 +60,6 @@ class RouterConnectionManager { this.certificatePromise = null this.cachedCertificate = null this.cachedRouterRecord = null - this.fakeTransaction = { fakeTransaction: true } this.slots = Array.from({ length: this.poolSize }, (_, slotId) => new PoolSlot(this, slotId)) this.recoveryListeners = [] this.saturationCount = 0 @@ -502,7 +502,10 @@ class RouterConnectionManager { if (this.cachedRouterRecord) { return this.cachedRouterRecord } - const router = await RouterManager.findOne({ isDefault: true }, this.fakeTransaction) + const router = await runInTransaction( + (transaction) => RouterManager.findOne({ isDefault: true }, transaction), + { label: 'router-connection-default-router' } + ) if (!router) { throw new Error('Default router not found. 
Please ensure default router is provisioned.') } @@ -536,42 +539,46 @@ class RouterConnectionManager { async _createControllerCertificate () { logger.debug('[AMQP] Ensuring controller certificate secret exists', { name: CONTROLLER_CERT_NAME }) - await CertificateService.ensureRouterLocalCA(this.fakeTransaction) - const existingSecret = await this._safeGetSecret(CONTROLLER_CERT_NAME) const caName = Constants.DEFAULT_ROUTER_LOCAL_CA - if (existingSecret) { - const caSecret = await this._safeGetSecret(caName) - const bundle = this._decodeCertificate(existingSecret, caSecret) - logger.debug({ msg: '[AMQP] Using existing controller-exec-session-client certificate', ca: caName }) - return bundle - } - const hosts = this._buildControllerHosts() - logger.debug({ msg: '[AMQP] Generating controller-exec-session-client certificate', hosts, ca: caName }) + return runInTransaction(async (transaction) => { + await CertificateService.ensureRouterLocalCA(transaction) - try { - await CertificateService.createCertificateEndpoint({ - name: CONTROLLER_CERT_NAME, - subject: CONTROLLER_CERT_NAME, - hosts: hosts.join(','), - ca: { - type: 'direct', - secretName: caName - }, - expiration: 36 - }) - } catch (error) { - logger.error({ err: error, ca: caName, msg: '[AMQP] Failed to create controller certificate' }) - throw error - } + const existingSecret = await this._safeGetSecret(CONTROLLER_CERT_NAME, transaction) + if (existingSecret) { + const caSecret = await this._safeGetSecret(caName, transaction) + const bundle = this._decodeCertificate(existingSecret, caSecret) + logger.debug({ msg: '[AMQP] Using existing controller-exec-session-client certificate', ca: caName }) + return bundle + } - const certSecret = await this._safeGetSecret(CONTROLLER_CERT_NAME) - const caSecret = await this._safeGetSecret(caName) - if (!certSecret || !caSecret) { - throw new Error('Controller certificate creation succeeded but secret not found') - } - logger.debug({ msg: '[AMQP] controller-exec-session-client 
certificate generated successfully', ca: caName }) - return this._decodeCertificate(certSecret, caSecret) + const hosts = this._buildControllerHosts() + logger.debug({ msg: '[AMQP] Generating controller-exec-session-client certificate', hosts, ca: caName }) + + try { + await CertificateService.createCertificateEndpoint({ + name: CONTROLLER_CERT_NAME, + subject: CONTROLLER_CERT_NAME, + hosts: hosts.join(','), + ca: { + type: 'direct', + secretName: caName + }, + expiration: 36 + }, transaction) + } catch (error) { + logger.error({ err: error, ca: caName, msg: '[AMQP] Failed to create controller certificate' }) + throw error + } + + const certSecret = await this._safeGetSecret(CONTROLLER_CERT_NAME, transaction) + const caSecret = await this._safeGetSecret(caName, transaction) + if (!certSecret || !caSecret) { + throw new Error('Controller certificate creation succeeded but secret not found') + } + logger.debug({ msg: '[AMQP] controller-exec-session-client certificate generated successfully', ca: caName }) + return this._decodeCertificate(certSecret, caSecret) + }, { label: 'router-connection-controller-cert' }) } _buildControllerHosts () { @@ -604,9 +611,9 @@ class RouterConnectionManager { } } - async _safeGetSecret (name) { + async _safeGetSecret (name, transaction) { try { - return await SecretService.getSecretEndpoint(name) + return await SecretService.getSecretEndpoint(name, transaction) } catch (error) { if (error.name === 'NotFoundError') { logger.debug('[AMQP] Secret not found', { secret: name }) diff --git a/src/services/user-service.js b/src/services/user-service.js index d08f69b9..11a76ec8 100644 --- a/src/services/user-service.js +++ b/src/services/user-service.js @@ -38,64 +38,87 @@ function ensureEmbeddedMode () { } } -const login = async function (credentials, isCLI, transaction) { +const loginEmbedded = TransactionDecorator.generateTransaction(async function (credentials, isCLI, transaction) { + return AuthLoginService.login(credentials, transaction) 
+}) + +async function loginExternal (credentials) { + const oidcConfig = await getOidcConfiguration() + const parameters = { + username: credentials.email, + password: credentials.password + } + if (credentials.totp) { + parameters.totp = credentials.totp + } + + const tokenResponse = await genericGrantRequest(oidcConfig, 'password', parameters) + return tokensFromResponse(tokenResponse) +} + +const login = async function (credentials, isCLI) { ensureAuthConfigured() if (getAuthMode() === 'embedded') { - return AuthLoginService.login(credentials, transaction) + return loginEmbedded(credentials, isCLI) } try { - const oidcConfig = await getOidcConfiguration() - const parameters = { - username: credentials.email, - password: credentials.password - } - if (credentials.totp) { - parameters.totp = credentials.totp - } - - const tokenResponse = await genericGrantRequest(oidcConfig, 'password', parameters) - return tokensFromResponse(tokenResponse) + return await loginExternal(credentials) } catch (error) { mapOidcError(error) } } -const refresh = async function (credentials, isCLI, transaction) { +const refreshEmbedded = TransactionDecorator.generateTransaction(async function (credentials, isCLI, transaction) { + return AuthLoginService.refresh(credentials, transaction) +}) + +async function refreshExternal (credentials) { + const oidcConfig = await getOidcConfiguration() + const tokenResponse = await refreshTokenGrant(oidcConfig, credentials.refreshToken) + return tokensFromResponse(tokenResponse) +} + +const refresh = async function (credentials, isCLI) { ensureAuthConfigured() if (getAuthMode() === 'embedded') { - return AuthLoginService.refresh(credentials, transaction) + return refreshEmbedded(credentials, isCLI) } try { - const oidcConfig = await getOidcConfiguration() - const tokenResponse = await refreshTokenGrant(oidcConfig, credentials.refreshToken) - return tokensFromResponse(tokenResponse) + return await refreshExternal(credentials) } catch (error) { 
mapOidcError(error) } } -const profile = async function (req, isCLI, transaction) { +const profileEmbedded = TransactionDecorator.generateTransaction(async function (req, isCLI, transaction) { + return AuthLoginService.profile(req, transaction) +}) + +async function profileExternal (req) { + const accessToken = req.headers.authorization.replace('Bearer ', '') + const oidcConfig = await getOidcConfiguration() + const claims = decodeJwt(accessToken) + const subject = claims.sub + if (!subject) { + throw new Errors.InvalidCredentialsError('Invalid credentials') + } + + return fetchUserInfo(oidcConfig, accessToken, subject) +} + +const profile = async function (req, isCLI) { ensureAuthConfigured() if (getAuthMode() === 'embedded') { - return AuthLoginService.profile(req, transaction) + return profileEmbedded(req, isCLI) } - const accessToken = req.headers.authorization.replace('Bearer ', '') - try { - const oidcConfig = await getOidcConfiguration() - const claims = decodeJwt(accessToken) - const subject = claims.sub - if (!subject) { - throw new Errors.InvalidCredentialsError('Invalid credentials') - } - - return await fetchUserInfo(oidcConfig, accessToken, subject) + return await profileExternal(req) } catch (error) { if (error instanceof Errors.InvalidCredentialsError) { throw error @@ -104,13 +127,11 @@ const profile = async function (req, isCLI, transaction) { } } -const logout = async function (req, isCLI, transaction) { - ensureAuthConfigured() - - if (getAuthMode() === 'embedded') { - return AuthLoginService.logout(req, transaction) - } +const logoutEmbedded = TransactionDecorator.generateTransaction(async function (req, isCLI, transaction) { + return AuthLoginService.logout(req, transaction) +}) +async function logoutExternal (req) { const accessToken = req.headers.authorization.replace('Bearer ', '') try { @@ -126,7 +147,17 @@ const logout = async function (req, isCLI, transaction) { return { status: 'success' } } -const enrollMfa = async function (req, isCLI, 
transaction) { +const logout = async function (req, isCLI) { + ensureAuthConfigured() + + if (getAuthMode() === 'embedded') { + return logoutEmbedded(req, isCLI) + } + + return logoutExternal(req) +} + +const enrollMfa = TransactionDecorator.generateTransaction(async function (req, isCLI, transaction) { ensureAuthConfigured() ensureEmbeddedMode() @@ -136,9 +167,9 @@ const enrollMfa = async function (req, isCLI, transaction) { const userId = req.kauth.grant.access_token.content.sub return AuthMfaService.enrollMfa(userId, transaction) -} +}) -const confirmMfa = async function (req, isCLI, transaction) { +const confirmMfa = TransactionDecorator.generateTransaction(async function (req, isCLI, transaction) { ensureAuthConfigured() ensureEmbeddedMode() @@ -148,17 +179,17 @@ const confirmMfa = async function (req, isCLI, transaction) { const userId = req.kauth.grant.access_token.content.sub return AuthMfaService.confirmMfa(userId, req.body.code, transaction) -} +}) -const disableMfa = async function (req, isCLI, transaction) { +const disableMfa = TransactionDecorator.generateTransaction(async function (req, isCLI, transaction) { ensureAuthConfigured() ensureEmbeddedMode() const userId = req.kauth.grant.access_token.content.sub return AuthMfaService.disableMfa(userId, req.body.password, req.body.code, transaction) -} +}) -const changePassword = async function (req, payload, isCLI, transaction) { +const changePassword = TransactionDecorator.generateTransaction(async function (req, payload, isCLI, transaction) { ensureAuthConfigured() if (getAuthMode() === 'embedded') { @@ -170,76 +201,76 @@ const changePassword = async function (req, payload, isCLI, transaction) { } throw new Errors.NotImplementedError('Password change is only supported in embedded auth mode') -} +}) -const oauthAuthorize = async function (req, isCLI, transaction) { +const oauthAuthorize = async function (req, isCLI) { ensureAuthConfigured() return AuthOauthService.authorize(req) } -const oauthCallback = 
async function (req, isCLI, transaction) { +const oauthCallback = async function (req, isCLI) { ensureAuthConfigured() return AuthOauthService.callback(req) } -const interactionStatus = async function (uid, isCLI, transaction) { +const interactionStatus = async function (uid, isCLI) { ensureAuthConfigured() ensureEmbeddedMode() - return AuthInteractionService.getStatus(uid, transaction) + return AuthInteractionService.getStatus(uid) } -const interactionLogin = async function (uid, credentials, isCLI, transaction) { +const interactionLogin = async function (uid, credentials, isCLI) { ensureAuthConfigured() ensureEmbeddedMode() - return AuthInteractionService.submitLogin(uid, credentials, transaction) + return AuthInteractionService.submitLogin(uid, credentials) } -const interactionMfa = async function (uid, code, isCLI, transaction) { +const interactionMfa = async function (uid, code, isCLI) { ensureAuthConfigured() ensureEmbeddedMode() - return AuthInteractionService.submitMfa(uid, code, transaction) + return AuthInteractionService.submitMfa(uid, code) } -const interactionEnroll = async function (uid, isCLI, transaction) { +const interactionEnroll = async function (uid, isCLI) { ensureAuthConfigured() ensureEmbeddedMode() - return AuthInteractionService.submitEnroll(uid, transaction) + return AuthInteractionService.submitEnroll(uid) } -const interactionConfirmEnroll = async function (uid, code, isCLI, transaction) { +const interactionConfirmEnroll = async function (uid, code, isCLI) { ensureAuthConfigured() ensureEmbeddedMode() - return AuthInteractionService.submitConfirmEnroll(uid, code, transaction) + return AuthInteractionService.submitConfirmEnroll(uid, code) } -const interactionChangePassword = async function (uid, payload, isCLI, transaction) { +const interactionChangePassword = async function (uid, payload, isCLI) { ensureAuthConfigured() ensureEmbeddedMode() - return AuthInteractionService.submitChangePassword(uid, payload, transaction) + return 
AuthInteractionService.submitChangePassword(uid, payload) } -const interactionComplete = async function (uid, req, res, isCLI, transaction) { +const interactionComplete = async function (uid, req, res, isCLI) { ensureAuthConfigured() ensureEmbeddedMode() - return AuthInteractionService.complete(uid, req, res, transaction) + return AuthInteractionService.complete(uid, req, res) } module.exports = { - login: TransactionDecorator.generateTransaction(login), - refresh: TransactionDecorator.generateTransaction(refresh), - profile: TransactionDecorator.generateTransaction(profile), - logout: TransactionDecorator.generateTransaction(logout), - enrollMfa: TransactionDecorator.generateTransaction(enrollMfa), - confirmMfa: TransactionDecorator.generateTransaction(confirmMfa), - disableMfa: TransactionDecorator.generateTransaction(disableMfa), - changePassword: TransactionDecorator.generateTransaction(changePassword), - oauthAuthorize: TransactionDecorator.generateTransaction(oauthAuthorize), - oauthCallback: TransactionDecorator.generateTransaction(oauthCallback), - interactionStatus: TransactionDecorator.generateTransaction(interactionStatus), - interactionLogin: TransactionDecorator.generateTransaction(interactionLogin), - interactionMfa: TransactionDecorator.generateTransaction(interactionMfa), - interactionEnroll: TransactionDecorator.generateTransaction(interactionEnroll), - interactionConfirmEnroll: TransactionDecorator.generateTransaction(interactionConfirmEnroll), - interactionChangePassword: TransactionDecorator.generateTransaction(interactionChangePassword), - interactionComplete: TransactionDecorator.generateTransaction(interactionComplete) + login, + refresh, + profile, + logout, + enrollMfa, + confirmMfa, + disableMfa, + changePassword, + oauthAuthorize, + oauthCallback, + interactionStatus, + interactionLogin, + interactionMfa, + interactionEnroll, + interactionConfirmEnroll, + interactionChangePassword, + interactionComplete } diff --git 
a/src/services/volume-mount-service.js b/src/services/volume-mount-service.js index 2d2763de..72c225c5 100644 --- a/src/services/volume-mount-service.js +++ b/src/services/volume-mount-service.js @@ -18,7 +18,7 @@ async function findVolumeMountedFogNodes (volumeMountName, transaction) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.VOLUME_MOUNT_NOT_FOUND, volumeMountName)) } - const fogs = await volumeMount.getFogs({}, transaction) + const fogs = await volumeMount.getFogs({ transaction }) return fogs.map(fog => fog.uuid) } @@ -158,7 +158,7 @@ async function linkVolumeMountEndpoint (name, fogUuids, transaction) { if (!agent) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.NOT_FOUND_AGENT_NAME, fogUuid)) } - await agent.addVolumeMount(volumeMount.uuid, transaction) + await agent.addVolumeMount(volumeMount, { transaction }) } const newlyLinked = fogUuids.filter((uuid) => !alreadyLinked.has(uuid)) @@ -179,7 +179,7 @@ async function unlinkVolumeMountEndpoint (name, fogUuids, transaction) { if (!agent) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.NOT_FOUND_AGENT_NAME, fogUuid)) } - await agent.removeVolumeMount(volumeMount.uuid, transaction) + await agent.removeVolumeMount(volumeMount, { transaction }) } // Update change tracking for all unlinked fog nodes diff --git a/src/websocket/exec-session-manager.js b/src/websocket/exec-session-manager.js index 94d68dd0..c4c11a3c 100644 --- a/src/websocket/exec-session-manager.js +++ b/src/websocket/exec-session-manager.js @@ -1,5 +1,6 @@ const WebSocket = require('ws') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const MicroserviceExecSessionManager = require('../data/managers/microservice-exec-session-manager') const ChangeTrackingService = require('../services/change-tracking-service') const FogManager = require('../data/managers/iofog-manager') @@ -166,16 +167,9 @@ class 
ExecSessionManager { const interval = this.config.session.cleanupInterval || 30000 this.cleanupInterval = setInterval(async () => { try { - const models = require('../data/models') - const sequelize = models.sequelize - if (!sequelize) { - logger.warn('Sequelize not available, skipping exec session cleanup') - return - } - - await sequelize.transaction(async (transaction) => { + await runInTransaction(async (transaction) => { await this.cleanupExpiredSessions(transaction) - }) + }, { priority: PRIORITY_BACKGROUND, label: 'ws.execSessionCleanup' }) } catch (error) { logger.error('Error during exec session cleanup:' + JSON.stringify({ error: error.message, diff --git a/src/websocket/log-session-manager.js b/src/websocket/log-session-manager.js index a84b63ec..2f55d08a 100644 --- a/src/websocket/log-session-manager.js +++ b/src/websocket/log-session-manager.js @@ -1,5 +1,6 @@ const WebSocket = require('ws') const logger = require('../logger') +const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') const MicroserviceLogStatusManager = require('../data/managers/microservice-log-status-manager') const FogLogStatusManager = require('../data/managers/fog-log-status-manager') const ChangeTrackingService = require('../services/change-tracking-service') @@ -190,16 +191,9 @@ class LogSessionManager { const interval = this.config.session.cleanupInterval || 30000 // Default 30 seconds this.cleanupInterval = setInterval(async () => { try { - const models = require('../data/models') - const sequelize = models.sequelize - if (!sequelize) { - logger.warn('Sequelize not available, skipping log session cleanup') - return - } - - await sequelize.transaction(async (transaction) => { + await runInTransaction(async (transaction) => { await this.cleanupExpiredSessions(transaction) - }) + }, { priority: PRIORITY_BACKGROUND, label: 'ws.logSessionCleanup' }) } catch (error) { logger.error('Error during log session cleanup:' + JSON.stringify({ error: 
error.message, diff --git a/src/websocket/server.js b/src/websocket/server.js index 88754bf2..75f7af27 100644 --- a/src/websocket/server.js +++ b/src/websocket/server.js @@ -174,9 +174,7 @@ class WebSocketServer { if (!session || !session.user || !session.agent) return session.activationSent = false try { - await TransactionDecorator.generateTransaction(async (tx) => { - await this.sendExecActivationToExecSession(session, sessionId, tx) - })() + await this.sendExecActivationToExecSession(session, sessionId) } catch (error) { logger.error('[RELAY] Failed to resend exec activation after relay recovery', { sessionId, @@ -408,6 +406,31 @@ class WebSocketServer { return true } + _scheduleRelaySetupAfterCommit (label, setupFn) { + setImmediate(async () => { + try { + await setupFn() + } catch (error) { + logger.error(`Failed to ${label}:` + JSON.stringify({ + error: error.message, + stack: error.stack + })) + } + }) + } + + async _cleanupLogSessionInTransaction (sessionId) { + await TransactionDecorator.generateTransaction(async (transaction) => { + await this.cleanupLogSession(sessionId, transaction) + }, { label: 'ws.log.cleanup' })() + } + + async _cleanupExecSessionInTransaction (sessionId) { + await TransactionDecorator.generateTransaction(async (transaction) => { + await this.cleanupExecSession(sessionId, transaction) + }, { label: 'ws.exec.cleanup' })() + } + async countLogSessionsInDb (microserviceUuid, fogUuid, transaction) { if (microserviceUuid) { const rows = await MicroserviceLogStatusManager.findAll({ microserviceUuid }, transaction) @@ -899,7 +922,10 @@ class WebSocketServer { })) } - await this.setupExecMessageForwarding(sessionId, transaction) + this._scheduleRelaySetupAfterCommit( + 'setup exec message forwarding', + () => this.setupExecMessageForwarding(sessionId) + ) const EXEC_PENDING_TIMEOUT = this.getExecPendingTimeoutMs() const pendingTimer = setTimeout(async () => { @@ -1055,7 +1081,10 @@ class WebSocketServer { session.activationSent = false } - 
await this.setupExecMessageForwarding(sessionId, transaction) + this._scheduleRelaySetupAfterCommit( + 'setup exec message forwarding', + () => this.setupExecMessageForwarding(sessionId) + ) if (session.user && session.user.readyState === WebSocket.OPEN) { try { @@ -1224,7 +1253,7 @@ class WebSocketServer { // return noisePatterns.some(pattern => pattern.test(output)) // } - async sendExecActivationToExecSession (session, sessionId, transaction) { + async sendExecActivationToExecSession (session, sessionId) { if (!session.user || !session.agent) { return false } @@ -1261,7 +1290,7 @@ class WebSocketServer { microserviceUuid: session.microserviceUuid })) if (session.agent) { - await this.cleanupExecSession(sessionId, transaction) + await this._cleanupExecSessionInTransaction(sessionId) } } return success @@ -1271,7 +1300,7 @@ class WebSocketServer { error: error.message })) if (session.agent) { - await this.cleanupExecSession(sessionId, transaction) + await this._cleanupExecSessionInTransaction(sessionId) } return false } @@ -1926,8 +1955,11 @@ class WebSocketServer { })) } - // 9. Setup message forwarding (will be activated when agent connects) - await this.setupLogMessageForwarding(sessionId, transaction) + // 9. Relay setup after DB transaction commits (NATS hub lookup uses background writes). 
+ this._scheduleRelaySetupAfterCommit( + 'setup log message forwarding', + () => this.setupLogMessageForwarding(sessionId) + ) // Pending timeout: close if agent does not connect within logPendingTimeoutMs const LOG_PENDING_TIMEOUT = this.getLogPendingTimeoutMs() @@ -2176,12 +2208,12 @@ class WebSocketServer { if (msg.type === MESSAGE_TYPES.LOG_LINE) { // Forward to user (one-to-one, like exec sessions) - await this.forwardLogToUser(sessionId, buffer, transaction) + await this.forwardLogToUser(sessionId, buffer) } else if (msg.type === MESSAGE_TYPES.LOG_START || msg.type === MESSAGE_TYPES.LOG_STOP || msg.type === MESSAGE_TYPES.LOG_ERROR) { // Handle control messages - await this.forwardLogToUser(sessionId, buffer, transaction) + await this.forwardLogToUser(sessionId, buffer) } }) @@ -2230,8 +2262,11 @@ class WebSocketServer { } } - // 8. Setup message forwarding (unidirectional: agent → user, one-to-one) - await this.setupLogMessageForwarding(sessionId, transaction) + // 8. Relay setup after DB transaction commits (NATS hub lookup uses background writes). + this._scheduleRelaySetupAfterCommit( + 'setup log message forwarding', + () => this.setupLogMessageForwarding(sessionId) + ) // 9. 
Record WebSocket connection event (non-blocking) setImmediate(async () => { @@ -2352,7 +2387,7 @@ class WebSocketServer { } } - async setupLogMessageForwarding (sessionId, transaction) { + async setupLogMessageForwarding (sessionId) { const session = this.logSessionManager.getLogSession(sessionId) if (!session) { logger.warn('setupLogMessageForwarding: Session not found:' + JSON.stringify({ sessionId })) @@ -2360,8 +2395,8 @@ class WebSocketServer { } // Enable queue bridge for cross-replica support (one-to-one, like exec sessions) - await this.relayTransport.enableForLogSession(session, (sessionId) => { - this.cleanupLogSession(sessionId, transaction) + await this.relayTransport.enableForLogSession(session, (closedSessionId) => { + this._cleanupLogSessionInTransaction(closedSessionId) }) // ONLY agent → user forwarding (unidirectional, one-to-one) @@ -2407,12 +2442,12 @@ class WebSocketServer { if (msg.type === MESSAGE_TYPES.LOG_LINE) { // Forward to user (one-to-one, like exec sessions) - await this.forwardLogToUser(sessionId, buffer, transaction) + await this.forwardLogToUser(sessionId, buffer) } else if (msg.type === MESSAGE_TYPES.LOG_START || msg.type === MESSAGE_TYPES.LOG_STOP || msg.type === MESSAGE_TYPES.LOG_ERROR) { // Handle control messages - await this.forwardLogToUser(sessionId, buffer, transaction) + await this.forwardLogToUser(sessionId, buffer) } }) } else { @@ -2456,7 +2491,7 @@ class WebSocketServer { return true } - async forwardLogToUser (sessionId, buffer, transaction) { + async forwardLogToUser (sessionId, buffer) { const session = this.logSessionManager.getLogSession(sessionId) if (!session) { logger.warn('forwardLogToUser: Session not found:' + JSON.stringify({ sessionId })) @@ -2523,7 +2558,7 @@ class WebSocketServer { await this.relayTransport.cleanupLogSession(sessionId) } - async setupExecMessageForwarding (sessionId, transaction) { + async setupExecMessageForwarding (sessionId) { const session = 
this.execSessionManager.getExecSession(sessionId) if (!session) { logger.warn('setupExecMessageForwarding: Session not found:' + JSON.stringify({ sessionId })) @@ -2541,7 +2576,7 @@ class WebSocketServer { clearTimeout(timeout) this.pendingCloseTimeouts.delete(closeExecId) } - await this.cleanupExecSession(closeExecId, transaction) + await this._cleanupExecSessionInTransaction(closeExecId) }) session.queueBridgeEnabled = true if (!wasQueueBridgeEnabled) { @@ -2567,7 +2602,7 @@ class WebSocketServer { if (session.agent && session.agent.readyState === WebSocket.OPEN) { session.agent.close(RELAY_UNAVAILABLE_CLOSE_CODE, RELAY_UNAVAILABLE_CLOSE_REASON) } - await this.cleanupExecSession(sessionId, transaction) + await this._cleanupExecSessionInTransaction(sessionId) return } logger.warn('[RELAY] Failed to enable relay bridge for exec session', { @@ -2578,7 +2613,7 @@ class WebSocketServer { } if (user && agent) { - const activated = await this.sendExecActivationToExecSession(session, sessionId, transaction) + const activated = await this.sendExecActivationToExecSession(session, sessionId) if (!activated) { logger.error('[RELAY] Exec session activation failed; aborting message forwarding setup', { sessionId, @@ -2610,7 +2645,7 @@ class WebSocketServer { const sent = await this.sendMessageToAgent(session.agent, msg, execId, session.microserviceUuid) if (!sent && this.relayTransport.shouldUseRelay(execId)) { logger.error('[RELAY] Exec relay publish failed; closing session', { sessionId: execId }) - await this.cleanupExecSession(execId, transaction) + await this._cleanupExecSessionInTransaction(execId) } return } @@ -2633,7 +2668,7 @@ class WebSocketServer { if (currentSession && currentSession.user && currentSession.user.readyState === WebSocket.OPEN) { try { currentSession.user.close(1000, 'Session closed (timeout)') - await this.cleanupExecSession(execId, transaction) + await this._cleanupExecSessionInTransaction(execId) } catch (error) { logger.error('[RELAY] Failed to 
close exec user socket on CLOSE timeout', { sessionId: execId, @@ -2650,7 +2685,7 @@ class WebSocketServer { if (user && user.readyState === WebSocket.OPEN) { user.close(1000, 'Session closed') } - await this.cleanupExecSession(execId, transaction) + await this._cleanupExecSessionInTransaction(execId) return } @@ -2673,7 +2708,7 @@ class WebSocketServer { const sent = await this.sendMessageToAgent(session.agent, msg, execId, session.microserviceUuid) if (!sent && this.relayTransport.shouldUseRelay(execId)) { logger.error('[RELAY] Exec relay publish failed; closing session', { sessionId: execId }) - await this.cleanupExecSession(execId, transaction) + await this._cleanupExecSessionInTransaction(execId) } } catch (error) { logger.error('[RELAY] Failed to process exec user message:' + JSON.stringify({ @@ -2709,7 +2744,7 @@ class WebSocketServer { } else if (session.user && session.user.readyState === WebSocket.OPEN) { session.user.close(1000, 'Agent closed connection') } - await this.cleanupExecSession(execId, transaction) + await this._cleanupExecSessionInTransaction(execId) return } @@ -2722,7 +2757,7 @@ class WebSocketServer { sessionId: execId, error: error.message }) - await this.cleanupExecSession(execId, transaction) + await this._cleanupExecSessionInTransaction(execId) } } else if (session.user && session.user.readyState === WebSocket.OPEN) { if (msg.type === MESSAGE_TYPES.STDOUT || msg.type === MESSAGE_TYPES.STDERR) { From 19577728f9bf970e567b86d5237423dfd84472d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:40:49 +0300 Subject: [PATCH 09/32] Document unified database transaction model and update release notes. Add operator runbook for SQLite queue lanes, ReconcileOutbox, OTEL metrics, and architecture deployment profiles. 
--- CHANGELOG.md | 22 +++ docs/architecture.md | 62 +++++- docs/operations/database-transactions.md | 233 +++++++++++++++++++++++ test/vault/openbao.md | 3 + 4 files changed, 315 insertions(+), 5 deletions(-) create mode 100644 docs/operations/database-transactions.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bb86a1a..09a1b399 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,9 +111,23 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - **WebSocket exec & log session hardening** — quotas (**3 exec** / 3 log WS per resource), per-session exec lifecycle, 60s/120s pending timeouts, 8h exec max, 30s graceful drain, OTEL metrics, HA AMQP fail-fast, integration tests, swagger WS protocol docs, operator guide (`docs/operations/ws-sessions.md`). - **Multi exec sessions** — `GET /api/v3/agent/exec/sessions`; agent exec WS `…/agent/exec/microservice/:uuid/:sessionId`; user ACTIVATION with `sessionId`; `MicroserviceExecSessions` table; `execMaxConcurrentPerResource` config (default **3**). - **WebSocket relay production** — unified **`WsRelayTransport`** abstraction; cross-replica exec/log relay backend selected at startup by **`nats.enabled`** (`NATS_ENABLED`): **AMQP** router pool (8 connections per replica, overflow recovery, sendable gating) when `false`, **NATS Core** pub/sub on platform hub (`controller-relay` account) when `true`. Fail-fast activation on both transports; log backpressure drops `LOG_LINE` under pressure. Config: `server.webSocket.relay.amqp.*`, `server.webSocket.relay.nats.*`. No new relay env var; HA swagger/docs updated per R112. 
+- **Transaction safety** — unified **`runInTransaction()`** write path for API, jobs, and WebSocket cleanup; **`fakeTransaction`** and **`bypassQueue`** removed; **`ReconcileOutbox`** transactional outbox with background drainer; SQLite priority write queue (`interactive` > `background`); mysql/postgres reconcile task claims use **`FOR UPDATE SKIP LOCKED`**; OTEL DB metrics and ops runbook (`docs/operations/database-transactions.md`). **Breaking: internal only** — no agent wire or public REST shape changes. +- **pre-close transaction audit** — fixes SQLite hangs from nested `generateTransaction` (`certificate-service` → `SecretService` tx propagation), JTI cleanup job queue bypass, OAuth interaction OIDC reads outside tx, external-mode user IdP HTTP outside tx, service platform LoadBalancer watch outside long tx; extended grep gates and unit tests. Threads optional `transaction` through `cert.js` `loadCA` / `getCAFromK8sSecret` / `getCAFromInput` so fog platform reconcile no longer deadlocks on SQLite when signing site-server certs after router-site-ca. NATS hub ConfigMap cluster routes, StatefulSet rollout, and JWT bundle K8s patches moved outside DB transaction bodies in `nats-service.js` (phased reconcile + `afterCommit` deferral when called from `reconcileFog`). HashiCorp Vault HTTP for secret/configmap/registry create/update/delete deferred via `transaction.afterCommit` (`vault-transaction-helper.js`); DB rows use internal encryption during tx, vault store/delete after commit. Splits `FogPlatformService.reconcileFog` into phased background transactions (`prepare` → `certPrep` → NATS self-tx → `platform` → `finalize`) mirroring service-platform reconcile — no single tx spans cert generation, NATS, and router reconcile end-to-end.
AMQP router cert provisioning in one transaction; agent CA endpoint without pointless DB tx; removed unused services-service TCP bridge K8s-in-tx helpers (operator CRUD uses enqueue + service-platform reconcile only); OIDC provider adapter routed through write queue. +- **unified ALS transaction context** — `runWithTransactionContext` registers existing Sequelize transactions in AsyncLocalStorage; `generateTransaction` uses it for explicit-tx and ALS-inject paths so nested `runInTransaction()` reuses the parent writer on SQLite instead of enqueueing a second transaction. +- **NATS phased reconcile** — `ensureNatsForFog` splits into `nats.ensure.certPrep` (certs + JetStream key), `nats.ensure.authPrep` (JWT bundle + sys-user creds), and `nats.ensure.topology` (instance, mounts, microservice) short background transactions; K8s hub routes remain post-tx. `reconcileFogNats` calls `ensureNatsForFogPhased` / `cleanupNatsForFogPhased` instead of one monolithic `fogPlatform.natsEnsure` tx. +- **enforcement grep gates** — extended `transaction-grep-gates.test.js` for managers-never-enqueue, cert utils branch, K8s-outside-tx (nats + service-platform), vault afterCommit, volume-mount association tx, phased fog platform labels, OIDC adapter queue routing, JTI cleanup via runner. +- **first-fog integration + close docs** — `test/src/integration/first-fog-reconcile-sqlite.test.js` (gate: `RUN_INTEGRATION=1`); ops/architecture docs for R126–R135; plan close checklist updated for integration + load probe. +- **unwrap internal `_`** — `iofog-service.js` internal helpers (`_handleRouterCertificates`, `_deleteFogRouter`, router/TCP/HAL/Bluetooth/NATS helpers) export plain functions; callers in `fog-platform-service` and sweep jobs pass explicit `transaction` from phased orchestrators. Public API entrypoints remain wrapped with `generateTransaction`. 
### Fixed +- **Agent fog-token auth hang (SQLite)** — `checkFogToken` updated `lastActive` via `FogManager.updateLastActive` without passing the open Sequelize transaction on a single-connection pool (`pool.max: 1`), deadlocking the write queue after provision when Edgelet first called JWT-authenticated routes (`PATCH /agent/config`, `GET /agent/registries`, etc.). +- **WebSocket audit event logging (SQLite)** — `persistAuditEvent` (`PRIORITY_BACKGROUND`) reused a committed parent transaction from AsyncLocalStorage when `createWsConnectEvent` ran in `setImmediate` after the log-session handler committed, causing `commit has been called on this transaction` errors. Background `runInTransaction` on SQLite now always enqueues a fresh transaction. +- **WebSocket log/exec session deadlock (SQLite)** — log and exec handlers awaited NATS relay setup inside the open interactive transaction; relay hub lookup enqueues a background transaction on the single SQLite connection and deadlocked. Relay setup now runs in `setImmediate` after DB work commits; relay cleanup callbacks open fresh transactions instead of capturing the handler transaction. +- **Volume mount manager transaction propagation** — `VolumeMountingManager.findOne` / `findAll` passed `transaction` as a second Sequelize argument instead of inside the options object, so NATS fog reconcile could create a volume mount in an open transaction then fail to link it (`nats-server-conf-* not found`). Reads now honor the parent transaction like `BaseManager`. +- **Volume mount service transaction propagation** — `VolumeMountService.linkVolumeMountEndpoint` / `unlinkVolumeMountEndpoint` passed `transaction` as a second Sequelize argument to `getFogs` / `addVolumeMount` / `removeVolumeMount` instead of inside the options object, causing NATS fog reconcile to hang when linking volume mounts after auth bootstrap. 
+- **Fog platform reconcile stale errors** — `reconcileFogPrepare` clears `lastError` when entering `Progressing` so prior `SQLITE_BUSY` does not mask current reconcile state. +- **Fog platform NATS provisioning hang (SQLite)** — `reconcileFogNats` calls `ensureNatsForFogDb` directly in a background transaction (no `ensureNatsForFog` re-wrap). `generateTransaction` inlines the active AsyncLocalStorage transaction instead of enqueueing nested `runInTransaction`; duck-typed Sequelize transaction detection; `NatsConnectionManager.findAllWithNats` and `VolumeMappingManager.findAll` pass `transaction` inside Sequelize options. Fixes deadlock after router cert prep when provisioning hub NATS on first fog. - NATS relay and AMQP router connection resolvers — **Remote CP** uses Edgelet bridge DNS then DB host only (no `*.svc.cluster.local`); **Kubernetes CP** uses `nats-server.{namespace}.svc.cluster.local` / `router.{namespace}.svc.cluster.local` with DB host fallback; connect failures log and throw aggregate errors for all attempts; relay log messages are transport-aware. - NATS relay **`controller-relay` creds loading** — read Opaque secret values as plain UTF-8 `.creds` text (matches `nats-service.js` and DB storage); fixes **`unable to parse credentials`** on hub connect when `NATS_ENABLED=true`. - NATS platform relay identity renamed to account/user **`controller`** with rules **`controller-account`** / **`controller-user`**; **`GET /nats/accounts/controller/users/controller/creds`** supported for operator cred export. @@ -124,6 +138,14 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - Central local CAs (`default-router-local-ca`, `default-nats-local-ca`) are ensured on first agent provision (or via operator direct import before first agent), not at Controller boot — allows custom local CAs before agent deployment. 
- Fog teardown drops obsolete per-fog **`nats-local-ca-*`** and **`router-local-ca-*`** secret names from cleanup lists. - OIDC discovery with **`AUTH_INSECURE_ALLOW_HTTP`** uses the supported `openid-client` insecure-request hook for local **`http://`** issuers. +- **Postgres OAuth/session expiry (TIMESTAMPTZ)** — greenfield postgres migration uses **`TIMESTAMPTZ`** for all temporal columns; Sequelize **`timezone: '+00:00'`** on postgres/mysql providers so auth interaction, BFF session, and certificate/heartbeat date comparisons are UTC-correct regardless of host **`TZ`**. OIDC adapter upsert uses **`conflictFields: ['model', 'record_id']`** for postgres **`ON CONFLICT`**. +- **Cross-DB TEXT column defaults** — removed **`DEFAULT`** from **`TEXT`** columns (`Fogs.warning_message`, `RbacRoles.kind`, `RbacRoleBindings.kind`) in sqlite/mysql/postgres greenfield migrations; Sequelize model **`defaultValue`** applies on insert (fixes MySQL **`ER_BLOB_CANT_HAVE_DEFAULT`** in strict mode). +- **Postgres reconcile outbox enqueue** — `ReconcileOutboxManager.enqueue` uses find-first dedup (postgres aborts transactions on duplicate insert); processed rows with the same idempotency key are re-opened for drain; insert races use a savepoint on postgres. +- **Fog delete reconcile** — platform worker runs delete when status phase is **`Deleting`** even if task reason is still spec/manual-retry; **`reconcileFog` skip → delete** fallback; delete enqueue preempts **`in_progress`** tasks; delete failures keep phase **`Deleting`**; shorter delete-task staleness reclaim (default 60s). +- **Fog delete NATS cleanup (postgres)** — `cleanupNatsForFog` reuses the caller transaction when provided (fixes postgres hang/deadlock from nested tx); NATS cleanup runs before microservice deletes in `_processDeleteCommand`. +- **MySQL `MicroserviceHealthChecks.interval`** — quote reserved column name as **`` `interval` ``** in mysql migration (fixes **`ER_PARSE_ERROR`** on greenfield install). 
+- **MySQL `MicroserviceExecSessions.session_id`** — use **`VARCHAR(255) UNIQUE`** instead of **`TEXT UNIQUE`** (fixes ignored **`ER_BLOB_KEY_WITHOUT_LENGTH`** and subsequent **`ER_NO_SUCH_TABLE`** on index creation); model aligned to **`STRING(255)`**. +- **MySQL RBAC TEXT unique keys** — remove inline **`TEXT UNIQUE`** on **`RbacRoles.name`** / **`RbacRoleBindings.name`** (keep **`UNIQUE KEY … (name(255))`**); prefix **`RbacServiceAccounts`** composite unique index with **`name(255)`** (fixes greenfield **`ER_FK_CANNOT_OPEN_PARENT`** / blob-key errors). - Embedded OAuth BFF builds the in-process issuer client from local metadata and trusts listener TLS material (**`TLS_PATH_*`** / **`TLS_BASE64_*`**) for token exchange — fixes **`fetch failed`** on **`GET /api/v3/user/oauth/authorize`** with self-signed HTTPS certs without **`NODE_EXTRA_CA_CERTS`**. - Provisioning key and **`GET /api/v3/agent/cert`** derive **`caCert`** from listener TLS material (**`TLS_PATH_*`** / **`TLS_BASE64_*`**) via shared **`tls-config`** — always base64-encoded for Edgelet trust store; fixes empty **`caCert`** when legacy **`SSL_CERT`** / **`INTERMEDIATE_CERT`** were unset. - Config keys **`auth.bootstrap.adminUsername`** / **`adminPassword`** renamed to **`auth.bootstrap.username`** / **`password`** (**`OIDC_BOOTSTRAP_ADMIN_*`** env vars unchanged). diff --git a/docs/architecture.md b/docs/architecture.md index e19adfd3..7e9ce240 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -369,7 +369,52 @@ For the full bilateral contract (including ControlPlane env vars and verificatio | Topic | v3.8 behavior | |-------|---------------| -| **Database** | Greenfield v3.8.0 schema — **new install only** (no v3.7 migrator). Supports **sqlite** (single-controller production), **mysql**, and **postgres** (multi-replica / HA). | +| **Database** | Greenfield v3.8.0 schema — **new install only** (no v3.7 migrator). 
Supports **sqlite** (single-controller production), **mysql**, and **postgres** (multi-replica / HA). All mutating paths use **`runInTransaction()`** (Plan 19, R114–R125). Plan **19-I** stabilization (R126–R135): unified ALS transaction context, phased NATS reconcile, grep gates, first-fog integration SLO. | + +### Database profiles (Plan 19 / 19-I) + +| Profile | Database | Controller replicas | Typical fleet size | Notes | +|---------|----------|---------------------|-------------------|-------| +| **Edge / PoT** | sqlite | 1 | ≤ **50** fogs (default warning threshold) | Single write queue; embedded OIDC | +| **Small production** | sqlite | 1 | 50–100 fogs | Supported within single-writer physics; soft warning logged above threshold | +| **Enterprise / HA** | mysql or postgres | 1+ | **100+** fogs recommended | Default for large fleets; `FOR UPDATE SKIP LOCKED` task claims; shared OIDC session store | + +**Enterprise default:** mysql/postgres for fleets above **100** fogs or any multi-replica deployment. sqlite remains supported for single-node edge deployments within Plan 19 SLOs (200 fogs acceptance profile). + +```mermaid +flowchart LR + subgraph callers [Mutating callers] + API[REST / Agent API] + WS[WS session DB ops] + JOBS[Background jobs] + end + + subgraph runner [runInTransaction] + Q{provider?} + SQ[SQLite priority queue] + POOL[mysql/postgres pool] + TX[Real Sequelize transaction] + end + + subgraph outbox [ReconcileOutbox] + INS[Same-commit insert] + DRAIN[Outbox drainer] + end + + API --> runner + WS --> runner + JOBS --> runner + Q -->|sqlite| SQ --> TX + Q -->|mysql/pg| POOL --> TX + TX --> INS --> DRAIN +``` + +| Priority lane | Callers | +|---------------|---------| +| **interactive** | Agent routes, user RBAC API, WS session DB ops, OIDC/auth | +| **background** | Reconcile workers, outbox drainer, platform sweep, cleanup timers | + +Full operator runbook: [operations/database-transactions.md](operations/database-transactions.md). 
### SQLite single-node production @@ -377,10 +422,16 @@ Small deployments with **one Controller process** may use SQLite as the producti | Topic | Behavior | |-------|----------| -| **When to use** | Single Controller, no DB HA requirement, edge/small-cluster PoT | -| **Concurrency** | WAL journal mode + `busy_timeout` pragmas on connect; connection pool size 1 | -| **Background jobs** | Reconcile-heavy jobs start after a configurable delay (`settings.jobStartupDelaySeconds`, default 3s) and stagger by 500ms to avoid restart lock bursts | -| **Task claims** | Fog/service/NATS reconcile task claims retry on `SQLITE_BUSY` (same retry budget as `TransactionDecorator`) | +| **When to use** | Single Controller, no DB HA requirement, edge/small-cluster PoT (≤ recommended fog count) | +| **Write path** | All mutations via `runInTransaction()` — **real** ACID transactions (no `fakeTransaction`); nested reuse via **`runWithTransactionContext`** ALS (R126–R128) | +| **Concurrency** | Global **priority write queue** (interactive before background); pool `max: 1`; WAL + `busy_timeout` pragmas | +| **First-fog SLO (R133)** | sqlite integration gate: first fog reconcile + concurrent operator login/list **< 2s**; `RUN_INTEGRATION=1 npm run test:integration:first-fog` | +| **Load close gate (R135)** | `node test/load/transaction-safety-load.js --fogs=50 --soak-minutes=5` — agent p99 < 200ms, operator p99 < 1s | +| **Busy retry** | Exponential backoff + jitter on `SQLITE_BUSY` inside queue task (configurable max attempts) | +| **Reconcile enqueue** | **`ReconcileOutbox`** — mutation + outbox row in same commit; drainer creates reconcile tasks | +| **Background jobs** | `priority: 'background'`; startup stagger (`settings.jobStartupDelaySeconds`, default 3s) + 500ms between jobs | +| **Task claims** | Same runner; busy retry on sqlite; mysql/postgres use `FOR UPDATE SKIP LOCKED` | +| **Load SLO** | 200 fogs / 40s poll / 10 operators / 30 min soak: agent poll p99 **< 200ms**; 
operator REST p99 **< 1s** | | **Persistence** | Mount a persistent volume for `controller_db.sqlite` and WAL sidecar files (`-wal`, `-shm`) | | **Backup** | Use SQLite backup API or copy DB + WAL files during a quiet window | | **HA path** | mysql/postgres + multiple Controller replicas — see [oidc-configuration.md](oidc-configuration.md) | @@ -415,4 +466,5 @@ Agent routes and WebSocket exec/logs for agents are **outside** OIDC — see [rb | [pki.md](pki.md) | Central CAs, cert renewal, NATS operator rotation | | [oidc-configuration.md](oidc-configuration.md) | Embedded/external auth modes and env vars | | [external-oidc-client-setup.md](external-oidc-client-setup.md) | External IdP client configuration | +| [operations/database-transactions.md](operations/database-transactions.md) | Transaction runner, OTEL metrics, SQLITE_BUSY runbook | | [CONTRIBUTING](../CONTRIBUTING) | Dual-mirror CI and development | diff --git a/docs/operations/database-transactions.md b/docs/operations/database-transactions.md new file mode 100644 index 00000000..38236e51 --- /dev/null +++ b/docs/operations/database-transactions.md @@ -0,0 +1,233 @@ +# Database transactions — operations guide + +## Overview + +Controller v3.8.0 routes **all mutating database work** through `runInTransaction()` in `src/helpers/transaction-runner.js`. Every commit uses a **real** Sequelize transaction — the legacy `fakeTransaction` workaround is removed. + +| Provider | Concurrency model | +|----------|-------------------| +| **sqlite** | Single connection (`pool.max: 1`); global **priority write queue** (interactive before background); WAL + `busy_timeout` | +| **mysql / postgres** | Connection pool (default max **10**); task claims use `FOR UPDATE SKIP LOCKED` | + +Reconcile work is scheduled via the **`ReconcileOutbox`** table — mutations and outbox inserts commit atomically; a background drainer creates reconcile task rows. 
+ +--- + +## Architecture + +```mermaid +flowchart LR + subgraph callers [Mutating callers] + API[REST / agent API] + WS[WS session DB ops] + JOBS[Background jobs] + end + + subgraph runner [transaction-runner.js] + Q{provider?} + SQ[SQLite priority queue] + POOL[mysql/postgres pool tx] + RETRY[withDbBusyRetry] + end + + callers --> runner + Q -->|sqlite| SQ --> RETRY + Q -->|mysql/pg| POOL --> RETRY +``` + +**Priority lanes (sqlite only):** + +| Priority | Typical callers | +|----------|-----------------| +| **interactive** | Agent routes, RBAC API, WS session DB ops, OIDC/auth | +| **background** | Reconcile workers, outbox drainer, platform sweep, heartbeat, cleanup jobs | + +Interactive tasks are dequeued before background tasks. mysql/postgres skip the global queue and use the connection pool directly. + +### Unified transaction context + +Plan 19 routed all writes through `runInTransaction()`, but **`generateTransaction` handlers that received an explicit Sequelize `transaction` argument did not register that transaction in AsyncLocalStorage (ALS)**. Any nested callee that called `runInTransaction()` without the parent tx could enqueue a **second** sqlite writer → deadlock (`SQLITE_BUSY`, API hang ~30–60s). Plan 19-I closes that gap with the mechanisms below.
+ +| Mechanism | Role | +|-----------|------| +| **`runWithTransactionContext(transaction, priority, fn)`** | Runs `fn` with ALS set to the **existing** transaction — required whenever code already holds a Sequelize tx | +| **`generateTransaction`** | Uses `runWithTransactionContext` for explicit-tx args, ALS parent injection, and new top-level txs via `runInTransaction` | +| **`runInTransaction()` (no tx arg)** | Reuses parent tx from ALS when nested under any ancestor that used `runWithTransactionContext` (**interactive** priority only on SQLite) | +| **`runInTransaction({ priority: background })`** | Always enqueues a **fresh** SQLite transaction — never reuses ALS parent (avoids stale tx after handler commit + deferred audit) | + +**Rule for developers:** If your function runs inside an open transaction (API handler last arg, worker phase callback, etc.), nested work must either pass `transaction` through wrapped exports **or** rely on ALS via `runWithTransactionContext`. Do not call bare `runInTransaction()` from deep helpers expecting implicit join — the runner only reuses via ALS or an explicit tx parameter on `runInTransaction` itself. + +Phased reconcile and grep gates complement ALS: short txs for NATS/platform phases and CI checks for K8s/vault/I/O outside tx bodies. 
+ +--- + +## When to use which database + +| Profile | Recommended DB | Notes | +|---------|----------------|-------| +| Single Controller, ≤ **50** fogs (default threshold) | **sqlite** | Edge / PoT; mount persistent volume for `.sqlite` + `-wal` / `-shm` | +| Single Controller, **50–100+** fogs | **mysql** or **postgres** | Controller logs soft warning above threshold on sqlite | +| Multi-replica HA | **mysql** or **postgres** | Embedded OIDC requires shared DB; sqlite **not** supported for multi-replica | +| Enterprise production | **mysql** or **postgres** | Documented default for large fleets and multi-user load | + +| Fleet size | sqlite | mysql/postgres | +|------------|--------|----------------| +| ≤ 50 fogs | Recommended | Optional | +| 51–100 fogs | Supported with warning | Recommended | +| 100+ fogs | Possible but not recommended | **Required** for enterprise SLOs | +| Multi-replica | Not supported | **Required** | + +--- + +## Configuration + +| Setting | Default | Env override | +|---------|---------|--------------| +| `settings.sqliteEnterpriseFogWarningThreshold` | 50 | `SQLITE_ENTERPRISE_FOG_WARNING_THRESHOLD` | +| `settings.dbWriteQueueMaxDepth` | 256 | `DB_WRITE_QUEUE_MAX_DEPTH` | +| `settings.dbBusyRetryMaxAttempts` | 8 | `DB_BUSY_RETRY_MAX_ATTEMPTS` | +| `settings.dbBusyRetryBaseMs` | 25 | `DB_BUSY_RETRY_BASE_MS` | +| `settings.reconcileOutboxDrainerIntervalSeconds` | 1 | `RECONCILE_OUTBOX_DRAINER_INTERVAL_SECONDS` | +| `database.mysql.pool.max` | 10 | *(yaml only)* | +| `database.postgres.pool.max` | 10 | *(yaml only)* | + +See `src/config/config.yaml` and [architecture.md](../architecture.md) for pool and pragma settings. + +--- + +## SQLite write queue backpressure + +When total queued work (`interactive` + `background` lanes) exceeds `settings.dbWriteQueueMaxDepth` (default **256**), Controller logs an **error** once per overflow episode. **Interactive requests are not rejected** — the queue continues to drain in priority order. 
Operators should investigate background job pressure or migrate to mysql/postgres. + +--- + +## OTEL metrics + +Instruments are registered at startup in `src/helpers/db-metrics.js` (requires `ENABLE_TELEMETRY=true`). + +| Metric | Type | Labels | Suggested alert | +|--------|------|--------|-----------------| +| `db.transaction.duration` | histogram | `label`, `priority`, `provider` | p99 spike correlated with load | +| `db.write_queue.depth` | gauge | `priority` | **> 100 for 5 min** → investigate background pressure | +| `db.write_queue.wait_ms` | histogram | `priority` | Sustained high wait → scale DB or reduce background load | +| `db.busy_retries` | counter | `label` | **> 10/min** → lock contention | +| `db.connection.invalidated` | counter | `provider` | **Any increment** → investigate pool / connection errors | +| `db.sqlite.fog_count_warning` | counter | — | Fleet exceeded sqlite recommended size | + +### Alert thresholds (summary) + +1. **`db.write_queue.depth` > 100 for 5 minutes** — background jobs or agent poll load saturating the sqlite serializer; check reconcile worker intervals and fog count. +2. **`db.busy_retries` rate > 10/minute** — sqlite lock contention; verify WAL mode and consider mysql/postgres. +3. **`db.connection.invalidated` any increase** — connection pool error or mid-transaction kill; check DB connectivity and replica health. +4. **`db.sqlite.fog_count_warning` any increase** — migrate to mysql/postgres for enterprise scale. + +--- + +## Troubleshooting + +### SQLITE_BUSY / "cannot rollback - no transaction is active" + +Historically caused by raw transactions (jobs, WS cleanup) competing with the legacy `fakeTransaction` API path, which v3.8.0 removes. + +With the unified runner this should be rare — busy retry with exponential backoff and a single priority queue serialize writes. If it persists: + +1. Check `db.write_queue.depth` and `db.busy_retries` +2. Confirm WAL mode: `PRAGMA journal_mode` → `wal` +3. Verify no long-running transaction (K8s and external I/O must run outside open transactions) +4. 
**Migrate to mysql/postgres** if fog count > threshold + +### Reconcile tasks not running + +1. Check `ReconcileOutbox` for rows with `processedAt IS NULL` +2. Verify outbox drainer job is running (logs on startup) +3. Check drainer `lastError` column + +### HA double reconcile (mysql/postgres) + +Should not occur with SKIP LOCKED claims. If observed, capture concurrent worker logs and verify claim tests pass. + +### API hangs ~60s then SQLITE_BUSY (SQLite) + +Typical on **`pool.max: 1`** when two writers compete for the single connection. + +| Symptom | Likely cause | Fix | +|---------|--------------|-----| +| Hang during **first fog create** / platform reconcile / **provisioning-key** (post-19-I-A) | Nested `runInTransaction()` without ALS parent (legacy explicit-tx path) or monolithic reconcile tx | Verify deployed: ALS via `runWithTransactionContext`; NATS phased txs; run `RUN_INTEGRATION=1 npm run test:integration:first-fog` | +| Hang during **first fog create** / platform reconcile / **provisioning-key** (pre-19-I) | Nested `generateTransaction`: wrapped callee called **without** `transaction` last arg (e.g. 
`SecretService.getSecretEndpoint(name)` inside `certificate-service.js`) | Pass parent `transaction` through all wrapped service calls; grep `certificate-service.js` for SecretService calls | +| **`SQLITE_BUSY`** on idle timer (JTI cleanup) | Job/manager raw Sequelize write bypassing `runInTransaction` | Route through `runInTransaction(..., { priority: 'background' })` | +| Hang on **OAuth interaction** login/MFA | OIDC adapter read/write while outer API tx holds connection | Mirror `auth-interaction-service.complete()`: adapter I/O **before** short DB tx | +| External IdP **login/refresh** slow under load | HTTP inside `generateTransaction` wrapper | External-mode HTTP outside tx; embedded DB paths unchanged | +| Hang on fog **NATS mode change** / fog delete (K8s CP) | Monolithic `ensureNatsForFogDb` held sqlite writer for seconds (certs + auth + mounts in one tx) | Split into `nats.ensure.certPrep` + `nats.ensure.authPrep` + `nats.ensure.topology` short txs; K8s via post-tx external helpers | +| Hang on **nats-reconcile-worker** | JWT bundle K8s patch inside reconcile tx | R-06 phased split — DB reconcile tx then external ConfigMap patch | +| Hang on **secret/configmap/registry** CRUD (vault enabled) | `SecretHelper` vault HTTP inside open Sequelize tx | Internal encryption in tx; vault store/delete via `transaction.afterCommit` | + +**Rule:** When caller already has `transaction`, every wrapped export must receive it as the **last argument** (`generateTransaction` reuses parent tx when `lastArg instanceof Transaction`). + +### Vault I/O outside transactions + +When HashiCorp Vault is enabled, secret/configmap/registry mutations must not perform vault HTTP while a Sequelize transaction holds the sqlite connection. 
+ +| Operation | In transaction | After commit | +|-----------|----------------|--------------| +| Create / update | Store **internal** encryption (or plaintext vault ref if already promoted) | `storeInVaultAndGetReference` + short DB patch to vault ref | +| Delete | DB row + FK cleanup | `SecretHelper.deleteSecret` | + +Helpers live in `src/helpers/vault-transaction-helper.js` (`scheduleVaultDeleteAfterCommit`, `scheduleVaultPromoteAfterCommit`). Model `beforeSave` hooks defer vault when `options.transaction` is set. Failures in deferred vault work are logged; committed DB state is not rolled back (orphan vault secrets are preferable to orphan DB rows). + +Without vault (`vaultManager.isEnabled()` false), behavior is unchanged — internal encryption only. + +--- + +## Enforcement + +Mechanical **grep gates** in `test/src/helpers/transaction-grep-gates.test.js` fail CI when transaction regressions reappear. Run: + +```bash +nvm use 24 +npm test -- --grep "grep gates" +``` + +| Gate | What it checks | +|------|----------------| +| **fakeTransaction** | Zero hits anywhere under `src/` | +| **bypassQueue** | Zero hits anywhere under `src/` | +| **sequelize.transaction** | Allowed only in `transaction-runner.js` | +| **Managers never enqueue** | Zero `runInTransaction` in `src/data/managers/` — managers accept `transaction` from callers only | +| **Certificate tx propagation** | `certificate-service.js` passes `transaction` to every `SecretService.getSecretEndpoint` call | +| **Cert utils branch** | `src/utils/cert.js` — `loadCA` / `getCAFromK8sSecret` branch on `transaction ?` before nested `runInTransaction`; `getCAFromInput` forwards `transaction` | +| **K8s outside tx (nats-service)** | DB helpers (`ensureNatsForFog*Db`, `cleanupNatsForFogDb`, `_reconcileResolverArtifactsOnceDb`) contain no `K8sClient` — K8s via external helpers after commit | +| **K8s outside tx (service-platform-service)** | Labeled `servicePlatform.*` tx blocks contain no `K8sClient` — hub router 
/ LB sync via `applyK8sHubRouterPlan`, `reconcileK8sServiceExternal`, etc. | +| **Vault outside tx** | Secret/configmap/registry delete paths use `scheduleVaultDeleteAfterCommit` / `scheduleVaultPromoteAfterCommit`, not inline vault HTTP | +| **Volume-mount associations** | Sequelize `{ transaction }` inside association options, not as a trailing positional arg | +| **Fog platform phased reconcile** | Separate `fogPlatform.*` labels; no monolithic `fogPlatform.natsEnsure` | +| **OIDC adapter** | All adapter reads/writes through `runInTransaction` with `oidc.adapter.*` labels | +| **JTI cleanup** | `fog-token-cleanup-job.js` routes through `runInTransaction`, not bare manager call | + +When a gate fails, fix the **minimal** violation (pass parent `transaction`, move I/O outside the tx body, or split phases) — do not disable the gate. + +### OIDC provider adapter (R-16) + +`src/data/adapters/oidc-provider-adapter.js` routes all `AuthOidcProviderState` reads/writes through `runInTransaction` with **interactive** priority (background for expiry purge). OAuth BFF interaction handlers in `auth-interaction-service.js` keep adapter I/O outside short user DB transactions. Do not call adapter methods from inside another open `runInTransaction` body. + +### Certificate / secret propagation checklist + +When calling from within `reconcileFog`, `_handleRouterCertificates`, or any open transaction: + +- `CertificateService.*` → internal `SecretService.*` must pass `transaction` +- `storeCA` / `generateCertificate` in `src/utils/cert.js` must pass `transaction` to `SecretService.createSecretEndpoint` + +--- + +## Backup notes + +- **sqlite:** Back up the database file **and** `-wal` / `-shm` sidecars together, or checkpoint WAL before copy. +- **mysql/postgres:** Use standard provider backup tools; ensure migrations are at v3.8.0 before restore. 
+ +--- + +## Related docs + +- [architecture.md](../architecture.md) — data layer overview +- [oidc-configuration.md](../oidc-configuration.md) — HA session store requires mysql/postgres + +--- diff --git a/test/vault/openbao.md b/test/vault/openbao.md index 6c90e58a..ef2242ff 100644 --- a/test/vault/openbao.md +++ b/test/vault/openbao.md @@ -24,3 +24,6 @@ api_addr = "http://127.0.0.1:8200" + + \ No newline at end of file From a8f8dfbbc731a36c8c074267e8a614716ac6d11a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 21:41:02 +0300 Subject: [PATCH 10/32] Add transaction safety tests, integration gate, and load probe scripts. Cover grep enforcement gates, vault afterCommit, first-fog sqlite SLO, chaos scenarios, and npm scripts for integration and load runs. --- package.json | 2 + scripts/test.js | 3 + test/load/transaction-safety-load.js | 419 ++++++++++++++++++ test/src/data/fog-platform-managers.test.js | 47 +- test/src/helpers/transaction-chaos.test.js | 209 +++++++++ .../helpers/transaction-grep-gates.test.js | 249 +++++++++++ .../first-fog-reconcile-sqlite.test.js | 100 +++++ .../platform-reconcile-worker-job.test.js | 111 +++-- .../rvaluesVarSubstitionMiddleware.test.js | 10 +- .../services/auth-bootstrap-service.test.js | 5 +- .../services/auth-interaction-service.test.js | 6 +- .../services/controller-ms-service.test.js | 11 + .../src/services/fog-platform-service.test.js | 80 +++- test/src/services/iofog-service.test.js | 18 +- test/src/services/microservice-port.test.js | 45 +- .../services/microservices-service.test.js | 2 +- test/src/services/nats-auth-service.test.js | 5 +- test/src/services/nats-service.test.js | 177 +++++++- test/src/services/rbac-service.test.js | 56 ++- test/src/services/registry-service.test.js | 14 +- test/src/services/router-service.test.js | 33 ++ .../services/service-bridge-config.test.js | 38 +- .../services/service-platform-service.test.js | 84 ++-- 
.../services/services-connector-host.test.js | 116 ----- test/src/services/services-service.test.js | 157 ++++++- .../services/transaction-safety-19h.test.js | 189 ++++++++ .../services/transaction-safety-vault.test.js | 119 +++++ .../websocket/ws-cross-replica-nats.test.js | 5 +- test/src/websocket/ws-cross-replica.test.js | 5 +- .../ws-exec-activation-failfast.test.js | 4 +- test/src/websocket/ws-lifecycle.test.js | 46 ++ test/support/first-fog-sqlite-harness.js | 147 ++++++ 32 files changed, 2243 insertions(+), 269 deletions(-) create mode 100644 test/load/transaction-safety-load.js create mode 100644 test/src/helpers/transaction-chaos.test.js create mode 100644 test/src/helpers/transaction-grep-gates.test.js create mode 100644 test/src/integration/first-fog-reconcile-sqlite.test.js create mode 100644 test/src/services/transaction-safety-19h.test.js create mode 100644 test/src/services/transaction-safety-vault.test.js create mode 100644 test/support/first-fog-sqlite-harness.js diff --git a/package.json b/package.json index 048c2f59..97e61ead 100644 --- a/package.json +++ b/package.json @@ -57,7 +57,9 @@ "test": "node scripts/run-test.js test", "test:all": "node scripts/run-test.js test-all", "test:k8s-client": "node scripts/run-test.js test test/integration/k8s-client-integration.test.js", + "test:integration:first-fog": "RUN_INTEGRATION=1 node scripts/run-test.js test test/src/integration/first-fog-reconcile-sqlite.test.js", "test:ws-load": "node test/load/ws-pairing-load.js", + "test:load:tx": "node test/load/transaction-safety-load.js --fogs 200 --soak-minutes 5", "precli-tests": "npm run lint", "cli-tests": "node scripts/run-test.js cli-tests", "precoverage": "npm run lint", diff --git a/scripts/test.js b/scripts/test.js index 2b1eb19c..69bb5240 100644 --- a/scripts/test.js +++ b/scripts/test.js @@ -13,6 +13,9 @@ function test (useReporter, extraArgs) { } options.env = setDbEnvVars(options.env) + if (process.env.RUN_INTEGRATION) { + 
options.env.RUN_INTEGRATION = process.env.RUN_INTEGRATION + } const mochaBin = require.resolve('mocha/bin/mocha.js') const mochaReporterOptions = '--reporter mocha-junit-reporter --reporter-options mochaFile=./unit-results.xml' diff --git a/test/load/transaction-safety-load.js b/test/load/transaction-safety-load.js new file mode 100644 index 00000000..095c6e46 --- /dev/null +++ b/test/load/transaction-safety-load.js @@ -0,0 +1,419 @@ +#!/usr/bin/env node +'use strict' + +/** + * Plan 19 transaction-safety load probe (sqlite profile). + * + * Simulates 200 fogs polling config/changes + status, 10 operator API clients, + * background reconcile outbox drainer + task claims, optional WS-style session churn. + * + * Usage: + * nvm use 24 + * node test/load/transaction-safety-load.js + * node test/load/transaction-safety-load.js --fogs 200 --soak-minutes 30 --operators 10 --poll-interval-ms 40000 + * + * Exit 0 when SLO gates pass; exit 1 otherwise. + */ + +const fs = require('fs') +const os = require('os') +const path = require('path') +const Sequelize = require('sequelize') + +function parseArg (name, fallback) { + const eq = process.argv.find((a) => a.startsWith(`--${name}=`)) + if (eq) return eq.slice(eq.indexOf('=') + 1) + const idx = process.argv.indexOf(`--${name}`) + if (idx !== -1 && process.argv[idx + 1]) return process.argv[idx + 1] + return fallback +} + +const FOG_COUNT = parseInt(parseArg('fogs', '200'), 10) +const SOAK_MINUTES = parseFloat(parseArg('soak-minutes', '30')) +const OPERATOR_COUNT = parseInt(parseArg('operators', '10'), 10) +const POLL_INTERVAL_MS = parseInt(parseArg('poll-interval-ms', '40000'), 10) +const BUSY_THRESHOLD = parseInt(parseArg('busy-threshold', '0'), 10) +const INVALIDATED_THRESHOLD = parseInt(parseArg('invalidated-threshold', '0'), 10) + +const AGENT_P99_SLO_MS = 200 +const OPERATOR_P99_SLO_MS = 1000 + +const agentLatencies = [] +const operatorLatencies = [] +const counters = { + busyRetries: 0, + connectionInvalidated: 0 +} + +let stopping
= false +let dbPath +let sequelize +let originalSequelize +let originalNodeEnv + +function percentile (sorted, p) { + if (!sorted.length) return 0 + const idx = Math.ceil((p / 100) * sorted.length) - 1 + return sorted[Math.max(0, idx)] +} + +function recordLatency (bucket, ms) { + bucket.push(ms) +} + +function installMetricCounters () { + const dbMetrics = require('../../src/helpers/db-metrics') + const originalBusy = dbMetrics.recordBusyRetry + const originalInvalidated = dbMetrics.recordConnectionInvalidated + + dbMetrics.recordBusyRetry = (...args) => { + counters.busyRetries += 1 + return originalBusy(...args) + } + dbMetrics.recordConnectionInvalidated = (...args) => { + counters.connectionInvalidated += 1 + return originalInvalidated(...args) + } +} + +async function setupDatabase () { + originalNodeEnv = process.env.NODE_ENV + process.env.NODE_ENV = 'load' + delete process.env.DB_PROVIDER + + dbPath = path.join(os.tmpdir(), `controller-tx-load-${Date.now()}-${Math.random()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 10000 } + }) + + const { registerSqlitePragmas, applySqlitePragmas } = require('../../src/helpers/sqlite-pragmas') + registerSqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 10000, + synchronous: 'NORMAL' + }) + await sequelize.authenticate() + await applySqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 10000, + synchronous: 'NORMAL' + }) + + const defineFog = require('../../src/data/models/fog') + const defineChangeTracking = require('../../src/data/models/changetracking') + const defineReconcileOutbox = require('../../src/data/models/reconcileOutbox') + const defineFogPlatformReconcileTask = require('../../src/data/models/fogPlatformReconcileTask') + + const Fog = defineFog(sequelize, Sequelize.DataTypes) + const ChangeTracking = defineChangeTracking(sequelize, Sequelize.DataTypes) + const ReconcileOutbox = 
defineReconcileOutbox(sequelize, Sequelize.DataTypes) + const FogPlatformReconcileTask = defineFogPlatformReconcileTask(sequelize, Sequelize.DataTypes) + + await Fog.sync() + const modelBag = { Fog, ChangeTracking, ReconcileOutbox, FogPlatformReconcileTask } + if (typeof ChangeTracking.associate === 'function') { + ChangeTracking.associate(modelBag) + } + await ChangeTracking.sync() + await ReconcileOutbox.sync() + await FogPlatformReconcileTask.sync() + await sequelize.query(` + CREATE TABLE IF NOT EXISTS ws_session_sim ( + session_id TEXT PRIMARY KEY, + fog_uuid TEXT NOT NULL, + opened_at INTEGER NOT NULL + ) + `) + + const models = require('../../src/data/models') + models.Fog = Fog + models.ChangeTracking = ChangeTracking + models.ReconcileOutbox = ReconcileOutbox + models.FogPlatformReconcileTask = FogPlatformReconcileTask + models.sequelize = sequelize + + const databaseProvider = require('../../src/data/providers/database-factory') + originalSequelize = databaseProvider.sequelize + databaseProvider.sequelize = sequelize + + installMetricCounters() + + return { Fog, ChangeTracking } +} + +async function seedFogs (Fog, ChangeTracking) { + const rows = Array.from({ length: FOG_COUNT }, (_, index) => ({ + uuid: `load-fog-${String(index).padStart(4, '0')}`, + name: `Load Fog ${index}`, + daemonStatus: 'RUNNING', + memoryUsage: 10, + cpuUsage: 5 + })) + + await Fog.bulkCreate(rows) + await ChangeTracking.bulkCreate(rows.map((row) => ({ iofogUuid: row.uuid }))) + return rows.map((row) => row.uuid) +} + +async function agentConfigChanges (ChangeTrackingManager, fogUuid, runInTransaction, PRIORITY_INTERACTIVE) { + const start = Date.now() + await runInTransaction(async (transaction) => { + await ChangeTrackingManager.findAll({ iofogUuid: fogUuid }, transaction) + }, { priority: PRIORITY_INTERACTIVE, label: 'agent.configChanges' }) + recordLatency(agentLatencies, Date.now() - start) +} + +async function agentStatusPut (FogManager, fogUuid, runInTransaction, 
PRIORITY_INTERACTIVE) { + const start = Date.now() + await runInTransaction(async (transaction) => { + await FogManager.update({ uuid: fogUuid }, { + memoryUsage: Math.random() * 100, + cpuUsage: Math.random() * 100, + diskUsage: Math.random() * 100, + lastStatusTime: Date.now() + }, transaction) + }, { priority: PRIORITY_INTERACTIVE, label: 'agent.status' }) + recordLatency(agentLatencies, Date.now() - start) +} + +async function operatorRead (FogManager, runInTransaction, PRIORITY_INTERACTIVE) { + const start = Date.now() + await runInTransaction(async (transaction) => { + await FogManager.findAll({}, transaction) + }, { priority: PRIORITY_INTERACTIVE, label: 'operator.listFogs' }) + recordLatency(operatorLatencies, Date.now() - start) +} + +async function operatorMutate (ReconcileOutboxManager, fogUuid, generation, runInTransaction, PRIORITY_INTERACTIVE) { + const start = Date.now() + await runInTransaction(async (transaction) => { + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid, + reason: 'spec-changed', + specGeneration: generation + }, transaction) + }, { priority: PRIORITY_INTERACTIVE, label: 'operator.enqueueReconcile' }) + recordLatency(operatorLatencies, Date.now() - start) +} + +async function wsSessionChurn (fogUuid, runInTransaction, PRIORITY_INTERACTIVE) { + const sessionId = `${fogUuid}-${Date.now()}-${Math.random()}` + await runInTransaction(async (transaction) => { + await sequelize.query( + 'INSERT INTO ws_session_sim (session_id, fog_uuid, opened_at) VALUES (:sessionId, :fogUuid, :openedAt)', + { + replacements: { sessionId, fogUuid, openedAt: Date.now() }, + transaction + } + ) + await sequelize.query( + 'DELETE FROM ws_session_sim WHERE session_id = :sessionId', + { replacements: { sessionId }, transaction } + ) + }, { priority: PRIORITY_INTERACTIVE, label: 'ws.sessionChurn' }) +} + +function startAgentSimulators (fogUuids, deps) { + const timers = [] + + fogUuids.forEach((fogUuid, index) => { + const staggerMs = Math.floor((index 
/ fogUuids.length) * POLL_INTERVAL_MS) + const tick = async () => { + if (stopping) return + try { + await agentConfigChanges(deps.ChangeTrackingManager, fogUuid, deps.runInTransaction, deps.PRIORITY_INTERACTIVE) + await agentStatusPut(deps.FogManager, fogUuid, deps.runInTransaction, deps.PRIORITY_INTERACTIVE) + if (Math.random() < 0.02) { + await wsSessionChurn(fogUuid, deps.runInTransaction, deps.PRIORITY_INTERACTIVE) + } + } catch (error) { + console.error(`Agent simulator error (${fogUuid}):`, error.message) + } + if (!stopping) { + timers.push(setTimeout(tick, POLL_INTERVAL_MS)) + } + } + timers.push(setTimeout(tick, staggerMs)) + }) + + return () => timers.forEach(clearTimeout) +} + +function startOperatorSimulators (fogUuids, deps) { + const timers = [] + let generation = 1 + + for (let operator = 0; operator < OPERATOR_COUNT; operator++) { + const tick = async () => { + if (stopping) return + try { + if (Math.random() < 0.12) { + const fogUuid = fogUuids[Math.floor(Math.random() * fogUuids.length)] + generation += 1 + await operatorMutate(deps.ReconcileOutboxManager, fogUuid, generation, deps.runInTransaction, deps.PRIORITY_INTERACTIVE) + } else { + await operatorRead(deps.FogManager, deps.runInTransaction, deps.PRIORITY_INTERACTIVE) + } + } catch (error) { + console.error(`Operator simulator error (${operator}):`, error.message) + } + if (!stopping) { + timers.push(setTimeout(tick, 250 + Math.floor(Math.random() * 500))) + } + } + timers.push(setTimeout(tick, operator * 100)) + } + + return () => timers.forEach(clearTimeout) +} + +function startBackgroundWorkers (deps) { + const { drainOnce } = require('../../src/jobs/reconcile-outbox-drainer-job') + const drainerTimer = setInterval(async () => { + if (stopping) return + try { + await drainOnce() + } catch (error) { + console.error('Outbox drainer error:', error.message) + } + }, 3000) + + const claimTimer = setInterval(async () => { + if (stopping) return + try { + await deps.runInTransaction(async () => 
{ + await deps.FogPlatformReconcileTaskManager.claimNextFogTask('tx-load-worker', 300) + }, { priority: deps.PRIORITY_BACKGROUND, label: 'reconcile.claim' }) + } catch (error) { + console.error('Reconcile claim error:', error.message) + } + }, 5000) + + return () => { + clearInterval(drainerTimer) + clearInterval(claimTimer) + } +} + +function reportResults () { + agentLatencies.sort((a, b) => a - b) + operatorLatencies.sort((a, b) => a - b) + + const agentP50 = percentile(agentLatencies, 50) + const agentP95 = percentile(agentLatencies, 95) + const agentP99 = percentile(agentLatencies, 99) + const operatorP50 = percentile(operatorLatencies, 50) + const operatorP95 = percentile(operatorLatencies, 95) + const operatorP99 = percentile(operatorLatencies, 99) + + const sloPass = agentLatencies.length > 0 && + operatorLatencies.length > 0 && + agentP99 < AGENT_P99_SLO_MS && + operatorP99 < OPERATOR_P99_SLO_MS && + counters.busyRetries <= BUSY_THRESHOLD && + counters.connectionInvalidated <= INVALIDATED_THRESHOLD + + console.log('') + console.log('Transaction safety load probe — results') + console.log(` fogs: ${FOG_COUNT}`) + console.log(` operators: ${OPERATOR_COUNT}`) + console.log(` soak minutes: ${SOAK_MINUTES}`) + console.log(` poll interval ms: ${POLL_INTERVAL_MS}`) + console.log('') + console.log('Agent latencies (config/changes + status):') + console.log(` samples: ${agentLatencies.length}`) + console.log(` p50: ${agentP50} ms`) + console.log(` p95: ${agentP95} ms`) + console.log(` p99: ${agentP99} ms (SLO < ${AGENT_P99_SLO_MS} ms)`) + console.log('') + console.log('Operator latencies (read-heavy + mutations):') + console.log(` samples: ${operatorLatencies.length}`) + console.log(` p50: ${operatorP50} ms`) + console.log(` p95: ${operatorP95} ms`) + console.log(` p99: ${operatorP99} ms (SLO < ${OPERATOR_P99_SLO_MS} ms)`) + console.log('') + console.log('Contention counters:') + console.log(` busy retries: ${counters.busyRetries} (threshold <= ${BUSY_THRESHOLD})`) 
+ console.log(` connection invalidated: ${counters.connectionInvalidated} (threshold <= ${INVALIDATED_THRESHOLD})`) + console.log('') + console.log(` status: ${sloPass ? 'PASS' : 'FAIL'}`) + + return sloPass +} + +async function cleanup () { + const databaseProvider = require('../../src/data/providers/database-factory') + if (originalSequelize) { + databaseProvider.sequelize = originalSequelize + } + if (sequelize) { + await sequelize.close() + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore */ } + } + if (originalNodeEnv === undefined) { + delete process.env.NODE_ENV + } else { + process.env.NODE_ENV = originalNodeEnv + } +} + +async function main () { + const startedAt = Date.now() + const soakMs = SOAK_MINUTES * 60 * 1000 + + console.log('Transaction safety load probe — starting') + console.log(` profile: sqlite, ${FOG_COUNT} fogs, ${OPERATOR_COUNT} operators, ${SOAK_MINUTES} min soak`) + + const { Fog, ChangeTracking } = await setupDatabase() + const fogUuids = await seedFogs(Fog, ChangeTracking) + + const { + runInTransaction, + PRIORITY_BACKGROUND, + PRIORITY_INTERACTIVE, + _resetQueueForTests + } = require('../../src/helpers/transaction-runner') + + const deps = { + FogManager: require('../../src/data/managers/iofog-manager'), + ChangeTrackingManager: require('../../src/data/managers/change-tracking-manager'), + ReconcileOutboxManager: require('../../src/data/managers/reconcile-outbox-manager'), + FogPlatformReconcileTaskManager: require('../../src/data/managers/fog-platform-reconcile-task-manager'), + runInTransaction, + PRIORITY_BACKGROUND, + PRIORITY_INTERACTIVE + } + + const stopAgents = startAgentSimulators(fogUuids, deps) + const stopOperators = startOperatorSimulators(fogUuids, deps) + const stopBackground = startBackgroundWorkers(deps) + + await new Promise((resolve) => setTimeout(resolve, soakMs)) + + stopping = true + stopAgents() + stopOperators() + stopBackground() + 
_resetQueueForTests() + + const pass = reportResults() + console.log(` runtime: ${((Date.now() - startedAt) / 1000).toFixed(1)} s`) + + await cleanup() + process.exit(pass ? 0 : 1) +} + +main().catch(async (error) => { + console.error(error) + try { + await cleanup() + } catch (_) { /* ignore */ } + process.exit(1) +}) diff --git a/test/src/data/fog-platform-managers.test.js b/test/src/data/fog-platform-managers.test.js index d0a221d2..f7e8e4fd 100644 --- a/test/src/data/fog-platform-managers.test.js +++ b/test/src/data/fog-platform-managers.test.js @@ -108,25 +108,60 @@ describe('Fog platform reconcile task enqueue', () => { expect(task).to.eql(created) }) - it('supersedes pending work with delete reason', async () => { + it('preempts in_progress tasks when delete is enqueued', async () => { const existing = { id: 9, fogUuid: 'fog-3', reason: 'spec-changed', status: 'in_progress' } const entity = { findOne: $sandbox.stub().resolves(existing), update: $sandbox.stub().resolves([1]) } + const preempted = { + id: 9, + fogUuid: 'fog-3', + reason: 'delete', + status: 'pending', + leaderUuid: null, + claimedAt: null + } + + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves(preempted) + + const task = await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: 'fog-3', + reason: 'delete' + }, transaction) + + expect(entity.update).to.have.been.calledOnceWith({ + reason: 'delete', + specGeneration: null, + status: 'pending', + leaderUuid: null, + claimedAt: null, + nextAttemptAt: null, + attempts: 0, + lastError: null + }, sinon.match.has('where', { id: 9 })) + expect(task).to.eql(preempted) + }) + + it('updates pending tasks to delete without resetting claim state', async () => { + const existing = { id: 10, fogUuid: 'fog-4', reason: 'spec-changed', status: 'pending' } + const entity = { + findOne: $sandbox.stub().resolves(existing), + update: 
$sandbox.stub().resolves([1]) + } $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves({ ...existing, reason: 'delete' }) await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ - fogUuid: 'fog-3', + fogUuid: 'fog-4', reason: 'delete' }, transaction) - expect(entity.update).to.have.been.calledWithMatch( - { reason: 'delete' }, - sinon.match.has('where', { id: 9 }) - ) + const updateArg = entity.update.getCall(0).args[0] + expect(updateArg.reason).to.equal('delete') + expect(updateArg.status).to.be.undefined }) }) diff --git a/test/src/helpers/transaction-chaos.test.js b/test/src/helpers/transaction-chaos.test.js new file mode 100644 index 00000000..f31ad377 --- /dev/null +++ b/test/src/helpers/transaction-chaos.test.js @@ -0,0 +1,209 @@ +'use strict' + +const { expect } = require('chai') +const fs = require('fs') +const os = require('os') +const path = require('path') +const Sequelize = require('sequelize') +const sinon = require('sinon') + +const databaseProvider = require('../../../src/data/providers/database-factory') +const defineReconcileOutbox = require('../../../src/data/models/reconcileOutbox') +const defineFogPlatformReconcileTask = require('../../../src/data/models/fogPlatformReconcileTask') +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') +const { runInTransaction } = require('../../../src/helpers/transaction-runner') +const { drainOnce } = require('../../../src/jobs/reconcile-outbox-drainer-job') + +describe('transaction chaos', () => { + const sandbox = sinon.createSandbox() + + describe('connection kill mid-transaction', () => { + let sequelize + let dbPath + + beforeEach(async () => { + dbPath = path.join(os.tmpdir(), `controller-chaos-kill-${Date.now()}-${Math.random()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 
10000 } + }) + await sequelize.authenticate() + await sequelize.query(` + CREATE TABLE chaos_multi ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + step INTEGER NOT NULL + ) + `) + }) + + afterEach(async () => { + if (sequelize) { + try { + await sequelize.close() + } catch (_) { /* ignore */ } + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore */ } + } + }) + + it('does not leave partial multi-row state when the connection is killed mid-transaction', async function () { + this.timeout(15000) + + try { + await sequelize.transaction(async (transaction) => { + await sequelize.query('INSERT INTO chaos_multi (step) VALUES (1)', { transaction }) + await sequelize.connectionManager.close() + await sequelize.query('INSERT INTO chaos_multi (step) VALUES (2)', { transaction }) + }) + } catch (_) { + // Expected: connection teardown aborts the open transaction. + } + + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 10000 } + }) + await sequelize.authenticate() + + const [rows] = await sequelize.query('SELECT step FROM chaos_multi ORDER BY step ASC') + expect(rows).to.deep.equal([]) + }) + }) + + describe('duplicate outbox drainer', () => { + let sequelize + let dbPath + let ReconcileOutbox + let FogPlatformReconcileTask + + beforeEach(async () => { + dbPath = path.join(os.tmpdir(), `controller-chaos-drainer-${Date.now()}-${Math.random()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 10000 } + }) + await sequelize.authenticate() + + ReconcileOutbox = defineReconcileOutbox(sequelize, Sequelize.DataTypes) + FogPlatformReconcileTask = defineFogPlatformReconcileTask(sequelize, Sequelize.DataTypes) + await ReconcileOutbox.sync() + await FogPlatformReconcileTask.sync() + + const models = require('../../../src/data/models') + models.ReconcileOutbox = ReconcileOutbox + 
models.FogPlatformReconcileTask = FogPlatformReconcileTask + models.sequelize = sequelize + sandbox.stub(databaseProvider, 'sequelize').value(sequelize) + }) + + afterEach(async () => { + sandbox.restore() + if (sequelize) { + await sequelize.close() + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore */ } + } + }) + + it('creates a single reconcile task when two drain ticks run in parallel', async () => { + await runInTransaction(async (transaction) => { + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: 'fog-chaos-1', + reason: 'manual-retry', + specGeneration: 9 + }, transaction) + }) + + const [resultA, resultB] = await Promise.all([drainOnce(), drainOnce()]) + const processed = (resultA.processed || 0) + (resultB.processed || 0) + expect(processed).to.be.at.least(1) + + const tasks = await FogPlatformReconcileTask.findAll({ + where: { fogUuid: 'fog-chaos-1' } + }) + expect(tasks).to.have.length(1) + + const outboxRow = await ReconcileOutbox.findOne({ + where: { idempotencyKey: 'fp:fog-chaos-1:manual-retry:9' } + }) + expect(outboxRow.processedAt).to.not.be.null + }) + }) + + describe('mysql/postgres HA claim (integration)', function () { + const haUrl = process.env.RECONCILE_CLAIM_HA_URL + + before(function () { + if (!haUrl) { + this.skip() + } + }) + + it('claims each task at most once with two parallel connections', async function () { + this.timeout(30000) + + const dialect = process.env.RECONCILE_CLAIM_HA_DIALECT || 'postgres' + const sequelizeA = new Sequelize(haUrl, { dialect, logging: false }) + const sequelizeB = new Sequelize(haUrl, { dialect, logging: false }) + + const FogTaskA = defineFogPlatformReconcileTask(sequelizeA, Sequelize.DataTypes) + await FogTaskA.sync({ force: true }) + + await FogTaskA.bulkCreate([ + { fogUuid: 'fog-chaos-ha-1', reason: 'spec-changed', status: 'pending' }, + { fogUuid: 'fog-chaos-ha-2', reason: 'spec-changed', status: 'pending' } + ]) + + 
const claimWithConnection = async (conn, controllerUuid) => { + return conn.transaction(async (transaction) => { + const rows = await conn.query( + `SELECT id FROM "FogPlatformReconcileTasks" + WHERE status IN ('pending', 'in_progress') + AND (leader_uuid IS NULL) + ORDER BY id ASC + LIMIT 1 + FOR UPDATE SKIP LOCKED`, + { type: conn.QueryTypes.SELECT, transaction } + ) + if (!rows.length) { + return null + } + await conn.query( + `UPDATE "FogPlatformReconcileTasks" + SET leader_uuid = :leader, claimed_at = NOW(), status = 'in_progress' + WHERE id = :id AND leader_uuid IS NULL`, + { + replacements: { leader: controllerUuid, id: rows[0].id }, + transaction + } + ) + return rows[0].id + }) + } + + const [idA, idB] = await Promise.all([ + claimWithConnection(sequelizeA, 'replica-a'), + claimWithConnection(sequelizeB, 'replica-b') + ]) + + expect(idA).to.be.a('number') + expect(idB).to.be.a('number') + expect(idA).to.not.equal(idB) + + await sequelizeA.close() + await sequelizeB.close() + }) + }) +}) diff --git a/test/src/helpers/transaction-grep-gates.test.js b/test/src/helpers/transaction-grep-gates.test.js new file mode 100644 index 00000000..29bb1391 --- /dev/null +++ b/test/src/helpers/transaction-grep-gates.test.js @@ -0,0 +1,249 @@ +'use strict' + +const { expect } = require('chai') +const { execFileSync } = require('child_process') +const fs = require('fs') +const path = require('path') + +const REPO_ROOT = path.resolve(__dirname, '../../..') + +function grepSrc (pattern, extraArgs = [], searchPath = 'src/') { + try { + return execFileSync('grep', [ + '-R', + '-n', + '--include=*.js', + ...extraArgs, + pattern, + searchPath + ], { + cwd: REPO_ROOT, + encoding: 'utf8', + stdio: ['ignore', 'pipe', 'pipe'] + }).trim() + } catch (error) { + if (error.status === 1) { + return '' + } + throw error + } +} + +/** + * Plan 19-I-C: K8s I/O must run outside runInTransaction callback bodies. 
+ * Allowed external helpers (called after tx commit / between phases): + * nats-service: _applyEnsureNatsK8sExternal, _patchK8sHubConfigMapClusterRoutesExternal, + * _patchK8sJwtBundleExternal + * service-platform-service: applyK8sHubRouterPlan, reconcileK8sServiceExternal, + * watchLoadBalancerWithTimeout, ServicesService._syncK8sServiceResource + */ +function assertNoK8sClientInLabeledTxBlocks (source, labels) { + for (const label of labels) { + const labelToken = `label: '${label}'` + const labelIdx = source.indexOf(labelToken) + expect(labelIdx, `expected runInTransaction block with label ${label}`).to.be.greaterThan(-1) + const txStart = source.lastIndexOf('runInTransaction', labelIdx) + expect(txStart, `expected runInTransaction before label ${label}`).to.be.greaterThan(-1) + const block = source.slice(txStart, labelIdx + labelToken.length) + expect(block).to.not.match(/K8sClient\./, `K8sClient must not appear inside tx block ${label}`) + } +} + +describe('grep gates', () => { + it('has zero fakeTransaction hits in src/', () => { + expect(grepSrc('fakeTransaction')).to.equal('') + }) + + it('has zero bypassQueue hits in src/', () => { + expect(grepSrc('bypassQueue')).to.equal('') + }) + + it('has zero runInTransaction hits in src/data/managers/', () => { + expect(grepSrc('runInTransaction', [], 'src/data/managers/')).to.equal('') + }) + + it('allows sequelize.transaction only in transaction-runner.js', () => { + const hits = grepSrc('sequelize\\.transaction', ['--exclude=transaction-runner.js']) + expect(hits).to.equal('') + }) + + it('passes transaction to SecretService reads inside certificate-service.js', () => { + const hits = grepSrc('SecretService\\.getSecretEndpoint\\([^,)]+\\)', [ + '-E', + '--include=certificate-service.js' + ], 'src/services/certificate-service.js') + expect(hits).to.equal('') + }) + + it('routes fog-token cleanup through runInTransaction', () => { + const jobSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/jobs/fog-token-cleanup-job.js'), + 'utf8' + )
expect(jobSource).to.include('runInTransaction') + expect(jobSource).to.not.match(/FogUsedTokenManager\.cleanupExpiredJtis\(\)/) + }) + + it('threads transaction through cert.js CA load paths without bare nested enqueue', () => { + const certSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/utils/cert.js'), + 'utf8' + ) + expect(certSource).to.match(/async function loadCA \(name, transaction\)/) + expect(certSource).to.match(/async function getCAFromK8sSecret \(secretName, transaction\)/) + expect(certSource).to.match(/async function getCAFromInput \(ca, transaction\)/) + expect(certSource).to.match(/getCAFromInput\(ca, transaction\)/) + expect(certSource).to.match(/loadCA\(ca\.secretName, transaction\)/) + expect(certSource).to.match(/getCAFromK8sSecret\(ca\.secretName, transaction\)/) + expect(certSource).to.match(/await storeCA\(\{ cert, key \}, secretName, transaction\)/) + expect(certSource).to.not.match(/await loadCA\(ca\.secretName\)\s/) + expect(certSource).to.not.match(/await getCAFromK8sSecret\(ca\.secretName\)\s/) + // Branch on caller transaction before enqueueing nested runInTransaction (R126–R128) + expect(certSource).to.match(/async function loadCA[\s\S]*?const secret = transaction[\s\S]*?\? await SecretManager\.getSecret\(name, transaction\)[\s\S]*?: await runInTransaction/) + expect(certSource).to.match(/async function getCAFromK8sSecret[\s\S]*?const localSecret = transaction[\s\S]*?\? 
await SecretManager\.findOne\(\{ name: secretName \}, transaction\)[\s\S]*?: await runInTransaction/) + expect(certSource).to.match(/async function getCAFromK8sSecret[\s\S]*?if \(transaction\) \{[\s\S]*?await CertificateManager\.createCertificateRecord\(caRecord, transaction\)[\s\S]*?\} else \{[\s\S]*?await runInTransaction/) + }) + + it('passes transaction to cert util calls inside certificate-service createCAEndpoint', () => { + const serviceSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/services/certificate-service.js'), + 'utf8' + ) + expect(serviceSource).to.match(/getCAFromK8sSecret\(caData\.secretName, transaction\)/) + expect(serviceSource).to.match(/loadCA\(caData\.secretName, transaction\)/) + }) + + it('keeps K8sClient calls out of nats-service DB transaction bodies', () => { + const natsSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/services/nats-service.js'), + 'utf8' + ) + expect(natsSource).to.match(/async function ensureNatsForFogDb/) + expect(natsSource).to.match(/async function ensureNatsForFogAuthPrepDb/) + expect(natsSource).to.match(/async function ensureNatsForFogTopologyDb/) + expect(natsSource).to.match(/async function ensureNatsForFogDbMutation/) + expect(natsSource).to.match(/label: 'nats\.ensure\.certPrep'/) + expect(natsSource).to.match(/label: 'nats\.ensure\.authPrep'/) + expect(natsSource).to.match(/label: 'nats\.ensure\.topology'/) + expect(natsSource).to.match(/async function cleanupNatsForFogDb/) + expect(natsSource).to.match(/async function _reconcileResolverArtifactsOnceDb/) + expect(natsSource).to.match(/_patchK8sHubConfigMapClusterRoutesExternal/) + expect(natsSource).to.match(/_patchK8sJwtBundleExternal/) + expect(natsSource).to.not.match(/ensureNatsForFogAuthPrepDb[\s\S]*?K8sClient\./) + expect(natsSource).to.not.match(/ensureNatsForFogTopologyDb[\s\S]*?K8sClient\./) + expect(natsSource).to.not.match(/ensureNatsForFogDbMutation[\s\S]*?K8sClient\./) + 
expect(natsSource).to.not.match(/ensureNatsForFogDb[\s\S]*?K8sClient\./) + expect(natsSource).to.not.match(/cleanupNatsForFogDb[\s\S]*?K8sClient\./) + expect(natsSource).to.not.match(/_reconcileResolverArtifactsOnceDb[\s\S]*?K8sClient\./) + }) + + it('keeps K8sClient calls out of service-platform-service DB transaction bodies', () => { + const platformSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/services/service-platform-service.js'), + 'utf8' + ) + expect(platformSource).to.match(/async function reconcileK8sServiceExternal/) + expect(platformSource).to.match(/async function applyK8sHubRouterPlan/) + assertNoK8sClientInLabeledTxBlocks(platformSource, [ + 'servicePlatform.hubLockAcquire', + 'servicePlatform.hubLockRelease', + 'servicePlatform.k8sLoadBalancerEndpoint', + 'servicePlatform.prepare', + 'servicePlatform.hubReconcile', + 'servicePlatform.hubDb', + 'servicePlatform.finalize' + ]) + }) + + it('passes Sequelize transaction inside options for volume-mount association calls', () => { + const volumeMountSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/services/volume-mount-service.js'), + 'utf8' + ) + expect(volumeMountSource).to.match(/getFogs\(\{ transaction \}\)/) + expect(volumeMountSource).to.match(/addVolumeMount\(volumeMount, \{ transaction \}\)/) + expect(volumeMountSource).to.match(/removeVolumeMount\(volumeMount, \{ transaction \}\)/) + expect(volumeMountSource).to.not.match(/getFogs\(\{\}, transaction\)/) + expect(volumeMountSource).to.not.match(/addVolumeMount\(volumeMount\.uuid, transaction\)/) + expect(volumeMountSource).to.not.match(/removeVolumeMount\(volumeMount\.uuid, transaction\)/) + }) + + it('keeps vault HTTP out of secret/configmap/registry transaction bodies', () => { + const secretSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/services/secret-service.js'), + 'utf8' + ) + expect(secretSource).to.include('scheduleVaultDeleteAfterCommit') + 
expect(secretSource).to.not.match(/deleteSecretEndpoint[\s\S]*?SecretHelper\.deleteSecret/) + + const configMapManagerSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/data/managers/config-map-manager.js'), + 'utf8' + ) + expect(configMapManagerSource).to.include('scheduleVaultDeleteAfterCommit') + expect(configMapManagerSource).to.not.match(/deleteConfigMap[\s\S]*?SecretHelper\.deleteSecret/) + + const registryServiceSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/services/registry-service.js'), + 'utf8' + ) + expect(registryServiceSource).to.include('scheduleVaultPromoteAfterCommit') + expect(registryServiceSource).to.include('scheduleVaultDeleteAfterCommit') + expect(registryServiceSource).to.not.match(/createRegistry[\s\S]*?SecretHelper\.encryptSecret\(/) + }) + + it('splits fog platform reconcile into phased runInTransaction labels', () => { + const fogSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/services/fog-platform-service.js'), + 'utf8' + ) + expect(fogSource).to.match(/label: 'fogPlatform\.prepare'/) + expect(fogSource).to.match(/label: 'fogPlatform\.certPrep'/) + expect(fogSource).to.match(/ensureNatsForFogPhased/) + expect(fogSource).to.match(/label: 'fogPlatform\.platform'/) + expect(fogSource).to.match(/label: 'fogPlatform\.finalize'/) + expect(fogSource).to.not.match(/label: 'fogPlatform\.natsEnsure'/) + expect(fogSource).to.not.match(/reconcileFog: TransactionDecorator\.generateTransaction/) + }) + + it('passes transaction inside Sequelize options in nats-instance-manager.js', () => { + const source = fs.readFileSync( + path.join(REPO_ROOT, 'src/data/managers/nats-instance-manager.js'), + 'utf8' + ) + expect(source).to.not.match(/findOne\(\{ where: \{ iofogUuid \} \}, \{ transaction \}\)/) + expect(source).to.match(/transaction\s*\n\s*\}\)/) + }) + + it('does not export dead K8s-in-tx TCP bridge helpers from services-service.js', () => { + const servicesSource = fs.readFileSync( + path.join(REPO_ROOT, 
'src/services/services-service.js'), + 'utf8' + ) + expect(servicesSource).to.not.match(/function _addTcpConnector/) + expect(servicesSource).to.not.match(/function _addTcpListener/) + expect(servicesSource).to.not.match(/function _updateTcpConnector/) + expect(servicesSource).to.not.match(/function _deleteTcpConnector/) + expect(servicesSource).to.not.match(/function _deleteTcpListener/) + expect(servicesSource).to.not.match(/_addTcpConnector,/) + }) + + it('routes OIDC provider adapter through runInTransaction', () => { + const adapterSource = fs.readFileSync( + path.join(REPO_ROOT, 'src/data/adapters/oidc-provider-adapter.js'), + 'utf8' + ) + expect(adapterSource).to.include('runInTransaction') + expect(adapterSource).to.match(/label: 'oidc\.adapter\.upsert'/) + expect(adapterSource).to.match(/\}, \{ transaction \}\)/) + }) + + it('passes transaction inside Sequelize options in volume-mounting-manager.js', () => { + const source = fs.readFileSync( + path.join(REPO_ROOT, 'src/data/managers/volume-mounting-manager.js'), + 'utf8' + ) + expect(source).to.not.match(/findOne\(\{[\s\S]*?\}, \{ transaction \}\)/) + expect(source).to.not.match(/findAll\(\{[\s\S]*?\}, \{ transaction \}\)/) + }) +}) diff --git a/test/src/integration/first-fog-reconcile-sqlite.test.js b/test/src/integration/first-fog-reconcile-sqlite.test.js new file mode 100644 index 00000000..96fba581 --- /dev/null +++ b/test/src/integration/first-fog-reconcile-sqlite.test.js @@ -0,0 +1,100 @@ +'use strict' + +/** + * Plan 19-I-D — first-fog sqlite integration gate (R133). + * + * Skipped in default `npm test` unless RUN_INTEGRATION=1 (full DB migrate + reconcile). 
+ * + * RUN_INTEGRATION=1 npm run test:integration:first-fog + */ + +describe('first-fog reconcile sqlite (R133)', function () { + this.timeout(30000) + + let harness + let busyRetries = 0 + let restoreBusyCounter + + before(function () { + if (process.env.RUN_INTEGRATION !== '1') { + this.skip() + } + }) + + before(async function () { + const { + createFirstFogSqliteHarness, + installBusyRetryCounter + } = require('../../support/first-fog-sqlite-harness') + + busyRetries = 0 + restoreBusyCounter = installBusyRetryCounter(() => { + busyRetries += 1 + }) + harness = await createFirstFogSqliteHarness() + }) + + after(async function () { + if (restoreBusyCounter) { + restoreBusyCounter() + } + if (harness) { + await harness.teardown() + } + }) + + it('reconciles first fog to Ready while concurrent operator API completes under 2s', async function () { + const { expect } = require('chai') + const { runInTransaction, PRIORITY_INTERACTIVE } = require('../../../src/helpers/transaction-runner') + const IofogService = require('../../../src/services/iofog-service') + const UserService = require('../../../src/services/user-service') + const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') + const { drainOnce } = require('../../../src/jobs/reconcile-outbox-drainer-job') + const PlatformReconcileWorkerJob = require('../../../src/jobs/platform-reconcile-worker-job') + const { driveReconcileUntilReady } = require('../../support/first-fog-sqlite-harness') + + const fogPayload = { + name: 'hub-edge', + host: '127.0.0.1', + archId: 1, + containerEngine: 'edgelet', + bluetoothEnabled: false, + abstractedHardwareEnabled: false + } + + let fogUuid + let concurrentElapsedMs + + const createFogPromise = runInTransaction( + (transaction) => IofogService.createFogEndPoint(fogPayload, false, transaction), + { priority: PRIORITY_INTERACTIVE, label: 'integration.createFog' } + ).then((result) => { + fogUuid = result.uuid + }) + + const 
concurrentStart = Date.now() + const concurrentPromise = Promise.all([ + UserService.login({ email: 'admin', password: harness.bootstrapPassword }, false), + runInTransaction( + (transaction) => IofogService.getFogListEndPoint([], false, transaction), + { priority: PRIORITY_INTERACTIVE, label: 'integration.iofogList' } + ) + ]).then(() => { + concurrentElapsedMs = Date.now() - concurrentStart + }) + + await Promise.all([createFogPromise, concurrentPromise]) + + expect(concurrentElapsedMs).to.be.lessThan(2000) + + const status = await driveReconcileUntilReady(fogUuid, { + drainOnce, + processNextFogTask: PlatformReconcileWorkerJob.processNextFogTask, + getStatus: (uuid) => FogPlatformStatusManager.getParsedStatus(uuid) + }) + + expect(status.phase).to.equal('Ready') + expect(status.lastError).to.satisfy((value) => value == null || value === '') + expect(busyRetries).to.equal(0) + }) +}) diff --git a/test/src/jobs/platform-reconcile-worker-job.test.js b/test/src/jobs/platform-reconcile-worker-job.test.js index 2b4b199b..8086f9a7 100644 --- a/test/src/jobs/platform-reconcile-worker-job.test.js +++ b/test/src/jobs/platform-reconcile-worker-job.test.js @@ -5,12 +5,17 @@ const ClusterControllerService = require('../../../src/services/cluster-controll const FogPlatformService = require('../../../src/services/fog-platform-service') const ServicePlatformService = require('../../../src/services/service-platform-service') const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') const ServicePlatformReconcileTaskManager = require('../../../src/data/managers/service-platform-reconcile-task-manager') const ServiceManager = require('../../../src/data/managers/service-manager') const FogManager = require('../../../src/data/managers/iofog-manager') -const databaseProvider = 
require('../../../src/data/providers/database-factory') +const transactionRunner = require('../../../src/helpers/transaction-runner') const PlatformReconcileWorkerJob = require('../../../src/jobs/platform-reconcile-worker-job') +function stubRunInTransaction (sandbox, transaction = {}) { + sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn) => fn(transaction)) +} + describe('platform-reconcile-worker-job', () => { def('sandbox', () => sinon.createSandbox()) @@ -23,10 +28,11 @@ describe('platform-reconcile-worker-job', () => { $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ phase: 'Ready' }) $sandbox.stub(FogPlatformService, 'reconcileFog').resolves({ fogUuid: 'fog-1', phase: 'Ready' }) $sandbox.stub(FogPlatformService, 'reconcileFogDelete') $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) - $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + stubRunInTransaction($sandbox, transaction) await PlatformReconcileWorkerJob.processNextFogTask() @@ -34,36 +40,31 @@ describe('platform-reconcile-worker-job', () => { expect(FogPlatformService.reconcileFogDelete).to.not.have.been.called expect(entity.destroy).to.have.been.calledOnceWith({ where: { id: 11 }, - transaction + transaction: sinon.match.any }) }) - it('passes fakeTransaction into reconcileFog DB layer from worker (no reconcileFog stub)', async () => { + it('passes transaction into reconcileFog prepare phase from worker (no reconcileFog stub)', async () => { const task = { id: 14, fogUuid: 'fog-1', reason: 'spec-changed', attempts: 0 } - const appHelperPath = require.resolve('../../../src/helpers/app-helper') - const decoratorPath = require.resolve('../../../src/decorators/transaction-decorator') - const fogPlatformServicePath = 
require.resolve('../../../src/services/fog-platform-service') - const workerPath = require.resolve('../../../src/jobs/platform-reconcile-worker-job') - - $sandbox.stub(require(appHelperPath), 'isTest').returns(false) - delete require.cache[decoratorPath] - delete require.cache[fogPlatformServicePath] - delete require.cache[workerPath] - const WorkerJob = require('../../../src/jobs/platform-reconcile-worker-job') + const labels = [] $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ phase: 'Progressing' }) $sandbox.stub(FogManager, 'findOneWithTags').resolves(null) $sandbox.stub(FogPlatformReconcileTaskManager, 'recordFogTaskFailure').resolves(task) - const markFailedPath = require.resolve('../../../src/services/fog-platform-service') - $sandbox.stub(require(markFailedPath), 'markReconcileFailed').resolves() - $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + $sandbox.stub(FogPlatformService, 'markReconcileFailed').resolves() + $sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn, options = {}) => { + labels.push(options.label) + return fn({ id: 'worker-tx' }) + }) - await WorkerJob.processNextFogTask() + await PlatformReconcileWorkerJob.processNextFogTask() + expect(labels).to.include('fogPlatform.prepare') expect(FogManager.findOneWithTags).to.have.been.calledOnceWith( { uuid: 'fog-1' }, - sinon.match({ fakeTransaction: true }) + { id: 'worker-tx' } ) }) @@ -76,7 +77,7 @@ describe('platform-reconcile-worker-job', () => { $sandbox.stub(FogPlatformService, 'reconcileFogDelete').resolves({ fogUuid: 'fog-2', deleted: true }) $sandbox.stub(FogPlatformService, 'reconcileFog') $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) - $sandbox.stub(databaseProvider.sequelize, 
'transaction').callsFake(async (fn) => fn({})) + stubRunInTransaction($sandbox) await PlatformReconcileWorkerJob.processNextFogTask() @@ -84,16 +85,76 @@ describe('platform-reconcile-worker-job', () => { expect(FogPlatformService.reconcileFog).to.not.have.been.called }) + it('runs delete reconcile when platform phase is Deleting even if task reason is spec-changed', async () => { + const task = { id: 15, fogUuid: 'fog-4', reason: 'spec-changed', attempts: 0 } + const entity = { destroy: $sandbox.stub().resolves(1) } + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ phase: 'Deleting' }) + $sandbox.stub(FogPlatformService, 'reconcileFogDelete').resolves({ fogUuid: 'fog-4', deleted: true }) + $sandbox.stub(FogPlatformService, 'reconcileFog') + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + stubRunInTransaction($sandbox) + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformService.reconcileFogDelete).to.have.been.calledOnceWith('fog-4') + expect(FogPlatformService.reconcileFog).to.not.have.been.called + }) + + it('runs delete reconcile when reconcileFog skips because fog is deleting', async () => { + const task = { id: 16, fogUuid: 'fog-5', reason: 'manual-retry', attempts: 0 } + const entity = { destroy: $sandbox.stub().resolves(1) } + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ phase: 'Progressing' }) + $sandbox.stub(FogPlatformService, 'reconcileFog').resolves({ skipped: true, reason: 'deleting' }) + $sandbox.stub(FogPlatformService, 'reconcileFogDelete').resolves({ fogUuid: 'fog-5', deleted: true }) + 
$sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + stubRunInTransaction($sandbox) + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformService.reconcileFog).to.have.been.calledOnceWith('fog-5') + expect(FogPlatformService.reconcileFogDelete).to.have.been.calledOnceWith('fog-5') + }) + + it('keeps Deleting phase when delete reconcile fails', async () => { + const task = { id: 17, fogUuid: 'fog-6', reason: 'delete', attempts: 1 } + const error = new Error('nats cleanup failed') + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformService, 'reconcileFogDelete').rejects(error) + $sandbox.stub(FogPlatformReconcileTaskManager, 'recordFogTaskFailure').resolves(task) + $sandbox.stub(FogPlatformStatusManager, 'setPhase').resolves() + $sandbox.stub(FogPlatformService, 'markReconcileFailed') + stubRunInTransaction($sandbox) + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformStatusManager.setPhase).to.have.been.calledOnceWith( + 'fog-6', + 'Deleting', + { lastError: 'nats cleanup failed' }, + sinon.match.any + ) + expect(FogPlatformService.markReconcileFailed).to.not.have.been.called + }) + it('records failure and updates fog status when reconcile throws', async () => { const task = { id: 13, fogUuid: 'fog-3', reason: 'spec-changed', attempts: 2 } const error = new Error('router create failed') $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ phase: 'Progressing' }) $sandbox.stub(FogPlatformService, 'reconcileFog').rejects(error) $sandbox.stub(FogPlatformReconcileTaskManager, 'recordFogTaskFailure').resolves(task) $sandbox.stub(FogPlatformService, 
'markReconcileFailed').resolves() - $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + stubRunInTransaction($sandbox) await PlatformReconcileWorkerJob.processNextFogTask() @@ -155,14 +216,14 @@ describe('platform-reconcile-worker-job', () => { provisioningStatus: 'ready' }) $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) - $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + stubRunInTransaction($sandbox, transaction) await PlatformReconcileWorkerJob.processNextServiceTask() expect(ServicePlatformService.reconcileService).to.have.been.calledOnceWith('api-gateway', task) expect(entity.destroy).to.have.been.calledOnceWith({ where: { id: 21 }, - transaction + transaction: sinon.match.any }) }) @@ -188,7 +249,7 @@ describe('platform-reconcile-worker-job', () => { $sandbox.stub(ServicePlatformService, 'reconcileService').rejects(error) $sandbox.stub(ServicePlatformReconcileTaskManager, 'recordServiceTaskFailure').resolves(task) $sandbox.stub(ServiceManager, 'update').resolves() - $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + stubRunInTransaction($sandbox) await PlatformReconcileWorkerJob.processNextServiceTask() @@ -214,7 +275,7 @@ describe('platform-reconcile-worker-job', () => { $sandbox.stub(ServicePlatformService, 'reconcileService').rejects(error) $sandbox.stub(ServicePlatformReconcileTaskManager, 'recordServiceTaskFailure').resolves(task) $sandbox.stub(ServiceManager, 'update').resolves() - $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + stubRunInTransaction($sandbox) await PlatformReconcileWorkerJob.processNextServiceTask() diff --git a/test/src/middlewares/rvaluesVarSubstitionMiddleware.test.js b/test/src/middlewares/rvaluesVarSubstitionMiddleware.test.js index b0263051..c1d097f5 100755 --- a/test/src/middlewares/rvaluesVarSubstitionMiddleware.test.js +++ 
b/test/src/middlewares/rvaluesVarSubstitionMiddleware.test.js @@ -144,13 +144,21 @@ describe('rvaluesVarSubstitionMiddleware', () => { host: 'myhost01', })) + beforeEach(() => { + const Transaction = require('sequelize/lib/transaction') + $sandbox.matchTransaction = sinon.match.instanceOf(Transaction) + }) + it('performs variable substitutions and applies filter', async () => { await $subject expect($nextfct).to.have.been.called expect(FogService.getFogEndPoint).to.have.been.called expect(FogService.getFogEndPoint).to.have.been.calledWith({ uuid: 'TkLh8wzcxb86CRnHQyJkx6VF468JFd4f' }, false) expect(ApplicationManager.findOnePopulated).to.have.been.calledOnce - expect(ApplicationManager.findOnePopulated).to.have.been.calledWith({ exclude: ['created_at', 'updated_at'] }, { fakeTransaction: true }) + expect(ApplicationManager.findOnePopulated).to.have.been.calledWith( + { exclude: ['created_at', 'updated_at'] }, + $sandbox.matchTransaction + ) expect(MicroservicesService.listMicroservicesEndPoint).to.have.been.called expect(MicroservicesService.listMicroservicesEndPoint).to.have.been.calledWith({ applicationName: $redisAppName }, false) diff --git a/test/src/services/auth-bootstrap-service.test.js b/test/src/services/auth-bootstrap-service.test.js index 2917f5b7..a06a702e 100644 --- a/test/src/services/auth-bootstrap-service.test.js +++ b/test/src/services/auth-bootstrap-service.test.js @@ -82,10 +82,9 @@ function restoreDbModels (snapshot) { function installBootstrapDb (sandbox, state) { const db = require('../../../src/data/models') + const transactionRunner = require('../../../src/helpers/transaction-runner') - db.sequelize = { - transaction: sandbox.stub().callsFake(async () => createNoopTransaction()) - } + sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn) => fn(createNoopTransaction())) db.AuthGroup = { findOrCreate: sandbox.stub().callsFake(async ({ where, defaults }) => { diff --git a/test/src/services/auth-interaction-service.test.js 
b/test/src/services/auth-interaction-service.test.js index 45c855dd..335a1da7 100644 --- a/test/src/services/auth-interaction-service.test.js +++ b/test/src/services/auth-interaction-service.test.js @@ -149,7 +149,7 @@ describe('Auth interaction service', () => { const req = { headers: {} } const res = {} - const completeResult = await AuthInteractionService.complete($interactionUid, req, res, false) + const completeResult = await AuthInteractionService.complete($interactionUid, req, res) expect(completeResult.step).to.equal('complete') }) @@ -167,7 +167,7 @@ describe('Auth interaction service', () => { }, false) try { - await AuthInteractionService.complete($interactionUid, { headers: {} }, {}, false) + await AuthInteractionService.complete($interactionUid, { headers: {} }, {}) expect.fail('expected completion to fail') } catch (error) { expect(error).to.be.instanceOf(Errors.ValidationError) @@ -266,7 +266,7 @@ describe('Auth interaction service', () => { const req = { headers: {} } const res = {} - const completeResult = await AuthInteractionService.complete($interactionUid, req, res, false) + const completeResult = await AuthInteractionService.complete($interactionUid, req, res) expect(completeResult.step).to.equal('complete') expect(completeResult.redirectTo).to.equal('https://controller.test/oidc/auth/test/resume') diff --git a/test/src/services/controller-ms-service.test.js b/test/src/services/controller-ms-service.test.js index dfa2f2f9..c5841bc7 100644 --- a/test/src/services/controller-ms-service.test.js +++ b/test/src/services/controller-ms-service.test.js @@ -281,6 +281,17 @@ describe('Controller MS Service', () => { expect(MicroservicesService.updateChangeTracking).to.have.been.calledWith(true, fogUuid, transaction) }) + it('validates ports after clearing existing mappings on update', async () => { + await $subject + expect(MicroservicePortService.deletePortMappings).to.have.been.calledBefore( + MicroservicePortService.validatePortMappings + ) + 
expect(MicroservicePortService.validatePortMappings).to.have.been.calledWith( + { ports: registerData.ports, iofogUuid: fogUuid }, + transaction + ) + }) + context('when container workload fields are sent on update', () => { def('body', () => ({ ...registerData, diff --git a/test/src/services/fog-platform-service.test.js b/test/src/services/fog-platform-service.test.js index 5e87642e..ecb93053 100644 --- a/test/src/services/fog-platform-service.test.js +++ b/test/src/services/fog-platform-service.test.js @@ -11,6 +11,7 @@ const NatsInstanceManager = require('../../../src/data/managers/nats-instance-ma const NatsConnectionManager = require('../../../src/data/managers/nats-connection-manager') const IofogService = require('../../../src/services/iofog-service') const NatsService = require('../../../src/services/nats-service') +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') const RouterService = require('../../../src/services/router-service') const ServiceBridgeConfig = require('../../../src/services/service-bridge-config') const ChangeTrackingService = require('../../../src/services/change-tracking-service') @@ -19,6 +20,7 @@ const MicroserviceService = require('../../../src/services/microservices-service const ApplicationManager = require('../../../src/data/managers/application-manager') const SecretManager = require('../../../src/data/managers/secret-manager') const FogPublicKeyManager = require('../../../src/data/managers/iofog-public-key-manager') +const transactionRunner = require('../../../src/helpers/transaction-runner') describe('Fog platform service', () => { def('sandbox', () => sinon.createSandbox()) @@ -27,6 +29,17 @@ describe('Fog platform service', () => { afterEach(() => $sandbox.restore()) + function stubPhasedRunInTransaction (sandbox, options = {}) { + const labels = options.labels || null + sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn, runOptions = {}) => { + if (labels && 
runOptions.label) { + labels.push(runOptions.label) + } + return fn(transaction) + }) + return labels + } + describe('.validateSystemFogInvariants()', () => { it('rejects non-interior router mode for system fog', () => { try { @@ -73,6 +86,7 @@ describe('Fog platform service', () => { } beforeEach(() => { + stubPhasedRunInTransaction($sandbox) $sandbox.stub(FogManager, 'findOneWithTags').resolves({ ...fog }) $sandbox.stub(FogManager, 'findOne').resolves({ ...fog }) $sandbox.stub(FogManager, 'update').resolves() @@ -94,9 +108,9 @@ describe('Fog platform service', () => { $sandbox.stub(NatsInstanceManager, 'findByFog').resolves({ id: 5, isLeaf: true }) $sandbox.stub(NatsConnectionManager, 'findAllWithNats').resolves([]) $sandbox.stub(IofogService, '_handleRouterCertificates').resolves() - $sandbox.stub(NatsService, 'ensureNatsForFog').resolves() - $sandbox.stub(NatsService, 'cleanupNatsForFog').resolves() - $sandbox.stub(NatsService, 'enqueueReconcileTask').resolves() + $sandbox.stub(NatsService, 'ensureNatsForFogPhased').resolves({}) + $sandbox.stub(NatsService, 'cleanupNatsForFogPhased').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueNats').resolves() $sandbox.stub(RouterService, 'validateAndReturnUpstreamRouters').resolves([]) $sandbox.stub(RouterService, 'updateRouter').resolves(router) $sandbox.stub(IofogService, '_getRouterMicroserviceConfig').resolves({ bridges: { tcpListeners: {}, tcpConnectors: {} } }) @@ -108,7 +122,7 @@ describe('Fog platform service', () => { it('skips reconcile when platform phase is Deleting', async () => { FogPlatformStatusManager.getParsedStatus.resolves({ fogUuid, phase: 'Deleting', observedGeneration: 1 }) - const result = await FogPlatformService.reconcileFog(fogUuid, transaction) + const result = await FogPlatformService.reconcileFog(fogUuid) expect(result).to.eql({ skipped: true, reason: 'deleting' }) expect(FogPlatformStatusManager.setPhase).to.not.have.been.called @@ -116,10 +130,10 @@ describe('Fog platform 
service', () => { }) it('runs ordered reconcile steps and marks platform Ready', async () => { - const result = await FogPlatformService.reconcileFog(fogUuid, transaction) + const result = await FogPlatformService.reconcileFog(fogUuid) expect(IofogService._handleRouterCertificates).to.have.been.calledOnce - expect(NatsService.ensureNatsForFog).to.have.been.calledOnce + expect(NatsService.ensureNatsForFogPhased).to.have.been.calledOnce expect(RouterService.updateRouter).to.have.been.calledOnce expect(ServiceBridgeConfig.recomputeServiceBridgeConfig).to.have.been.calledOnce expect(FogPlatformStatusManager.setPhase).to.have.been.calledWith( @@ -137,31 +151,51 @@ describe('Fog platform service', () => { }) it('is safe to reconcile the same generation twice', async () => { - await FogPlatformService.reconcileFog(fogUuid, transaction) - await FogPlatformService.reconcileFog(fogUuid, transaction) + await FogPlatformService.reconcileFog(fogUuid) + await FogPlatformService.reconcileFog(fogUuid) expect(RouterService.updateRouter).to.have.been.calledTwice expect(ServiceBridgeConfig.recomputeServiceBridgeConfig).to.have.been.calledTwice }) - it('accepts worker call shape (fogUuid only) with decorator fakeTransaction outside test mode', async () => { - const appHelperPath = require.resolve('../../../src/helpers/app-helper') - const decoratorPath = require.resolve('../../../src/decorators/transaction-decorator') - const fogPlatformServicePath = require.resolve('../../../src/services/fog-platform-service') - - $sandbox.stub(require(appHelperPath), 'isTest').returns(false) - delete require.cache[decoratorPath] - delete require.cache[fogPlatformServicePath] - const FreshFogPlatformService = require('../../../src/services/fog-platform-service') + it('uses phased runInTransaction labels from worker call shape (fogUuid only)', async () => { + const labels = [] + transactionRunner.runInTransaction.callsFake(async (fn, runOptions = {}) => { + if (runOptions.label) { + 
labels.push(runOptions.label) + } + return fn(transaction) + }) FogPlatformStatusManager.getParsedStatus.resolves({ fogUuid, phase: 'Deleting', observedGeneration: 1 }) - const result = await FreshFogPlatformService.reconcileFog(fogUuid) + const result = await FogPlatformService.reconcileFog(fogUuid) expect(result).to.eql({ skipped: true, reason: 'deleting' }) - expect(FogManager.findOneWithTags).to.have.been.calledWith( - { uuid: fogUuid }, - sinon.match({ fakeTransaction: true }) + expect(labels).to.deep.equal(['fogPlatform.prepare']) + }) + + it('runs cert, nats, platform, and finalize in separate transaction phases', async () => { + const labels = [] + transactionRunner.runInTransaction.callsFake(async (fn, runOptions = {}) => { + if (runOptions.label) { + labels.push(runOptions.label) + } + return fn(transaction) + }) + + await FogPlatformService.reconcileFog(fogUuid) + + expect(labels).to.deep.equal([ + 'fogPlatform.prepare', + 'fogPlatform.certPrep', + 'fogPlatform.platform', + 'fogPlatform.finalize' + ]) + expect(NatsService.ensureNatsForFogPhased).to.have.been.calledOnce + expect(NatsService.ensureNatsForFogPhased).to.have.been.calledWith( + sinon.match.any, + sinon.match.any ) }) @@ -170,9 +204,9 @@ describe('Fog platform service', () => { .onCall(0).resolves(null) .onCall(1).resolves({ id: 5, isLeaf: true }) - await FogPlatformService.reconcileFog(fogUuid, transaction) + await FogPlatformService.reconcileFog(fogUuid) - expect(NatsService.enqueueReconcileTask).to.have.been.calledWithMatch({ + expect(ReconcileOutboxManager.enqueueNats).to.have.been.calledWithMatch({ reason: 'cluster-routes-changed', fogUuids: [fogUuid] }, transaction) diff --git a/test/src/services/iofog-service.test.js b/test/src/services/iofog-service.test.js index 9f8199d7..41c0eb5d 100644 --- a/test/src/services/iofog-service.test.js +++ b/test/src/services/iofog-service.test.js @@ -27,7 +27,7 @@ const Errors = require('../../../src/helpers/errors') const config = 
require('../../../src/config') const FogPlatformSpecManager = require('../../../src/data/managers/fog-platform-spec-manager') const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') -const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') const isCLI = false const transaction = {} @@ -89,7 +89,7 @@ function stubCreateFogDeps (sandbox, { uuid = 'testUuid', existingFogs = [{ uuid sandbox.stub(ioFogService, '_handleRouterCertificates').resolves() sandbox.stub(FogPlatformSpecManager, 'upsertSpec').resolves({ fogUuid: uuid, generation: 1 }) sandbox.stub(FogPlatformStatusManager, 'ensurePending').resolves() - sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() + sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves() } function stubUpdateFogDeps (sandbox, oldFog) { @@ -113,7 +113,7 @@ function stubUpdateFogDeps (sandbox, oldFog) { sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves(null) sandbox.stub(FogPlatformSpecManager, 'upsertSpec').resolves({ fogUuid: oldFog.uuid, generation: 2 }) sandbox.stub(FogPlatformStatusManager, 'ensurePending').resolves() - sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() + sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves() } describe('ioFog Service', () => { @@ -172,7 +172,7 @@ describe('ioFog Service', () => { natsMode: 'leaf' }) expect(FogPlatformStatusManager.ensurePending).to.have.been.calledWith(uuid, transaction) - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledWith({ fogUuid: uuid, reason: 'spec-changed', specGeneration: 1 @@ -295,7 +295,7 @@ describe('ioFog Service', () => { await $subject 
expect(FogPlatformSpecManager.upsertSpec).to.have.been.calledOnce expect(FogPlatformStatusManager.ensurePending).to.have.been.calledWith(uuid, transaction) - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledWith({ fogUuid: uuid, reason: 'spec-changed', specGeneration: 2 @@ -416,7 +416,7 @@ describe('ioFog Service', () => { $sandbox.stub(ioFogManager, 'delete').resolves() $sandbox.stub(NatsService, 'cleanupNatsForFog').resolves() $sandbox.stub(FogPlatformStatusManager, 'setPhase').resolves() - $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves() }) it('marks fog deleting and enqueues async teardown', async () => { @@ -424,7 +424,7 @@ describe('ioFog Service', () => { expect(Validator.validate).to.have.been.calledWith(fogData, Validator.schemas.iofogDelete) expect(result).to.eql({ uuid }) expect(FogPlatformStatusManager.setPhase).to.have.been.calledWith(uuid, 'Deleting', {}, transaction) - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledWith({ fogUuid: uuid, reason: 'delete' }, transaction) @@ -459,7 +459,7 @@ describe('ioFog Service', () => { generation: 4, spec: { routerMode: 'edge', natsMode: 'leaf' } }) - $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves() }) it('resets failed platform status and enqueues manual retry', async () => { @@ -471,7 +471,7 @@ describe('ioFog Service', () => { { lastError: null }, transaction ) - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledWith({ fogUuid: 
uuid, reason: 'manual-retry', specGeneration: 4 diff --git a/test/src/services/microservice-port.test.js b/test/src/services/microservice-port.test.js index 4527861c..d706ecaa 100644 --- a/test/src/services/microservice-port.test.js +++ b/test/src/services/microservice-port.test.js @@ -22,7 +22,7 @@ describe('Microservice Port Service', () => { const iofogUuid = 'fog-uuid' const agent = { uuid: iofogUuid, - getMicroservice: () => Promise.resolve([]) + getMicroservice: sinon.stub().resolves([]) } def('microserviceData', () => ({ @@ -42,4 +42,47 @@ describe('Microservice Port Service', () => { ) }) }) + + describe('.validatePortMappings() duplicate check', () => { + const transaction = { id: 'tx-1' } + const iofogUuid = 'fog-uuid' + const occupiedPort = 8080 + let agent + let microserviceOnAgent + + def('microserviceData', () => ({ + iofogUuid, + ports: [{ internal: 80, external: occupiedPort }] + })) + def('subject', () => MicroservicePortService.validatePortMappings($microserviceData, transaction)) + + beforeEach(() => { + microserviceOnAgent = { + uuid: 'other-ms-uuid', + getPorts: sinon.stub().resolves([{ portExternal: occupiedPort }]) + } + agent = { + uuid: iofogUuid, + getMicroservice: sinon.stub().resolves([microserviceOnAgent]) + } + $sandbox.stub(ioFogManager, 'findOne').resolves(agent) + }) + + it('passes the caller transaction to association reads', async () => { + try { + await $subject + } catch (error) { + // expected when port is taken + } + expect(agent.getMicroservice).to.have.been.calledWith({ transaction }) + expect(microserviceOnAgent.getPorts).to.have.been.calledWith({ transaction }) + }) + + it('rejects when external port is already allocated on the agent', () => { + return expect($subject).to.be.rejectedWith( + Errors.ValidationError, + /Port '8080' is not available/ + ) + }) + }) }) diff --git a/test/src/services/microservices-service.test.js b/test/src/services/microservices-service.test.js index ae547fe1..6d8a8ac7 100644 --- 
a/test/src/services/microservices-service.test.js +++ b/test/src/services/microservices-service.test.js @@ -521,7 +521,7 @@ describe('Microservices Service', () => { const result = await $subject expect(Validator.validate).to.have.been.calledWith(portMappingData, Validator.schemas.portsCreate) expect(MicroserviceManager.findMicroserviceOnGet).to.have.been.calledWith({ uuid: msvcUuid }, transaction) - expect(MicroservicePortService.validatePortMapping).to.have.been.calledWith(agent, portMappingData, {}, transaction) + expect(MicroservicePortService.validatePortMapping).to.have.been.calledWith(agent, portMappingData, transaction) expect(MicroservicePortService.createPortMapping).to.have.been.calledWith(microservice, portMappingData, transaction) expect(result).to.equal(createdMapping) }) diff --git a/test/src/services/nats-auth-service.test.js b/test/src/services/nats-auth-service.test.js index a665254f..576c165f 100644 --- a/test/src/services/nats-auth-service.test.js +++ b/test/src/services/nats-auth-service.test.js @@ -10,6 +10,7 @@ const ApplicationManager = require('../../../src/data/managers/application-manag const NatsOperatorManager = require('../../../src/data/managers/nats-operator-manager') const SecretService = require('../../../src/services/secret-service') const NatsService = require('../../../src/services/nats-service') +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') const NatsAuthService = require('../../../src/services/nats-auth-service') const NatsSystemRules = require('../../../src/config/nats-system-rules') const { createOperator, createAccount } = require('@nats-io/nkeys') @@ -73,7 +74,7 @@ describe('NATS Auth Service', () => { return Promise.resolve(null) }) $sandbox.stub(NatsUserRuleManager, 'findOne').resolves(defaultUserRule) - $sandbox.stub(NatsService, 'enqueueReconcileTask').callsFake(() => Promise.resolve()) + $sandbox.stub(ReconcileOutboxManager, 'enqueueNats').callsFake(() => 
Promise.resolve()) }) context('when existing user has same account and same rule (ensure-only)', () => { @@ -183,7 +184,7 @@ describe('NATS Auth Service', () => { createdUser = null $sandbox.stub(NatsAccountRuleManager, 'updateOrCreate').resolves() $sandbox.stub(NatsUserRuleManager, 'updateOrCreate').resolves() - $sandbox.stub(NatsService, 'enqueueReconcileTask').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueNats').resolves() $sandbox.stub(NatsOperatorManager, 'findOne').resolves(operator) $sandbox.stub(SecretService, 'getSecretEndpoint').callsFake((secretName) => { if (secretName === operator.seedSecretName) { diff --git a/test/src/services/nats-service.test.js b/test/src/services/nats-service.test.js index 36b0a93b..9258f955 100644 --- a/test/src/services/nats-service.test.js +++ b/test/src/services/nats-service.test.js @@ -26,7 +26,7 @@ describe('NATS Service', () => { const natsInstance = { id: 77, isLeaf: true, isHub: false } const microservices = [{ uuid: 'ms-1' }] - def('subject', () => NatsService.cleanupNatsForFog(fog, transaction)) + def('subject', () => NatsService.cleanupNatsForFogDb(fog, transaction)) beforeEach(() => { $sandbox.stub(NatsInstanceManager, 'findByFog').returns(Promise.resolve(natsInstance)) @@ -221,4 +221,179 @@ describe('NATS Service', () => { expect(payload.nats.operator_service_urls).to.eql(['https://hub:4222']) }) }) + + describe('K8s I/O outside transactions (R-04–R-06)', () => { + const k8sClient = require('../../../src/utils/k8s-client') + const config = require('../../../src/config') + + function loadNatsServiceWithTxStub (runInTransactionImpl) { + const txRunnerPath = require.resolve('../../../src/helpers/transaction-runner') + const natsPath = require.resolve('../../../src/services/nats-service') + delete require.cache[natsPath] + delete require.cache[txRunnerPath] + const transactionRunner = require('../../../src/helpers/transaction-runner') + $sandbox.stub(transactionRunner, 
'runInTransaction').callsFake(runInTransactionImpl) + return require('../../../src/services/nats-service') + } + + function stubKubernetesControlPlane () { + $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { + if (key === 'app.ControlPlane') return 'kubernetes' + if (key === 'nats.enabled') return false + return defaultValue + }) + } + + function stubCleanupDb (NatsServiceFresh, natsInstance) { + $sandbox.stub(NatsInstanceManager, 'findByFog').resolves(natsInstance) + $sandbox.stub(NatsInstanceManager, 'findAll').resolves([]) + $sandbox.stub(NatsAccountManager, 'findOne').resolves({ id: 1, isSystem: true }) + $sandbox.stub(NatsUserManager, 'findOne').resolves({ credsSecretName: 'nats-creds-sys-admin' }) + $sandbox.stub(NatsConnectionManager, 'delete').resolves() + $sandbox.stub(NatsInstanceManager, 'delete').resolves() + $sandbox.stub(NatsAuthService, 'deleteServerSysUserForFog').resolves() + $sandbox.stub(MicroserviceManager, 'findAll').resolves([]) + $sandbox.stub(VolumeMappingManager, 'delete').resolves() + $sandbox.stub(VolumeMountService, 'unlinkVolumeMountEndpoint').resolves() + $sandbox.stub(VolumeMountService, 'findVolumeMountedFogNodes').resolves([]) + $sandbox.stub(VolumeMountService, 'deleteVolumeMountEndpoint').resolves() + $sandbox.stub(ConfigMapService, 'deleteConfigMapEndpoint').resolves() + $sandbox.stub(SecretService, 'deleteSecretEndpoint').resolves() + return NatsServiceFresh + } + + it('cleanupNatsForFog applies K8s patch and rollout after runInTransaction', async () => { + stubKubernetesControlPlane() + const fog = { uuid: 'fog-1', name: 'local-agent' } + const natsInstance = { id: 77, isLeaf: false, isHub: false } + const callOrder = [] + const txLabels = [] + + const NatsServiceFresh = loadNatsServiceWithTxStub(async (fn, runOptions = {}) => { + if (runOptions.label) { + txLabels.push(runOptions.label) + } + callOrder.push('tx-start') + const result = await fn({}) + callOrder.push('tx-end') + return result + }) + 
stubCleanupDb(NatsServiceFresh, natsInstance) + $sandbox.stub(k8sClient, 'getConfigMap').callsFake(async () => { + callOrder.push('k8s-get') + return { data: { 'server.conf': 'routes: []' } } + }) + $sandbox.stub(k8sClient, 'patchConfigMap').callsFake(async () => { + callOrder.push('k8s-patch') + }) + $sandbox.stub(k8sClient, 'rolloutStatefulSet').callsFake(async () => { + callOrder.push('k8s-rollout') + }) + + await NatsServiceFresh.cleanupNatsForFog(fog) + + expect(txLabels).to.deep.equal(['nats.cleanupForFog']) + expect(callOrder).to.deep.equal(['tx-start', 'tx-end', 'k8s-get', 'k8s-patch', 'k8s-rollout']) + }) + + it('cleanupNatsForFog reuses parent transaction when provided', async () => { + const fog = { uuid: 'fog-1', name: 'local-agent' } + const parentTx = { + commit: $sandbox.stub(), + rollback: $sandbox.stub(), + afterCommit: $sandbox.stub() + } + const txLabels = [] + const natsInstance = { id: 77, isLeaf: false, isHub: false } + + const NatsServiceFresh = loadNatsServiceWithTxStub(async (fn, runOptions = {}) => { + if (runOptions.label) { + txLabels.push(runOptions.label) + } + return fn({}) + }) + stubCleanupDb(NatsServiceFresh, natsInstance) + + await NatsServiceFresh.cleanupNatsForFog(fog, parentTx) + + expect(txLabels).to.deep.equal([]) + expect(parentTx.afterCommit).to.have.been.calledOnce + }) + + it('ensureNatsForFog uses phased cert-prep, auth-prep, and topology transaction labels', async () => { + const txLabels = [] + + const NatsServiceFresh = loadNatsServiceWithTxStub(async (fn, runOptions = {}) => { + if (runOptions.label) { + txLabels.push(runOptions.label) + } + if (runOptions.label === 'nats.ensure.certPrep') { + return { + serverCertName: 'nats-server-local-agent', + mqttCertName: 'nats-mqtt-server-local-agent', + jetstreamKey: { secretName: 'jsk', jsk: 'key' } + } + } + if (runOptions.label === 'nats.ensure.authPrep') { + return { + mode: 'leaf', + isHub: false, + isLeaf: true, + serverPort: 4222, + leafPort: 7422, + clusterPort: 
6222, + mqttPort: 1883, + httpPort: 8222, + configMapName: 'nats-server-conf-local-agent', + configKey: 'server.conf', + template: 'leaf', + jwtBundleConfigMapName: 'nats-jwt-bundle-local-agent', + sysCredsSecretName: null + } + } + if (runOptions.label === 'nats.ensure.topology') { + return { microservice: { uuid: 'ms-1' }, k8sHubPatch: null } + } + return fn({}) + }) + + await NatsServiceFresh.ensureNatsForFog( + { uuid: 'fog-1', name: 'local-agent' }, + { mode: 'leaf' } + ) + + expect(txLabels).to.deep.equal(['nats.ensure.certPrep', 'nats.ensure.authPrep', 'nats.ensure.topology']) + }) + + it('reconcileResolverArtifacts applies JWT bundle K8s patch after runInTransaction', async () => { + stubKubernetesControlPlane() + const callOrder = [] + + const NatsServiceFresh = loadNatsServiceWithTxStub(async (fn) => { + callOrder.push('tx-start') + const result = await fn({}) + callOrder.push('tx-end') + return result + }) + + $sandbox.stub(require('../../../src/data/managers/iofog-manager'), 'findAll').resolves([]) + $sandbox.stub(require('../../../src/data/managers/application-manager'), 'findAll').resolves([]) + $sandbox.stub(NatsInstanceManager, 'findAll').resolves([]) + $sandbox.stub(NatsAccountManager, 'findOne').resolves({ id: 1, isSystem: true }) + $sandbox.stub(require('../../../src/services/nats-auth-service'), 'ensureSystemAccount').resolves() + $sandbox.stub(ConfigMapManager, 'getConfigMap').resolves(null) + $sandbox.stub(ConfigMapService, 'createConfigMapEndpoint').resolves({ name: 'iofog-nats-jwt-bundle' }) + $sandbox.stub(k8sClient, 'getConfigMap').callsFake(async () => { + callOrder.push('k8s-get') + return null + }) + $sandbox.stub(k8sClient, 'patchConfigMap').callsFake(async () => { + callOrder.push('k8s-patch') + }) + + await NatsServiceFresh.reconcileResolverArtifacts({ fogUuids: [] }) + + expect(callOrder).to.deep.equal(['tx-start', 'tx-end', 'k8s-get', 'k8s-patch']) + }) + }) }) diff --git a/test/src/services/rbac-service.test.js 
b/test/src/services/rbac-service.test.js index 1d12d9a5..8e6f015a 100644 --- a/test/src/services/rbac-service.test.js +++ b/test/src/services/rbac-service.test.js @@ -68,6 +68,7 @@ describe('Rbac Service', () => { describe('.updateRoleEndpoint()', () => { const roleName = 'custom-role' const roleId = 99 + const roleRef = { kind: 'Role', name: roleName } const roleData = { rules: [{ apiGroups: ['edgelet.iofog.org/v1'], @@ -87,11 +88,14 @@ describe('Rbac Service', () => { .onFirstCall().resolves({ id: roleId, name: roleName }) .onSecondCall().resolves({ id: roleId, name: roleName }) $sandbox.stub(RbacRoleManager, 'updateRole').resolves({ name: roleName }) - $sandbox.stub(RbacRoleBindingManager, 'findAll').resolves([]) + $sandbox.stub(RbacRoleBindingManager, 'findAll').resolves([ + { name: 'binding1', roleRef: roleRef } + ]) + $sandbox.stub(RbacRoleBindingManager, 'updateRoleBinding').resolves({}) $sandbox.stub(RbacServiceAccountManager, 'findAll').resolves([ - { name: 'sa1', microserviceUuid: 'msvc-1', applicationId: 1, roleRef: roleName }, - { name: 'sa2', microserviceUuid: 'msvc-2', applicationId: 1, roleRef: roleName }, - { name: 'sa3', microserviceUuid: 'msvc-3', applicationId: 2, roleRef: roleName } + { name: 'sa1', microserviceUuid: 'msvc-1', applicationId: 1, roleRef }, + { name: 'sa2', microserviceUuid: 'msvc-2', applicationId: 1, roleRef }, + { name: 'sa3', microserviceUuid: 'msvc-3', applicationId: 2, roleRef } ]) $sandbox.stub(ApplicationManager, 'findOne') .withArgs({ id: 1 }).resolves({ name: 'app1' }) @@ -118,6 +122,50 @@ describe('Rbac Service', () => { transaction ) }) + + it('refreshes roleRef.name on linked bindings and service accounts', async () => { + await $subject + expect(RbacRoleBindingManager.updateRoleBinding).to.have.been.calledOnceWith( + 'binding1', + { roleRef: { kind: 'Role', name: roleName } }, + transaction + ) + expect(RbacServiceAccountManager.updateServiceAccount).to.have.been.calledWith( + 'app1', + 'sa1', + { roleRef: { kind: 
'Role', name: roleName } }, + transaction + ) + }) + + context('when the role is renamed', () => { + const renamedRole = 'custom-role-v2' + const renameRoleData = { name: renamedRole } + + def('subject', () => RbacService.updateRoleEndpoint(roleName, renameRoleData, transaction)) + + beforeEach(() => { + RbacRoleManager.findOne.restore() + $sandbox.stub(RbacRoleManager, 'findOne') + .onFirstCall().resolves({ id: roleId, name: roleName }) + .onSecondCall().resolves({ id: roleId, name: renamedRole }) + }) + + it('rewrites roleRef.name on linked bindings and service accounts', async () => { + await $subject + expect(RbacRoleBindingManager.updateRoleBinding).to.have.been.calledOnceWith( + 'binding1', + { roleRef: { kind: 'Role', name: renamedRole } }, + transaction + ) + expect(RbacServiceAccountManager.updateServiceAccount).to.have.been.calledWith( + 'app2', + 'sa3', + { roleRef: { kind: 'Role', name: renamedRole } }, + transaction + ) + }) + }) }) describe('.updateRoleBindingEndpoint()', () => { diff --git a/test/src/services/registry-service.test.js b/test/src/services/registry-service.test.js index 95bc9f28..c2e6e23a 100644 --- a/test/src/services/registry-service.test.js +++ b/test/src/services/registry-service.test.js @@ -9,6 +9,7 @@ const FogManager = require('../../../src/data/managers/iofog-manager') const ChangeTrackingService = require('../../../src/services/change-tracking-service') const MicroserviceManager = require('../../../src/data/managers/microservice-manager') const SecretHelper = require('../../../src/helpers/secret-helper') +const vaultManager = require('../../../src/vault/vault-manager') const ErrorMessages = require('../../../src/helpers/error-messages') const Errors = require('../../../src/helpers/errors') @@ -54,12 +55,13 @@ describe('Registry Service', () => { $sandbox.stub(Validator, 'validate').resolves(true) $sandbox.stub(AppHelper, 'deleteUndefinedFields').callsFake((value) => value) $sandbox.stub(RegistryManager, 
'create').resolves(created) - $sandbox.stub(SecretHelper, 'encryptSecret').resolves('encrypted-password') + $sandbox.stub(SecretHelper, 'encryptSecretInternal').resolves('encrypted-password') + $sandbox.stub(SecretHelper, 'encryptSecret').resolves('vault-ref') $sandbox.stub(RegistryManager, 'update').resolves() stubChangeTrackingDeps($sandbox) }) - it('validates input, encrypts password, and returns registry id', async () => { + it('validates input, encrypts password internally in tx, and returns registry id', async () => { const result = await $subject expect(Validator.validate).to.have.been.calledWith(registryData, Validator.schemas.registryCreate) expect(RegistryManager.create).to.have.been.calledWithMatch({ @@ -67,11 +69,11 @@ describe('Registry Service', () => { username: registryData.username, userEmail: registryData.email }, transaction) - expect(SecretHelper.encryptSecret).to.have.been.calledWith( + expect(SecretHelper.encryptSecretInternal).to.have.been.calledWith( { value: registryData.password }, - 'registry-16', - 'registry' + 'registry-16' ) + expect(SecretHelper.encryptSecret).to.not.have.been.called expect(ChangeTrackingService.update).to.have.been.calledWith( 'fog-uuid', ChangeTrackingService.events.registries, @@ -95,6 +97,7 @@ describe('Registry Service', () => { it('skips password encryption', async () => { await $subject + expect(SecretHelper.encryptSecretInternal).to.not.have.been.called expect(SecretHelper.encryptSecret).to.not.have.been.called expect(RegistryManager.update).to.not.have.been.called }) @@ -231,6 +234,7 @@ describe('Registry Service', () => { context('when password is cleared and vault reference exists', () => { beforeEach(() => { + $sandbox.stub(vaultManager, 'isEnabled').returns(true) RegistryManager.findOne.resolves({ ...existing, password: 'vault:ref' }) $sandbox.stub(SecretHelper, 'isVaultReference').returns(true) $sandbox.stub(SecretHelper, 'deleteSecret').resolves() diff --git a/test/src/services/router-service.test.js 
b/test/src/services/router-service.test.js index a385075a..4a6f72e7 100644 --- a/test/src/services/router-service.test.js +++ b/test/src/services/router-service.test.js @@ -340,6 +340,39 @@ describe('Router Service', () => { return expect(true).to.eql(false) }) }) + + it('preserves existing bridges when regenerating router config', async () => { + const preservedBridges = { + tcpConnectors: { + 'api-connector': { + name: 'api-connector', + host: '127.0.0.1', + port: '8080', + address: 'api' + } + }, + tcpListeners: { + 'api-listener': { + name: 'api-listener', + port: '9001', + address: 'api' + } + } + } + MicroserviceManager.findOne.resolves({ + id: 1, + uuid: 'routerMsvcUuid', + iofogUuid: router.iofogUuid, + catalogItemId: routerCatalogItem.id, + config: JSON.stringify({ bridges: preservedBridges }) + }) + + await RouterService.updateConfig(routerID, containerEngine, transaction) + + expect(MicroserviceManager.update).to.have.been.called + const updatedConfig = JSON.parse(MicroserviceManager.update.firstCall.args[1].config) + expect(updatedConfig.bridges).to.eql(preservedBridges) + }) }) describe('.updateRouter', () => { diff --git a/test/src/services/service-bridge-config.test.js b/test/src/services/service-bridge-config.test.js index 949ebdfe..810d3f12 100644 --- a/test/src/services/service-bridge-config.test.js +++ b/test/src/services/service-bridge-config.test.js @@ -15,7 +15,7 @@ describe('Service bridge config', () => { afterEach(() => $sandbox.restore()) describe('.stripServiceDerivedBridges()', () => { - it('removes service-derived listeners and connectors while preserving router bridges', () => { + it('removes service-derived listeners while preserving connectors and router bridges', () => { const baseConfig = { bridges: { tcpListeners: { @@ -35,6 +35,7 @@ describe('Service bridge config', () => { 'fog-amqp': { name: 'fog-amqp', port: '5672', address: 'amqp' } }) expect(stripped.bridges.tcpConnectors).to.eql({ + 'api-connector': { name: 'api-connector', 
host: 'hub', port: '8080' }, 'upstream-router': { name: 'upstream-router', host: '10.0.0.2', port: '55671' } }) }) @@ -106,5 +107,40 @@ describe('Service bridge config', () => { expect(result.bridges.tcpListeners).to.eql({}) expect(MicroserviceManager.update).to.have.been.calledOnce }) + + it('preserves hub-managed tcpConnectors while rebuilding listeners', async () => { + const baseConfig = { + bridges: { + tcpListeners: { + 'stale-listener': { name: 'stale-listener', port: '8000', address: 'stale' } + }, + tcpConnectors: { + 'api-connector': { + name: 'api-connector', + host: '127.0.0.1', + port: '8080', + address: 'api', + processId: 'ms-uuid' + } + } + } + } + + const result = await ServiceBridgeConfig.recomputeServiceBridgeConfig(fogUuid, baseConfig, transaction) + + expect(result.bridges.tcpListeners).to.eql({ + 'api-listener': { name: 'api-listener', port: '9001', address: 'api' }, + 'mqtt-listener': { name: 'mqtt-listener', port: '9002', address: 'mqtt' } + }) + expect(result.bridges.tcpConnectors).to.eql({ + 'api-connector': { + name: 'api-connector', + host: '127.0.0.1', + port: '8080', + address: 'api', + processId: 'ms-uuid' + } + }) + }) }) }) diff --git a/test/src/services/service-platform-service.test.js b/test/src/services/service-platform-service.test.js index 2ab1c874..fd4c3561 100644 --- a/test/src/services/service-platform-service.test.js +++ b/test/src/services/service-platform-service.test.js @@ -4,12 +4,13 @@ const sinon = require('sinon') const ServicePlatformService = require('../../../src/services/service-platform-service') const ServiceManager = require('../../../src/data/managers/service-manager') const ServicePlatformReconcileTaskManager = require('../../../src/data/managers/service-platform-reconcile-task-manager') -const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') const 
HubRouterConfigLockManager = require('../../../src/data/managers/hub-router-config-lock-manager') const RouterManager = require('../../../src/data/managers/router-manager') const ServicesService = require('../../../src/services/services-service') const K8sClient = require('../../../src/utils/k8s-client') const config = require('../../../src/config') +const transactionRunner = require('../../../src/helpers/transaction-runner') describe('Service platform service', () => { def('sandbox', () => sinon.createSandbox()) @@ -58,6 +59,14 @@ describe('Service platform service', () => { } beforeEach(() => { + $sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn, options = {}) => { + const result = await fn(transaction) + if (options.label === 'servicePlatform.hubReconcile') { + expect(K8sClient.getConfigMap).to.not.have.been.called + expect(K8sClient.patchConfigMap).to.not.have.been.called + } + return result + }) $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { if (key === 'app.uuid') { return 'controller-uuid-1' @@ -92,31 +101,30 @@ describe('Service platform service', () => { } }) $sandbox.stub(K8sClient, 'patchConfigMap').resolves() - $sandbox.stub(ServicesService, '_updateK8sService').resolves() - $sandbox.stub(K8sClient, 'watchLoadBalancerIP').resolves('203.0.113.10') + $sandbox.stub(ServicesService, '_syncK8sServiceResource').resolves('203.0.113.10') $sandbox.stub(ServicesService, 'handleServiceDistribution').resolves(['fog-a']) - $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves({ id: 1 }) + $sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves({ id: 1 }) $sandbox.stub(ServiceManager, 'findOneWithTags').resolves({ ...service, tags: [...service.tags] }) $sandbox.stub(ServiceManager, 'update').resolves() $sandbox.stub(ServicePlatformReconcileTaskManager, 'delete').resolves() }) it('runs hub reconcile, fan-out, and marks provisioning ready', async () => { - const result = await 
ServicePlatformService.reconcileService(serviceName, task, transaction) + const result = await ServicePlatformService.reconcileService(serviceName, task) expect(HubRouterConfigLockManager.tryAcquire).to.have.been.calledOnce - expect(K8sClient.patchConfigMap).to.have.been.called - expect(ServicesService._updateK8sService).to.have.been.calledOnce - expect(K8sClient.watchLoadBalancerIP).to.have.been.calledOnce + expect(K8sClient.getConfigMap).to.have.been.calledOnce + expect(K8sClient.patchConfigMap).to.have.been.calledOnce + expect(ServicesService._syncK8sServiceResource).to.have.been.calledOnce expect(HubRouterConfigLockManager.release).to.have.been.calledOnce - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledWith({ fogUuid: 'fog-a', reason: 'service-changed' - }, transaction) + }, sinon.match.any) expect(ServiceManager.update).to.have.been.calledWith( { name: serviceName }, { provisioningStatus: 'ready', provisioningError: null }, - transaction + sinon.match.any ) expect(result.provisioningStatus).to.equal('ready') }) @@ -136,27 +144,27 @@ describe('Service platform service', () => { }) ServicesService.handleServiceDistribution.resolves(['fog-a', 'fog-b', 'fog-c']) - await ServicePlatformService.reconcileService(serviceName, tagChangeTask, transaction) + await ServicePlatformService.reconcileService(serviceName, tagChangeTask) expect(ServicesService.handleServiceDistribution).to.have.been.calledWith( ['site-a', 'site-b'], - transaction + sinon.match.any ) - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.callCount(3) + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.callCount(3) }) it('is safe to reconcile the same service twice', async () => { - await ServicePlatformService.reconcileService(serviceName, task, transaction) - await ServicePlatformService.reconcileService(serviceName, task, transaction) + 
await ServicePlatformService.reconcileService(serviceName, task) + await ServicePlatformService.reconcileService(serviceName, task) - expect(K8sClient.patchConfigMap.callCount).to.be.at.least(4) + expect(K8sClient.patchConfigMap.callCount).to.equal(2) }) it('throws when LoadBalancer IP watch times out', async () => { - K8sClient.watchLoadBalancerIP.resolves(null) + ServicesService._syncK8sServiceResource.resolves(null) try { - await ServicePlatformService.reconcileService(serviceName, task, transaction) + await ServicePlatformService.reconcileService(serviceName, task) throw new Error('expected reconcile to fail') } catch (error) { expect(error.message).to.include('LoadBalancer IP not assigned') @@ -166,7 +174,7 @@ describe('Service platform service', () => { expect(ServiceManager.update).to.not.have.been.calledWith( { name: serviceName }, { provisioningStatus: 'ready', provisioningError: null }, - transaction + sinon.match.any ) }) }) @@ -191,6 +199,7 @@ describe('Service platform service', () => { } beforeEach(() => { + $sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn) => fn(transaction)) $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { if (key === 'app.uuid') { return 'controller-uuid-1' @@ -212,33 +221,35 @@ describe('Service platform service', () => { $sandbox.stub(K8sClient, 'patchConfigMap').resolves() $sandbox.stub(ServicesService, '_deleteK8sService').resolves() $sandbox.stub(ServicesService, 'handleServiceDistribution').resolves(['fog-a', 'fog-b']) - $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves({ id: 1 }) + $sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves({ id: 1 }) $sandbox.stub(ServicePlatformReconcileTaskManager, 'delete').resolves() $sandbox.stub(ServiceManager, 'findOneWithTags') $sandbox.stub(ServiceManager, 'update') }) it('uses spec_snapshot for hub teardown, fan-out, and destroys the task', async () => { - const result = await 
ServicePlatformService.reconcileService(serviceName, deleteTask, transaction) + const result = await ServicePlatformService.reconcileService(serviceName, deleteTask) expect(ServiceManager.findOneWithTags).to.not.have.been.called - expect(K8sClient.patchConfigMap).to.have.been.calledTwice + expect(K8sClient.getConfigMap).to.have.been.calledOnce + expect(K8sClient.patchConfigMap).to.have.been.calledOnce + const patchData = K8sClient.patchConfigMap.firstCall.args[1] + const routerConfig = JSON.parse(patchData.data['skrouterd.json']) + expect(routerConfig).to.eql([]) expect(ServicesService._deleteK8sService).to.have.been.calledWith(serviceName) expect(ServicesService.handleServiceDistribution).to.have.been.calledWith( ['site-a', 'site-b'], - transaction + sinon.match.any ) - expect(ServicePlatformReconcileTaskManager.delete).to.have.been.calledWith({ id: 99 }, transaction) + expect(ServicePlatformReconcileTaskManager.delete).to.have.been.calledWith({ id: 99 }, sinon.match.any) expect(ServiceManager.update).to.not.have.been.called expect(result.isDelete).to.equal(true) }) }) describe('.acquireHubLockWithTimeout()', () => { - let clock - beforeEach(() => { - clock = sinon.useFakeTimers() + $sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn) => fn(transaction)) $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { if (key === 'settings.hubRouterConfigLockTimeoutSeconds') { return 1 @@ -248,16 +259,11 @@ describe('Service platform service', () => { $sandbox.stub(HubRouterConfigLockManager, 'tryAcquire').resolves(false) }) - afterEach(() => { - clock.restore() - }) - - it('times out when hub lock is held by another controller', async () => { - const acquirePromise = ServicePlatformService.acquireHubLockWithTimeout('controller-uuid-1', transaction) - await clock.runAllAsync() + it('times out when hub lock is held by another controller', async function () { + this.timeout(5000) try { - await acquirePromise + await 
ServicePlatformService.acquireHubLockWithTimeout('controller-uuid-1') throw new Error('expected lock acquire to fail') } catch (error) { expect(error.message).to.include('Timed out waiting for hub router ConfigMap lock') @@ -270,15 +276,15 @@ describe('Service platform service', () => { describe('.fanOutFogReconcile()', () => { beforeEach(() => { $sandbox.stub(ServicesService, 'handleServiceDistribution').resolves(['fog-a', 'fog-b']) - $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves({ id: 1 }) + $sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves({ id: 1 }) }) it('enqueues fog platform reconcile tasks for distributed fogs', async () => { const fogUuids = await ServicePlatformService.fanOutFogReconcile(['site-a'], transaction) expect(fogUuids).to.eql(['fog-a', 'fog-b']) - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledTwice - expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledTwice + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledWith({ fogUuid: 'fog-a', reason: 'service-changed' }, transaction) diff --git a/test/src/services/services-connector-host.test.js b/test/src/services/services-connector-host.test.js index 2195bda1..d2d4478b 100644 --- a/test/src/services/services-connector-host.test.js +++ b/test/src/services/services-connector-host.test.js @@ -401,122 +401,6 @@ describe('services-service connector host', () => { }) }) - describe('_addTcpConnector()', () => { - def('fogUuid', () => 'fog-edge') - def('serviceConfig', () => ({ - type: 'microservice', - name: 'bridge-svc', - resource: 'ms-1', - targetPort: 8080 - })) - def('capture', () => ({ config: null })) - - beforeEach(() => { - delete process.env.CONTROL_PLANE - $sandbox.stub(RouterManager, 'findOne').callsFake((where) => { - if (where.iofogUuid === $fogUuid) { - 
return Promise.resolve({ isEdge: true }) - } - return Promise.resolve(null) - }) - $sandbox.stub(FogManager, 'findOne').callsFake((where) => { - if (where.uuid === $fogUuid) { - return Promise.resolve({ uuid: $fogUuid, name: 'edge-fog' }) - } - return Promise.resolve(null) - }) - $sandbox.stub(ApplicationManager, 'findOne').callsFake((where) => { - if (where.id === 42) { - return Promise.resolve({ id: 42, name: 'myapp' }) - } - return Promise.resolve({ id: 10, name: 'system-edge-fog', isSystem: true }) - }) - $sandbox.stub(MicroserviceManager, 'findOne').callsFake((where) => { - if (where.uuid === 'ms-1') { - return Promise.resolve({ - uuid: 'ms-1', - name: 'worker', - applicationId: 42, - hostNetworkMode: false, - iofogUuid: $fogUuid - }) - } - if (where.name === 'router' && where.applicationId === 10) { - return Promise.resolve({ uuid: 'router-ms-1', config: '{}' }) - } - return Promise.resolve(null) - }) - $sandbox.stub(MicroserviceManager, 'update').callsFake((where, data) => { - if (data.config) { - $capture.config = JSON.parse(data.config) - } - return Promise.resolve() - }) - $sandbox.stub(ChangeTrackingService, 'update').resolves() - }) - - it('persists connector on the microservice fog router without siteId', async () => { - await ServicesService._addTcpConnector($serviceConfig, $transaction) - - const connector = $capture.config.bridges.tcpConnectors['bridge-svc-connector'] - expect(connector).to.include({ - name: 'bridge-svc-connector', - host: 'myapp.worker', - port: '8080', - address: 'bridge-svc', - processId: 'ms-1' - }) - expect(connector).to.not.have.property('siteId') - }) - }) - - describe('_addTcpConnector() default router target', () => { - def('capture', () => ({ config: null })) - - beforeEach(() => { - delete process.env.CONTROL_PLANE - $sandbox.stub(RouterManager, 'findOne').callsFake((where) => { - if (where.isDefault === true) { - return Promise.resolve({ iofogUuid: 'default-fog' }) - } - return Promise.resolve(null) - }) - 
$sandbox.stub(FogManager, 'findOne').callsFake((where) => { - if (where.uuid === 'default-fog') { - return Promise.resolve({ uuid: 'default-fog', name: 'default' }) - } - return Promise.resolve(null) - }) - $sandbox.stub(ApplicationManager, 'findOne').resolves({ id: 99, name: 'system-default', isSystem: true }) - $sandbox.stub(MicroserviceManager, 'findOne').callsFake((where) => { - if (where.name === 'router') { - return Promise.resolve({ uuid: 'router-default', config: '{}' }) - } - return Promise.resolve(null) - }) - $sandbox.stub(MicroserviceManager, 'update').callsFake((where, data) => { - if (data.config) { - $capture.config = JSON.parse(data.config) - } - return Promise.resolve() - }) - $sandbox.stub(ChangeTrackingService, 'update').resolves() - }) - - it('persists connector on default router for external services without siteId', async () => { - await ServicesService._addTcpConnector({ - type: 'external', - name: 'ext-svc', - resource: 'example.com', - targetPort: 443 - }, $transaction) - - const connector = $capture.config.bridges.tcpConnectors['ext-svc-connector'] - expect(connector.host).to.equal('example.com') - expect(connector).to.not.have.property('siteId') - }) - }) - describe('iofog _buildTcpListenerForFog()', () => { it('builds listener without siteId', () => { const listener = ioFogService._buildTcpListenerForFog({ diff --git a/test/src/services/services-service.test.js b/test/src/services/services-service.test.js index 6785ae45..89f9b7d8 100644 --- a/test/src/services/services-service.test.js +++ b/test/src/services/services-service.test.js @@ -5,11 +5,12 @@ const ServiceController = require('../../../src/controllers/service-controller') const YamlParserService = require('../../../src/services/yaml-parser-service') const ServicesService = require('../../../src/services/services-service') const ServiceManager = require('../../../src/data/managers/service-manager') -const ServicePlatformReconcileTaskManager = 
require('../../../src/data/managers/service-platform-reconcile-task-manager') +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') const RouterManager = require('../../../src/data/managers/router-manager') const TagsManager = require('../../../src/data/managers/tags-manager') const Validator = require('../../../src/schemas') const Errors = require('../../../src/helpers/errors') +const K8sClient = require('../../../src/utils/k8s-client') describe('services-service platform reconcile enqueue', () => { def('sandbox', () => sinon.createSandbox()) @@ -41,12 +42,28 @@ describe('services-service platform reconcile enqueue', () => { return service } + function buildSequelizeLikeService (fields = {}) { + const data = buildServiceModel(fields) + return Object.create({ + get name () { return data.name }, + get type () { return data.type }, + get resource () { return data.resource }, + get defaultBridge () { return data.defaultBridge }, + get bridgePort () { return data.bridgePort }, + get targetPort () { return data.targetPort }, + get servicePort () { return data.servicePort }, + get k8sType () { return data.k8sType }, + get serviceEndpoint () { return data.serviceEndpoint }, + get tags () { return data.tags } + }) + } + function stubCreateDeps () { delete process.env.CONTROL_PLANE $sandbox.stub(Validator, 'validate').resolves(true) $sandbox.stub(ServiceManager, 'findAll').resolves([]) $sandbox.stub(ServiceManager, 'create').callsFake((data) => Promise.resolve(buildServiceModel(data))) - $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueServicePlatform').resolves() $sandbox.stub(RouterManager, 'findOne').resolves({ isDefault: true, host: 'hub.example.com', @@ -79,7 +96,7 @@ describe('services-service platform reconcile enqueue', () => { expect(createPayload.provisioningStatus).to.equal('pending') 
expect(createPayload.provisioningError).to.be.null - expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ serviceName: 'api-gateway', reason: 'spec-changed', specSnapshot: { @@ -104,14 +121,10 @@ describe('services-service platform reconcile enqueue', () => { }) it('does not run hub provisioning on the synchronous path', async () => { - $sandbox.stub(ServicesService, '_addTcpConnector').resolves() - $sandbox.stub(ServicesService, '_addTcpListener').resolves() $sandbox.stub(ServicesService, '_createK8sService').resolves() await $subject - expect(ServicesService._addTcpConnector).to.not.have.been.called - expect(ServicesService._addTcpListener).to.not.have.been.called expect(ServicesService._createK8sService).to.not.have.been.called }) }) @@ -135,7 +148,7 @@ describe('services-service platform reconcile enqueue', () => { $sandbox.stub(ServiceManager, 'update').callsFake((where, data) => Promise.resolve(buildServiceModel({ ...existingService, ...data })) ) - $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueServicePlatform').resolves() $sandbox.stub(RouterManager, 'findOne').resolves({ isDefault: true, host: 'hub.example.com', @@ -148,7 +161,7 @@ describe('services-service platform reconcile enqueue', () => { it('enqueues reconcile with old and new tags in snapshot', async () => { await $subject - expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ serviceName: 'api-gateway', reason: 'spec-changed', specSnapshot: { @@ -183,19 +196,17 @@ describe('services-service platform reconcile enqueue', () => { beforeEach(() => { $sandbox.stub(ServiceManager, 'findOneWithTags').resolves(existingService) 
$sandbox.stub(ServiceManager, 'delete').resolves() - $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() - $sandbox.stub(ServicesService, '_deleteTcpConnector').resolves() - $sandbox.stub(ServicesService, '_deleteTcpListener').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueServicePlatform').resolves() $sandbox.stub(ServicesService, '_deleteK8sService').resolves() }) it('captures spec snapshot and enqueues delete reconcile before DB delete', async () => { await $subject - expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledBefore( + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledBefore( ServiceManager.delete ) - expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ serviceName: 'api-gateway', reason: 'delete', specSnapshot: { @@ -217,10 +228,42 @@ describe('services-service platform reconcile enqueue', () => { it('does not run hub teardown on the synchronous path', async () => { await $subject - expect(ServicesService._deleteTcpConnector).to.not.have.been.called - expect(ServicesService._deleteTcpListener).to.not.have.been.called expect(ServicesService._deleteK8sService).to.not.have.been.called }) + + it('captures full spec snapshot from Sequelize model instances', async () => { + ServiceManager.findOneWithTags.resolves(buildSequelizeLikeService({ + name: 'snapshot-service', + type: 'agent', + resource: 'fog-uuid-1', + defaultBridge: 'fog-uuid-1', + bridgePort: 9200, + targetPort: 8090, + servicePort: 9200, + k8sType: null, + serviceEndpoint: 'edge.example.com', + tags: [{ value: 'site-a' }] + })) + + await ServicesService.deleteServiceEndpoint('snapshot-service', $transaction) + + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ + serviceName: 'snapshot-service', + 
reason: 'delete', + specSnapshot: { + name: 'snapshot-service', + type: 'agent', + resource: 'fog-uuid-1', + defaultBridge: 'fog-uuid-1', + bridgePort: 9200, + targetPort: 8090, + servicePort: 9200, + k8sType: null, + serviceEndpoint: 'edge.example.com', + tags: ['site-a'] + } + }, $transaction) + }) }) describe('YAML endpoints', () => { @@ -261,7 +304,7 @@ spec: await ServiceController.createServiceYAMLEndpoint(req) expect(YamlParserService.parseServiceFile).to.have.been.calledOnceWith(serviceYaml) - expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ serviceName: 'api-gateway', reason: 'spec-changed', specSnapshot: sinon.match({ @@ -284,7 +327,7 @@ spec: $sandbox.stub(ServiceManager, 'update').callsFake((where, data) => Promise.resolve(buildServiceModel({ ...existingService, ...data })) ) - $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueServicePlatform').resolves() $sandbox.stub(RouterManager, 'findOne').resolves({ isDefault: true, host: 'hub.example.com', @@ -313,7 +356,7 @@ spec: isUpdate: true, serviceName: 'api-gateway' }) - expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ serviceName: 'api-gateway', reason: 'spec-changed', specSnapshot: sinon.match({ @@ -335,7 +378,7 @@ spec: tags: [{ value: 'site-a' }] })) $sandbox.stub(ServiceManager, 'update').resolves() - $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueServicePlatform').resolves() }) it('resets failed provisioning and enqueues manual retry', async () => { @@ -346,18 +389,53 @@ spec: { provisioningStatus: 'pending', provisioningError: 
null }, $transaction ) - expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ serviceName: 'api-gateway', reason: 'manual-retry', - specSnapshot: sinon.match({ + specSnapshot: { name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 8080, + servicePort: 9100, + k8sType: 'LoadBalancer', + serviceEndpoint: 'hub.example.com', tags: ['site-a'] - }) + } }, $transaction) expect(result.provisioningStatus).to.equal('pending') expect(result.provisioningError).to.be.null }) + it('captures full spec snapshot from Sequelize model instances', async () => { + ServiceManager.findOneWithTags.resolves(buildSequelizeLikeService({ + provisioningStatus: 'failed', + provisioningError: 'hub lock timeout', + tags: [{ value: 'site-a' }] + })) + + await ServicesService.reconcileServiceEndpoint('api-gateway', $transaction) + + expect(ReconcileOutboxManager.enqueueServicePlatform).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'manual-retry', + specSnapshot: { + name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 8080, + servicePort: 9100, + k8sType: 'LoadBalancer', + serviceEndpoint: 'hub.example.com', + tags: ['site-a'] + } + }, $transaction) + }) + context('when service is missing', () => { beforeEach(() => { ServiceManager.findOneWithTags.resolves(null) @@ -367,4 +445,37 @@ spec: expect($subject).to.be.rejectedWith(Errors.NotFoundError)) }) }) + + describe('._syncK8sServiceResource()', () => { + const serviceConfig = { + name: 'snapshot-service', + k8sType: 'ClusterIP', + bridgePort: 10024, + servicePort: 10024, + tags: ['site-a'] + } + + beforeEach(() => { + $sandbox.stub(K8sClient, 'getService').resolves(null) + $sandbox.stub(K8sClient, 'createService').resolves({ metadata: { 
name: 'snapshot-service' } }) + $sandbox.stub(K8sClient, 'updateService').resolves({ metadata: { name: 'snapshot-service' } }) + }) + + it('creates the K8s service when it does not exist', async () => { + await ServicesService._syncK8sServiceResource(serviceConfig) + + expect(K8sClient.getService).to.have.been.calledWith('snapshot-service', { ignoreNotFound: true }) + expect(K8sClient.createService).to.have.been.calledOnce + expect(K8sClient.updateService).to.not.have.been.called + }) + + it('updates the K8s service when it already exists', async () => { + K8sClient.getService.resolves({ metadata: { name: 'snapshot-service' } }) + + await ServicesService._syncK8sServiceResource(serviceConfig) + + expect(K8sClient.createService).to.not.have.been.called + expect(K8sClient.updateService).to.have.been.calledOnceWith('snapshot-service', sinon.match.object) + }) + }) }) diff --git a/test/src/services/transaction-safety-19h.test.js b/test/src/services/transaction-safety-19h.test.js new file mode 100644 index 00000000..da5d3e6a --- /dev/null +++ b/test/src/services/transaction-safety-19h.test.js @@ -0,0 +1,189 @@ +'use strict' + +const { expect } = require('chai') +const sinon = require('sinon') +const Transaction = require('sequelize/lib/transaction') + +const SecretService = require('../../../src/services/secret-service') +const CertificateService = require('../../../src/services/certificate-service') +const Errors = require('../../../src/helpers/errors') + +describe('Plan 19-H transaction safety fixes', () => { + def('sandbox', () => sinon.createSandbox()) + def('parentTransaction', () => Object.create(Transaction.prototype)) + + afterEach(() => { + $sandbox.restore() + }) + + describe('certificate-service transaction propagation', () => { + it('passes parent transaction to SecretService.getSecretEndpoint from getCAEndpoint', async () => { + const CertificateManager = require('../../../src/data/managers/certificate-manager') + $sandbox.stub(CertificateManager, 
'findCertificateByName').resolves({ + name: 'router-local-ca', + subject: 'router-local-ca', + isCA: true, + validFrom: new Date(), + validTo: new Date(), + serialNumber: '1', + isExpired: () => false + }) + $sandbox.stub(SecretService, 'getSecretEndpoint').resolves({ + type: 'tls', + data: { + 'tls.crt': Buffer.from('cert').toString('base64'), + 'tls.key': Buffer.from('key').toString('base64') + } + }) + + await CertificateService.getCAEndpoint('router-local-ca', $parentTransaction) + + expect(SecretService.getSecretEndpoint).to.have.been.calledOnceWith( + 'router-local-ca', + $parentTransaction + ) + }) + + it('passes parent transaction through createCAEndpoint SecretService calls', async () => { + $sandbox.stub(SecretService, 'getSecretEndpoint').rejects(new Errors.NotFoundError('missing')) + $sandbox.stub(require('../../../src/utils/cert'), 'generateSelfSignedCA').resolves({ + cert: '-----BEGIN CERTIFICATE-----\ncert\n-----END CERTIFICATE-----', + key: '-----BEGIN PRIVATE KEY-----\nkey\n-----END PRIVATE KEY-----' + }) + $sandbox.stub(require('../../../src/utils/cert'), 'storeCA').resolves() + $sandbox.stub(SecretService, 'createSecretEndpoint').resolves({ id: 1, name: 'test-ca' }) + $sandbox.stub(require('../../../src/data/managers/secret-manager'), 'findOne').resolves({ id: 1 }) + $sandbox.stub(require('../../../src/data/managers/certificate-manager'), 'createCertificateRecord').resolves() + + await CertificateService.createCAEndpoint({ + name: 'test-ca', + subject: 'test-ca', + expiration: 60, + type: 'self-signed' + }, $parentTransaction) + + expect(SecretService.getSecretEndpoint).to.have.been.calledWith('test-ca', $parentTransaction) + }) + + it('passes parent transaction to loadCA from createCAEndpoint direct type', async () => { + const certUtil = require('../../../src/utils/cert') + const forge = require('node-forge') + + const keys = forge.pki.rsa.generateKeyPair(2048) + const caCert = forge.pki.createCertificate() + caCert.publicKey = keys.publicKey + 
caCert.serialNumber = '01' + caCert.validity.notBefore = new Date() + caCert.validity.notAfter = new Date(Date.now() + 86400000) + caCert.setSubject([{ name: 'commonName', value: 'router-site-ca' }]) + caCert.setIssuer([{ name: 'commonName', value: 'router-site-ca' }]) + caCert.sign(keys.privateKey, forge.md.sha256.create()) + + const caCertPem = forge.pki.certificateToPem(caCert) + const caKeyPem = forge.pki.privateKeyToPem(keys.privateKey) + + $sandbox.stub(SecretService, 'getSecretEndpoint').resolves({ type: 'tls', data: {} }) + $sandbox.stub(require('../../../src/data/managers/certificate-manager'), 'findCertificateByName').resolves(null) + $sandbox.stub(certUtil, 'loadCA').resolves({ cert: caCertPem, key: caKeyPem }) + $sandbox.stub(require('../../../src/data/managers/secret-manager'), 'findOne').resolves({ id: 1 }) + $sandbox.stub(require('../../../src/data/managers/certificate-manager'), 'createCertificateRecord').resolves() + + await CertificateService.createCAEndpoint({ + name: 'router-site-ca', + secretName: 'router-site-ca', + type: 'direct' + }, $parentTransaction) + + expect(certUtil.loadCA).to.have.been.calledOnceWith('router-site-ca', $parentTransaction) + }) + }) + + describe('cert.js transaction propagation', () => { + it('loadCA uses parent transaction without enqueueing runInTransaction', async () => { + const transactionRunner = require('../../../src/helpers/transaction-runner') + const SecretManager = require('../../../src/data/managers/secret-manager') + const { loadCA } = require('../../../src/utils/cert') + + $sandbox.stub(transactionRunner, 'runInTransaction').throws(new Error('should not enqueue')) + $sandbox.stub(SecretManager, 'getSecret').resolves({ + type: 'tls', + data: { + 'tls.crt': Buffer.from('cert').toString('base64'), + 'tls.key': Buffer.from('key').toString('base64') + } + }) + + const result = await loadCA('router-site-ca', $parentTransaction) + + expect(SecretManager.getSecret).to.have.been.calledOnceWith('router-site-ca', 
$parentTransaction) + expect(transactionRunner.runInTransaction).to.not.have.been.called + expect(result).to.include.keys('cert', 'key') + }) + + it('getCAFromK8sSecret uses parent transaction without enqueueing runInTransaction', async () => { + const transactionRunner = require('../../../src/helpers/transaction-runner') + const SecretManager = require('../../../src/data/managers/secret-manager') + const k8sClient = require('../../../src/utils/k8s-client') + const { getCAFromK8sSecret } = require('../../../src/utils/cert') + + $sandbox.stub(transactionRunner, 'runInTransaction').throws(new Error('should not enqueue')) + $sandbox.stub(k8sClient, 'getSecret').resolves({ + data: { + 'tls.crt': Buffer.from('cert').toString('base64'), + 'tls.key': Buffer.from('key').toString('base64') + } + }) + $sandbox.stub(SecretManager, 'findOne').resolves({ id: 1, name: 'k8s-ca' }) + + await getCAFromK8sSecret('k8s-ca', $parentTransaction) + + expect(SecretManager.findOne).to.have.been.calledOnceWith({ name: 'k8s-ca' }, $parentTransaction) + expect(transactionRunner.runInTransaction).to.not.have.been.called + }) + + it('generateCertificate passes transaction through getCAFromInput loadCA path', async () => { + const transactionRunner = require('../../../src/helpers/transaction-runner') + const SecretManager = require('../../../src/data/managers/secret-manager') + const certUtil = require('../../../src/utils/cert') + const forge = require('node-forge') + + const keys = forge.pki.rsa.generateKeyPair(2048) + const caCert = forge.pki.createCertificate() + caCert.publicKey = keys.publicKey + caCert.serialNumber = '01' + caCert.validity.notBefore = new Date() + caCert.validity.notAfter = new Date(Date.now() + 86400000) + caCert.setSubject([{ name: 'commonName', value: 'router-site-ca' }]) + caCert.setIssuer([{ name: 'commonName', value: 'router-site-ca' }]) + caCert.sign(keys.privateKey, forge.md.sha256.create()) + + const caCertPem = forge.pki.certificateToPem(caCert) + const caKeyPem 
= forge.pki.privateKeyToPem(keys.privateKey) + + $sandbox.stub(transactionRunner, 'runInTransaction').throws(new Error('should not enqueue')) + $sandbox.stub(SecretManager, 'getSecret').resolves({ + type: 'tls', + data: { + 'tls.crt': Buffer.from(caCertPem).toString('base64'), + 'tls.key': Buffer.from(caKeyPem).toString('base64') + } + }) + $sandbox.stub(SecretService, 'createSecretEndpoint').resolves() + + await certUtil.generateCertificate({ + name: 'site-server', + subject: '/CN=site-server', + hosts: '127.0.0.1', + ca: { type: 'direct', secretName: 'router-site-ca' }, + transaction: $parentTransaction + }) + + expect(SecretManager.getSecret).to.have.been.calledOnceWith('router-site-ca', $parentTransaction) + expect(transactionRunner.runInTransaction).to.not.have.been.called + expect(SecretService.createSecretEndpoint).to.have.been.calledWith( + sinon.match.has('name', 'site-server'), + $parentTransaction + ) + }) + }) +}) diff --git a/test/src/services/transaction-safety-vault.test.js b/test/src/services/transaction-safety-vault.test.js new file mode 100644 index 00000000..50ac2d89 --- /dev/null +++ b/test/src/services/transaction-safety-vault.test.js @@ -0,0 +1,119 @@ +'use strict' + +const { expect } = require('chai') +const sinon = require('sinon') +const Transaction = require('sequelize/lib/transaction') + +const SecretService = require('../../../src/services/secret-service') +const RegistryService = require('../../../src/services/registry-service') +const ConfigMapManager = require('../../../src/data/managers/config-map-manager') +const RegistryManager = require('../../../src/data/managers/registry-manager') +const BaseManager = require('../../../src/data/managers/base-manager') +const SecretHelper = require('../../../src/helpers/secret-helper') +const vaultManager = require('../../../src/vault/vault-manager') + +describe('Plan 19-H vault transaction safety (R-09–R-11)', () => { + def('sandbox', () => sinon.createSandbox()) + def('parentTransaction', () => 
{ + const tx = Object.create(Transaction.prototype) + tx.afterCommit = sinon.spy((fn) => fn()) + return tx + }) + + afterEach(() => { + $sandbox.restore() + }) + + describe('secret-service deleteSecretEndpoint', () => { + beforeEach(() => { + $sandbox.stub(vaultManager, 'isEnabled').returns(true) + $sandbox.stub(SecretHelper, 'deleteSecret').resolves() + $sandbox.stub(require('../../../src/data/managers/secret-manager'), 'findOne').resolves({ + name: 'test-secret', + type: 'Opaque' + }) + $sandbox.stub(require('../../../src/data/managers/secret-manager'), 'deleteSecret').resolves() + $sandbox.stub(require('../../../src/data/managers/volume-mounting-manager'), 'findAll').resolves([]) + }) + + it('schedules vault delete after commit instead of calling SecretHelper inside the tx body', async () => { + const deferredTx = Object.create(Transaction.prototype) + let deferredFn + deferredTx.afterCommit = sinon.spy((fn) => { + deferredFn = fn + }) + + await SecretService.deleteSecretEndpoint('test-secret', deferredTx) + + expect(deferredTx.afterCommit).to.have.been.calledOnce + expect(SecretHelper.deleteSecret).to.not.have.been.called + + await deferredFn() + + expect(SecretHelper.deleteSecret).to.have.been.calledOnceWith('test-secret', 'Opaque') + }) + }) + + describe('config-map-manager deleteConfigMap', () => { + beforeEach(() => { + $sandbox.stub(vaultManager, 'isEnabled').returns(true) + $sandbox.stub(ConfigMapManager, 'findOne').resolves({ + name: 'cfg', + useVault: true + }) + $sandbox.stub(ConfigMapManager, 'delete').resolves(1) + $sandbox.stub(SecretHelper, 'deleteSecret').resolves() + }) + + it('deletes DB row in tx and schedules vault cleanup after commit', async () => { + await ConfigMapManager.deleteConfigMap('cfg', $parentTransaction) + + expect(ConfigMapManager.delete).to.have.been.calledBefore(SecretHelper.deleteSecret) + expect(SecretHelper.deleteSecret).to.have.been.calledOnceWith('cfg', 'configmap') + }) + }) + + describe('registry-service 
createRegistry', () => { + beforeEach(() => { + $sandbox.stub(vaultManager, 'isEnabled').returns(true) + $sandbox.stub(require('../../../src/schemas'), 'validate').resolves(true) + $sandbox.stub(require('../../../src/helpers/app-helper'), 'deleteUndefinedFields').callsFake((v) => v) + $sandbox.stub(RegistryManager, 'create').resolves({ id: 16 }) + $sandbox.stub(SecretHelper, 'encryptSecretInternal').resolves('internal-encrypted') + $sandbox.stub(SecretHelper, 'encryptSecret').resolves('vault-ref') + $sandbox.stub(RegistryManager, 'update').resolves() + $sandbox.stub(require('../../../src/data/managers/iofog-manager'), 'findAll').resolves([]) + $sandbox.stub(require('../../../src/services/change-tracking-service'), 'update').resolves() + $sandbox.stub(require('../../../src/helpers/transaction-runner'), 'runInTransaction').resolves() + }) + + it('stores internal encryption in tx and promotes to vault after commit', async () => { + await RegistryService.createRegistry({ + url: 'https://registry.example.com', + username: 'user', + password: 'plain-password', + isPublic: false, + email: 'user@example.com' + }, $parentTransaction) + + expect(SecretHelper.encryptSecretInternal).to.have.been.calledOnce + expect(SecretHelper.encryptSecret).to.not.have.been.called + }) + }) + + describe('registry-manager delete', () => { + beforeEach(() => { + $sandbox.stub(vaultManager, 'isEnabled').returns(true) + $sandbox.stub(RegistryManager, 'findOne').resolves({ id: 16 }) + $sandbox.stub(BaseManager.prototype, 'delete').resolves(1) + $sandbox.stub(SecretHelper, 'deleteSecret').resolves() + }) + + it('deletes DB row first and schedules vault cleanup after commit', async () => { + await RegistryManager.delete({ id: 16 }, $parentTransaction) + + expect(BaseManager.prototype.delete).to.have.been.calledBefore(SecretHelper.deleteSecret) + expect(SecretHelper.deleteSecret).to.have.been.calledOnceWith('registry-16', 'registry') + }) + }) +}) diff --git 
a/test/src/websocket/ws-cross-replica-nats.test.js b/test/src/websocket/ws-cross-replica-nats.test.js index 277177ac..fd313354 100644 --- a/test/src/websocket/ws-cross-replica-nats.test.js +++ b/test/src/websocket/ws-cross-replica-nats.test.js @@ -81,12 +81,12 @@ describe('WebSocket exec/log — cross-replica mock NATS', () => { userWs, transaction ) - await wsServer.setupExecMessageForwarding(sessionId, transaction) + await wsServer.setupExecMessageForwarding(sessionId) await delay(50) const session = wsServer.execSessionManager.getExecSession(sessionId) session.agent = agentWs - await wsServer.setupExecMessageForwarding(sessionId, transaction) + await wsServer.setupExecMessageForwarding(sessionId) await delay(50) expect(mockRelay.shouldUseRelay(sessionId)).to.equal(true) @@ -164,6 +164,7 @@ describe('WebSocket exec/log — cross-replica mock NATS', () => { false, transaction ) + await delay(50) expect(mockRelay.shouldUseRelay($ids.sessionId)).to.equal(true) diff --git a/test/src/websocket/ws-cross-replica.test.js b/test/src/websocket/ws-cross-replica.test.js index 0712ffac..5042c9fc 100644 --- a/test/src/websocket/ws-cross-replica.test.js +++ b/test/src/websocket/ws-cross-replica.test.js @@ -79,12 +79,12 @@ describe('WebSocket exec/log — cross-replica mock AMQP', () => { userWs, transaction ) - await wsServer.setupExecMessageForwarding(sessionId, transaction) + await wsServer.setupExecMessageForwarding(sessionId) await delay(50) const session = wsServer.execSessionManager.getExecSession(sessionId) session.agent = agentWs - await wsServer.setupExecMessageForwarding(sessionId, transaction) + await wsServer.setupExecMessageForwarding(sessionId) await delay(50) expect(mockRelay.shouldUseRelay(sessionId)).to.equal(true) @@ -164,6 +164,7 @@ describe('WebSocket exec/log — cross-replica mock AMQP', () => { false, transaction ) + await delay(50) expect(mockRelay.shouldUseRelay($ids.sessionId)).to.equal(true) 
expect(wsServer.execSessionManager.getExecSession($ids.sessionId).agent).to.equal(null) diff --git a/test/src/websocket/ws-exec-activation-failfast.test.js b/test/src/websocket/ws-exec-activation-failfast.test.js index 4052a708..2405fb01 100644 --- a/test/src/websocket/ws-exec-activation-failfast.test.js +++ b/test/src/websocket/ws-exec-activation-failfast.test.js @@ -40,7 +40,7 @@ describe('WebSocket exec activation fail-fast', () => { $sandbox.stub(logger, 'warn') $sandbox.stub(logger, 'debug') $sandbox.stub(wsServer, 'sendMessageToAgent').resolves(false) - $sandbox.stub(wsServer, 'cleanupExecSession').resolves() + $sandbox.stub(wsServer, '_cleanupExecSessionInTransaction').resolves() }) afterEach(() => { @@ -62,7 +62,7 @@ describe('WebSocket exec activation fail-fast', () => { transaction ) - await wsServer.setupExecMessageForwarding(sessionId, transaction) + await wsServer.setupExecMessageForwarding(sessionId) await delay(20) const setupCompleteCalls = logger.info.getCalls().filter((call) => { diff --git a/test/src/websocket/ws-lifecycle.test.js b/test/src/websocket/ws-lifecycle.test.js index 2d598e0e..41cec233 100644 --- a/test/src/websocket/ws-lifecycle.test.js +++ b/test/src/websocket/ws-lifecycle.test.js @@ -9,6 +9,7 @@ const FogManager = require('../../../src/data/managers/iofog-manager') const WebSocketServerClass = require('../../../src/websocket/server') const MicroserviceExecSessionManager = require('../../../src/data/managers/microservice-exec-session-manager') const MicroserviceLogStatusManager = require('../../../src/data/managers/microservice-log-status-manager') +const AppHelper = require('../../../src/helpers/app-helper') const MicroserviceManager = require('../../../src/data/managers/microservice-manager') const { createMockWebSocket, @@ -258,4 +259,49 @@ describe('WebSocket session lifecycle', () => { ) }) }) + + describe('relay setup deferred until transaction commits', () => { + let wsServer + + beforeEach(() => { + 
resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + }) + + afterEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + it('does not await relay setup before handleUserLogsConnection returns', async () => { + let setupStarted = false + $sandbox.stub(wsServer, 'validateUserLogsConnection').resolves({ success: true }) + $sandbox.stub(wsServer, 'countLogSessionsInDb').resolves(0) + $sandbox.stub(wsServer, 'isValidISO8601').returns(true) + $sandbox.stub(wsServer, 'setupLogMessageForwarding').callsFake(async () => { + setupStarted = true + }) + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid, uuid: $ids.microserviceUuid }) + $sandbox.stub(FogManager, 'findOne').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + $sandbox.stub(MicroserviceLogStatusManager, 'create').resolves() + $sandbox.stub(AppHelper, 'generateUUID').returns($ids.sessionId) + + const ws = createMockWebSocket() + const req = createMockRequest(`/api/v3/microservices/${$ids.microserviceUuid}/logs?tail=100`) + + await wsServer.handleUserLogsConnection( + ws, + req, + 'Bearer token', + $ids.microserviceUuid, + null, + false, + $transaction + ) + + expect(setupStarted).to.equal(false) + await delay(10) + expect(setupStarted).to.equal(true) + }) + }) }) diff --git a/test/support/first-fog-sqlite-harness.js b/test/support/first-fog-sqlite-harness.js new file mode 100644 index 00000000..52c8df44 --- /dev/null +++ b/test/support/first-fog-sqlite-harness.js @@ -0,0 +1,147 @@ +'use strict' + +const fs = require('fs') +const path = require('path') +const express = require('express') + +const BOOTSTRAP_PASSWORD = 'ChangeMeSecure123!' 
+ +const ENV_KEYS = [ + 'DB_PROVIDER', + 'DB_NAME', + 'AUTH_MODE', + 'CONTROLLER_PUBLIC_URL', + 'AUTH_INSECURE_ALLOW_HTTP', + 'OIDC_BOOTSTRAP_ADMIN_USERNAME', + 'OIDC_BOOTSTRAP_ADMIN_PASSWORD', + 'CONTROL_PLANE', + 'NODE_ENV' +] + +function snapshotEnv (keys) { + return Object.fromEntries(keys.map((key) => [key, process.env[key]])) +} + +function restoreEnv (snapshot) { + for (const key of ENV_KEYS) { + if (snapshot[key] === undefined) { + delete process.env[key] + } else { + process.env[key] = snapshot[key] + } + } +} + +function applyEnv (values) { + for (const [key, value] of Object.entries(values)) { + process.env[key] = value + } +} + +function sqliteStoragePath (dbName) { + return path.resolve(__dirname, '../../src/data/sqlite_files', dbName) +} + +function cleanupSqliteFiles (dbName) { + const base = sqliteStoragePath(dbName) + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(base + suffix) + } catch (_) { /* ignore */ } + } +} + +function installBusyRetryCounter (onRetry) { + const dbMetrics = require('../../src/helpers/db-metrics') + const original = dbMetrics.recordBusyRetry + dbMetrics.recordBusyRetry = (...args) => { + onRetry() + return original(...args) + } + return () => { + dbMetrics.recordBusyRetry = original + } +} + +async function sleep (ms) { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +async function driveReconcileUntilReady (fogUuid, { + timeoutMs = 10000, + drainOnce, + processNextFogTask, + processNextNatsTask = async () => {}, + getStatus +}) { + const startedAt = Date.now() + while (Date.now() - startedAt < timeoutMs) { + await drainOnce() + await processNextFogTask() + await processNextNatsTask() + + const status = await getStatus(fogUuid) + if (status && status.phase === 'Ready') { + return status + } + + await sleep(50) + } + + const lastStatus = await getStatus(fogUuid) + throw new Error( + `Timed out waiting for Ready (last phase: ${lastStatus && lastStatus.phase}, ` + + `lastError: 
${lastStatus && lastStatus.lastError})` + ) +} + +async function createFirstFogSqliteHarness () { + const envSnapshot = snapshotEnv(ENV_KEYS) + const dbName = `first-fog-int-${Date.now()}-${Math.random().toString(36).slice(2)}.sqlite` + + applyEnv({ + DB_PROVIDER: 'sqlite', + DB_NAME: dbName, + AUTH_MODE: 'embedded', + CONTROLLER_PUBLIC_URL: 'http://controller.test', + AUTH_INSECURE_ALLOW_HTTP: 'true', + OIDC_BOOTSTRAP_ADMIN_USERNAME: 'admin', + OIDC_BOOTSTRAP_ADMIN_PASSWORD: BOOTSTRAP_PASSWORD, + NODE_ENV: 'test' + }) + delete process.env.CONTROL_PLANE + + const { _resetQueueForTests } = require('../../src/helpers/transaction-runner') + _resetQueueForTests() + + const { initialize } = require('../../src/init') + await initialize() + + const { runBootstrap } = require('../../src/services/auth-bootstrap-service') + await runBootstrap() + + const db = require('../../src/data/models') + const { initEmbeddedIssuer, resetEmbeddedIssuerForTests } = require('../../src/config/embedded-oidc') + const { resetSigningMaterialCacheForTests } = require('../../src/config/auth-jwks') + await initEmbeddedIssuer(express(), { db }) + + return { + bootstrapPassword: BOOTSTRAP_PASSWORD, + dbName, + async teardown () { + await db.sequelize.close() + _resetQueueForTests() + resetEmbeddedIssuerForTests() + resetSigningMaterialCacheForTests() + cleanupSqliteFiles(dbName) + restoreEnv(envSnapshot) + } + } +} + +module.exports = { + BOOTSTRAP_PASSWORD, + createFirstFogSqliteHarness, + driveReconcileUntilReady, + installBusyRetryCounter +} From 9dd3ae45512f3719edcf4bae1aedb0a2ee0aacd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 23:11:02 +0300 Subject: [PATCH 11/32] bump console ui version 1.0.5 --- .env.example | 2 +- .github/actions/set-build-env/action.yml | 2 +- Dockerfile | 4 ++-- Makefile | 4 ++-- docs/swagger.yaml | 2 +- package.json | 6 +++--- scripts/build-console-dev.js | 2 +- test/src/helpers/transaction-grep-gates.test.js | 2 +- 8 files 
changed, 12 insertions(+), 12 deletions(-) diff --git a/.env.example b/.env.example index 06b463aa..e99bc01e 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ NODE_ENV=development # EdgeOps Console static embed (npm run build:console → dev/console/build) EDGEOPS_CONSOLE_PATH=dev/console/build # must be absolute path -EDGEOPS_CONSOLE_VERSION=v1.0.4 +EDGEOPS_CONSOLE_VERSION=v1.0.5 # EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console # EDGEOPS_CONSOLE_FLAVOR=datasance diff --git a/.github/actions/set-build-env/action.yml b/.github/actions/set-build-env/action.yml index f1ffc0e8..13d66dbd 100644 --- a/.github/actions/set-build-env/action.yml +++ b/.github/actions/set-build-env/action.yml @@ -8,7 +8,7 @@ runs: shell: bash run: | VERSION="${{ env.EDGEOPS_CONSOLE_VERSION }}" - if [ -z "$VERSION" ]; then VERSION="1.0.4"; fi + if [ -z "$VERSION" ]; then VERSION="1.0.5"; fi echo "EDGEOPS_CONSOLE_VERSION=$VERSION" >> "${GITHUB_ENV}" REPO="${{ env.EDGEOPS_CONSOLE_REPO }}" diff --git a/Dockerfile b/Dockerfile index 9155d0b1..224e4fe6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM node:24-bookworm@sha256:fdddfb3e688158251943d52eba361de991548f6814007acba4917ae6b512d6be AS console-builder ARG EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console -ARG EDGEOPS_CONSOLE_VERSION=v1.0.4 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.5 ARG EDGEOPS_CONSOLE_FLAVOR=datasance RUN apt-get update \ @@ -50,7 +50,7 @@ RUN npm pack # ubi9/nodejs-24-minimal:latest — pin manifest list digest for reproducible multi-arch builds FROM registry.access.redhat.com/ubi9/nodejs-24-minimal@sha256:cc7648f8e1c7d628e4334328a712f30ea0820787bb92836cc93e349674c689bf -ARG EDGEOPS_CONSOLE_VERSION=v1.0.4 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.5 ARG IMAGE_REGISTRY ARG OCI_SOURCE_REPO ARG CONTROLLER_DISTRIBUTION=iofog diff --git a/Makefile b/Makefile index ba700542..96124132 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Local Docker build — mirrors CI/release build-args (see 
.github/actions/set-build-env). -# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.4 +# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.5 FLAVOR ?= datasance IMAGE_NAME ?= controller @@ -25,7 +25,7 @@ else $(error FLAVOR must be "datasance" or "iofog", got "$(FLAVOR)") endif -EDGEOPS_CONSOLE_VERSION ?= v1.0.4 +EDGEOPS_CONSOLE_VERSION ?= v1.0.5 IMAGE_REF = $(IMAGE_REGISTRY)/$(IMAGE_NAME):$(DOCKER_TAG) diff --git a/docs/swagger.yaml b/docs/swagger.yaml index abd65f4d..ea133080 100755 --- a/docs/swagger.yaml +++ b/docs/swagger.yaml @@ -2774,7 +2774,7 @@ paths: `{ sessionId, microserviceUuid }` in the `data` field and top-level `sessionId`, followed by STDERR "waiting for agent…". Relay frames use `execId` equal to `sessionId`. - **HA (R112):** Multi-replica deployments require a **relay backend** selected at startup by **`nats.enabled`**: AMQP router queues when `false` (default), NATS Core pub/sub on the platform hub when `true`. Cross-replica sessions fail fast with close code **1013** when the active relay backend is unavailable. See `docs/operations/ws-sessions.md`. + **HA:** Multi-replica deployments require a **relay backend** selected at startup by **`nats.enabled`**: AMQP router queues when `false` (default), NATS Core pub/sub on the platform hub when `true`. Cross-replica sessions fail fast with close code **1013** when the active relay backend is unavailable. See `docs/operations/ws-sessions.md`. See `#/components/schemas/WsExecMessageTypes` and `#/components/schemas/WsCloseCodes`. 
operationId: userMicroserviceExecWebSocket diff --git a/package.json b/package.json index 97e61ead..53eda8fe 100644 --- a/package.json +++ b/package.json @@ -5,6 +5,8 @@ "main": "./src/main.js", "author": "Eclipse ioFog Project", "contributors": [ + "Emirhan Durmus ", + "Alpaslan Doğan ", "Kilton Hopkins ", "Saeid Rezaei Baghbidi", "Alexandre de Wergifosse", @@ -20,9 +22,7 @@ "Eugene Pankov", "Maksim Chepelev", "Tetiana Yatsiuk", - "Sergey Valevich", - "Emirhan Durmus ", - "Alpaslan Doğan " + "Sergey Valevich" ], "license": "EPL-2.0", "engines": { diff --git a/scripts/build-console-dev.js b/scripts/build-console-dev.js index 3ea5e754..49f567b4 100644 --- a/scripts/build-console-dev.js +++ b/scripts/build-console-dev.js @@ -9,7 +9,7 @@ const CONSOLE_DIR = path.join(DEV_DIR, 'console') const BUILD_OUT = path.join(CONSOLE_DIR, 'build') const REPO = process.env.EDGEOPS_CONSOLE_REPO || 'https://github.com/Datasance/edgeops-console' -const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.4' +const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.5' const FLAVOR = process.env.EDGEOPS_CONSOLE_FLAVOR || 'datasance' function normalizeTag (version) { diff --git a/test/src/helpers/transaction-grep-gates.test.js b/test/src/helpers/transaction-grep-gates.test.js index 29bb1391..79368677 100644 --- a/test/src/helpers/transaction-grep-gates.test.js +++ b/test/src/helpers/transaction-grep-gates.test.js @@ -235,7 +235,7 @@ describe('grep gates', () => { ) expect(adapterSource).to.include('runInTransaction') expect(adapterSource).to.match(/label: 'oidc\.adapter\.upsert'/) - expect(adapterSource).to.match(/\}, \{ transaction \}\)/) + expect(adapterSource).to.match(/\{\s*transaction,/) }) it('passes transaction inside Sequelize options in volume-mounting-manager.js', () => { From 6207c57670d3f3d8821ebedb693f23b36785a7b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Tue, 30 Jun 2026 23:53:03 +0300 Subject: [PATCH 12/32] fix (text): tx grep test fixed 
--- test/src/helpers/transaction-grep-gates.test.js | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/src/helpers/transaction-grep-gates.test.js b/test/src/helpers/transaction-grep-gates.test.js index 79368677..c2786301 100644 --- a/test/src/helpers/transaction-grep-gates.test.js +++ b/test/src/helpers/transaction-grep-gates.test.js @@ -68,10 +68,9 @@ describe('grep gates', () => { }) it('passes transaction to SecretService reads inside certificate-service.js', () => { - const hits = grepSrc('SecretService\\.getSecretEndpoint\\([^,\n]+\\)', [ - '--include=certificate-service.js', - 'src/services' - ]) + const hits = grepSrc('SecretService\\.getSecretEndpoint\\([^,)]+\\)', [ + '--include=certificate-service.js' + ], 'src/services') expect(hits).to.equal('') }) From 1b5f9ba959948ad48cb5f9badc81b3b3366793dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 2 Jul 2026 10:40:48 +0300 Subject: [PATCH 13/32] Fix semver versionRegex escaping for JSON Schema validation. Double-escape backslashes so AJV compiles the pattern correctly and prerelease semver strings validate as intended. 
--- src/schemas/utils/utils.js | 2 +- test/src/schemas/utils.test.js | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 test/src/schemas/utils.test.js diff --git a/src/schemas/utils/utils.js b/src/schemas/utils/utils.js index 9fcd98e9..d4e6ec3c 100644 --- a/src/schemas/utils/utils.js +++ b/src/schemas/utils/utils.js @@ -6,5 +6,5 @@ module.exports = { colorRegex: '^(#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8}))|(rgb\(\s*(?:(\d{1,3})\s*,?){3}\))|(rgba\(\s*(?:(\d{1,3})\s*,?){4}\))|$', // https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string // https://regex101.com/r/vkijKf/380 - versionRegex: '^v?(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:[+]([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$' + versionRegex: '^v?(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$' } diff --git a/test/src/schemas/utils.test.js b/test/src/schemas/utils.test.js new file mode 100644 index 00000000..69c451b1 --- /dev/null +++ b/test/src/schemas/utils.test.js @@ -0,0 +1,31 @@ +'use strict' + +const { expect } = require('chai') +const { versionRegex } = require('../../../src/schemas/utils/utils') + +describe('schemas/utils versionRegex', () => { + const re = new RegExp(versionRegex) + + for (const version of [ + '1.0.0', + 'v1.0.0', + '2.0.0-rc.2', + 'v2.0.0-rc.2', + '1.0.0-rc.10', + 'v1.0.0-rc.10', + '1.0.0-beta.11', + '1.0.0-alpha+001', + '1.0.0+20130313144700', + '1.0.0-x.7.z.92' + ]) { + it(`accepts ${version}`, () => { + expect(re.test(version)).to.equal(true) + }) + } + + for (const version of ['01.0.0', '1.0.0-rc.01', '1.0', 'not-a-version']) { + it(`rejects ${version}`, () => { + expect(re.test(version)).to.equal(false) + }) + } +}) From e96613d92e333bb9dca41ff07f73cb330bb5e993 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 2 Jul 2026 10:40:53 +0300 Subject: [PATCH 14/32] Fix fog platform reconcile when upstreamRouters is omitted from spec. Preserve existing upstream connections on update, pass undefined on first create, and honor an explicit empty upstreamRouters list without applying defaults. --- src/services/fog-platform-service.js | 12 +++- .../src/services/fog-platform-service.test.js | 68 +++++++++++++++++++ 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/src/services/fog-platform-service.js b/src/services/fog-platform-service.js index 533e996c..60ba59bd 100644 --- a/src/services/fog-platform-service.js +++ b/src/services/fog-platform-service.js @@ -262,8 +262,16 @@ async function reconcileFogPlatform (fogUuid, prep, transaction) { const upstreamConnections = router ? await RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction) : [] - const upstreamRoutersIofogUuid = spec.upstreamRouters || (upstreamConnections || []) - .map((connection) => connection.dest.iofogUuid) + let upstreamRoutersIofogUuid + if (spec.upstreamRouters !== undefined) { + upstreamRoutersIofogUuid = spec.upstreamRouters + } else if (upstreamConnections && upstreamConnections.length > 0) { + upstreamRoutersIofogUuid = upstreamConnections.map( + (connection) => _getRouterUuid(connection.dest, defaultRouter) + ) + } else { + upstreamRoutersIofogUuid = undefined + } const upstreamRouters = await RouterService.validateAndReturnUpstreamRouters( upstreamRoutersIofogUuid, fog.isSystem, diff --git a/test/src/services/fog-platform-service.test.js b/test/src/services/fog-platform-service.test.js index ecb93053..6c77c8a7 100644 --- a/test/src/services/fog-platform-service.test.js +++ b/test/src/services/fog-platform-service.test.js @@ -2,6 +2,7 @@ const { expect } = require('chai') const sinon = require('sinon') const FogPlatformService = require('../../../src/services/fog-platform-service') +const Constants = 
require('../../../src/helpers/constants') const FogManager = require('../../../src/data/managers/iofog-manager') const FogPlatformSpecManager = require('../../../src/data/managers/fog-platform-spec-manager') const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') @@ -211,6 +212,73 @@ describe('Fog platform service', () => { fogUuids: [fogUuid] }, transaction) }) + + context('when upstreamRouters is omitted from spec', () => { + it('passes undefined to validateAndReturnUpstreamRouters on first create', async () => { + RouterManager.findOne.callsFake((query) => { + if (query && query.isDefault) { + return Promise.resolve({ id: 1, iofogUuid: 'default', isDefault: true }) + } + if (query && query.iofogUuid === fogUuid) { + return Promise.resolve(null) + } + return Promise.resolve(null) + }) + RouterConnectionManager.findAllWithRouters.resolves([]) + RouterService.validateAndReturnUpstreamRouters.resolves([{ id: 1, iofogUuid: 'default' }]) + $sandbox.stub(RouterService, 'createRouterForFog').resolves({ + id: 99, + iofogUuid: fogUuid, + isEdge: true + }) + + await FogPlatformService.reconcileFog(fogUuid) + + expect(RouterService.validateAndReturnUpstreamRouters).to.have.been.calledWith( + undefined, + false, + sinon.match({ id: 1, isDefault: true }), + transaction + ) + expect(RouterService.createRouterForFog).to.have.been.calledOnce + expect(RouterService.updateRouter).to.not.have.been.called + }) + + it('preserves existing upstream connections when spec omits upstreamRouters', async () => { + const upstreamConnection = { + dest: { id: 1, iofogUuid: 'default', isDefault: true } + } + RouterConnectionManager.findAllWithRouters.resolves([upstreamConnection]) + RouterService.validateAndReturnUpstreamRouters.resolves([{ id: 1, iofogUuid: 'default' }]) + + await FogPlatformService.reconcileFog(fogUuid) + + expect(RouterService.validateAndReturnUpstreamRouters).to.have.been.calledWith( + [Constants.DEFAULT_ROUTER_NAME], + false, + 
sinon.match({ id: 1, isDefault: true }), + transaction + ) + }) + + it('passes explicit empty upstreamRouters without applying defaults', async () => { + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid, + generation: 2, + spec: { ...spec, upstreamRouters: [] } + }) + RouterConnectionManager.findAllWithRouters.resolves([]) + + await FogPlatformService.reconcileFog(fogUuid) + + expect(RouterService.validateAndReturnUpstreamRouters).to.have.been.calledWith( + [], + false, + sinon.match({ id: 1, isDefault: true }), + transaction + ) + }) + }) }) describe('.markReconcileFailed()', () => { From a4a4bdd12598f435143c8c5762d1cca4da6dcea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 2 Jul 2026 10:40:59 +0300 Subject: [PATCH 15/32] Harden exec and log WebSocket sessions for multi-replica HA relay. Dedupe concurrent session cleanup to avoid transaction reuse races, fix cross-replica pairing and activation ordering, mark remoteUserPaired on agent-only pods, extend expiry and partial-disconnect handling for split sessions, notify remote peers on expiry/teardown, and add relay delivery hooks to refresh agent-side activity. 
--- src/services/amqp-relay-transport.js | 18 + src/services/nats-relay-transport-impl.js | 77 +- src/services/nats-relay-transport.js | 18 + src/services/websocket-queue-service.js | 79 +- src/websocket/exec-session-manager.js | 70 +- src/websocket/log-session-manager.js | 69 +- src/websocket/server.js | 1336 +++++++++++++---- .../src/services/nats-relay-transport.test.js | 53 + .../websocket/ws-cross-replica-split.test.js | 443 ++++++ test/src/websocket/ws-session-cleanup.test.js | 73 + test/src/websocket/ws-session-expiry.test.js | 160 ++ test/support/ws-session-harness.js | 75 +- 12 files changed, 2132 insertions(+), 339 deletions(-) create mode 100644 test/src/websocket/ws-cross-replica-split.test.js create mode 100644 test/src/websocket/ws-session-cleanup.test.js create mode 100644 test/src/websocket/ws-session-expiry.test.js diff --git a/src/services/amqp-relay-transport.js b/src/services/amqp-relay-transport.js index 0bfb95a3..56a1b6a3 100644 --- a/src/services/amqp-relay-transport.js +++ b/src/services/amqp-relay-transport.js @@ -51,6 +51,24 @@ class AmqpRelayTransport extends WsRelayTransport { return this._queueService.shouldUseQueue(execId) } + setExecUserDeliveryHook (execId, hook) { + if (typeof this._queueService.setExecUserDeliveryHook === 'function') { + this._queueService.setExecUserDeliveryHook(execId, hook) + } + } + + setExecAgentDeliveryHook (execId, hook) { + if (typeof this._queueService.setExecAgentDeliveryHook === 'function') { + this._queueService.setExecAgentDeliveryHook(execId, hook) + } + } + + setLogUserDeliveryHook (sessionId, hook) { + if (typeof this._queueService.setLogUserDeliveryHook === 'function') { + this._queueService.setLogUserDeliveryHook(sessionId, hook) + } + } + shouldUseRelayForLogs (sessionId) { return this._queueService.shouldUseQueueForLogs(sessionId) } diff --git a/src/services/nats-relay-transport-impl.js b/src/services/nats-relay-transport-impl.js index 3929b488..babb747a 100644 --- 
a/src/services/nats-relay-transport-impl.js +++ b/src/services/nats-relay-transport-impl.js @@ -110,6 +110,27 @@ class NatsRelayTransportImpl { return this.execBridges.has(execId) } + setExecUserDeliveryHook (execId, hook) { + const bridge = this.execBridges.get(execId) + if (bridge) { + bridge.onUserRelayDelivery = hook + } + } + + setExecAgentDeliveryHook (execId, hook) { + const bridge = this.execBridges.get(execId) + if (bridge) { + bridge.onAgentRelayDelivery = hook + } + } + + setLogUserDeliveryHook (sessionId, hook) { + const bridge = this.logBridges.get(sessionId) + if (bridge) { + bridge.onUserRelayDelivery = hook + } + } + async publishToAgent (execId, buffer, options = {}) { await this._publishExec(execId, execAgentSubject(execId), buffer, options) } @@ -391,6 +412,26 @@ class NatsRelayTransportImpl { side, messageSize: body.length }) + if (side === 'user' && currentBridge.onUserRelayDelivery) { + try { + currentBridge.onUserRelayDelivery(body) + } catch (error) { + logger.warn('[NATS][RELAY] Exec user relay delivery hook failed', { + execId: bridge.execId, + error: error.message + }) + } + } + if (side === 'agent' && currentBridge.onAgentRelayDelivery) { + try { + currentBridge.onAgentRelayDelivery(body) + } catch (error) { + logger.warn('[NATS][RELAY] Exec agent relay delivery hook failed', { + execId: bridge.execId, + error: error.message + }) + } + } } else { logger.debug('[NATS][RELAY] No socket available for exec delivery', { execId: bridge.execId, @@ -413,6 +454,10 @@ class NatsRelayTransportImpl { closeAck }) + if (closeAck) { + return + } + if (ws && ws.readyState === WebSocket.OPEN) { try { const reason = closeInitiator === 'agent' ? 
'Agent closed connection' : 'User closed connection' @@ -424,9 +469,18 @@ class NatsRelayTransportImpl { error: error.message }) } + } else if (bridge && bridge.cleanupCallback) { + try { + await bridge.cleanupCallback(execId) + } catch (error) { + logger.error('[NATS][RELAY] Error in cleanup callback during CLOSE handling', { + execId, + error: error.message + }) + } } - if (!closeAck && this.execBridges.has(execId)) { + if (this.execBridges.has(execId)) { const ackSide = side === 'user' ? 'agent' : 'user' try { const hdrs = natsHeaders() @@ -442,17 +496,6 @@ class NatsRelayTransportImpl { }) } } - - if (bridge && bridge.cleanupCallback) { - try { - await bridge.cleanupCallback(execId) - } catch (error) { - logger.error('[NATS][RELAY] Error in cleanup callback during CLOSE handling', { - execId, - error: error.message - }) - } - } } async _ensureLogUserSubscription (bridge, userWs, nc) { @@ -525,6 +568,16 @@ class NatsRelayTransportImpl { ws.send(body, { binary: true }) currentBridge.pendingBytes = Math.max(0, currentBridge.pendingBytes - body.length) currentBridge.pendingMessages = Math.max(0, currentBridge.pendingMessages - 1) + if (currentBridge.onUserRelayDelivery) { + try { + currentBridge.onUserRelayDelivery(body) + } catch (error) { + logger.warn('[NATS][RELAY] Log user relay delivery hook failed', { + sessionId: bridge.sessionId, + error: error.message + }) + } + } } else { currentBridge.pendingBytes = Math.max(0, currentBridge.pendingBytes - body.length) currentBridge.pendingMessages = Math.max(0, currentBridge.pendingMessages - 1) diff --git a/src/services/nats-relay-transport.js b/src/services/nats-relay-transport.js index e7d47e51..4fc3a432 100644 --- a/src/services/nats-relay-transport.js +++ b/src/services/nats-relay-transport.js @@ -50,6 +50,24 @@ class NatsRelayTransport extends WsRelayTransport { return this._impl.shouldUseRelay(execId) } + setExecUserDeliveryHook (execId, hook) { + if (typeof this._impl.setExecUserDeliveryHook === 'function') { + 
this._impl.setExecUserDeliveryHook(execId, hook) + } + } + + setExecAgentDeliveryHook (execId, hook) { + if (typeof this._impl.setExecAgentDeliveryHook === 'function') { + this._impl.setExecAgentDeliveryHook(execId, hook) + } + } + + setLogUserDeliveryHook (sessionId, hook) { + if (typeof this._impl.setLogUserDeliveryHook === 'function') { + this._impl.setLogUserDeliveryHook(sessionId, hook) + } + } + shouldUseRelayForLogs (sessionId) { return this._impl.shouldUseRelayForLogs(sessionId) } diff --git a/src/services/websocket-queue-service.js b/src/services/websocket-queue-service.js index 595c9124..641461fa 100644 --- a/src/services/websocket-queue-service.js +++ b/src/services/websocket-queue-service.js @@ -123,6 +123,27 @@ class WebSocketQueueService { return this.execBridges.has(execId) } + setExecUserDeliveryHook (execId, hook) { + const bridge = this.execBridges.get(execId) + if (bridge) { + bridge.onUserRelayDelivery = hook + } + } + + setExecAgentDeliveryHook (execId, hook) { + const bridge = this.execBridges.get(execId) + if (bridge) { + bridge.onAgentRelayDelivery = hook + } + } + + setLogUserDeliveryHook (sessionId, hook) { + const bridge = this.logBridges.get(sessionId) + if (bridge) { + bridge.onUserRelayDelivery = hook + } + } + async publishToAgent (execId, buffer, options = {}) { await this._send(execId, 'agent', buffer, options) } @@ -422,6 +443,26 @@ class WebSocketQueueService { side, messageSize: body.length }) + if (side === 'user' && currentBridge.onUserRelayDelivery) { + try { + currentBridge.onUserRelayDelivery(body) + } catch (error) { + logger.warn('[AMQP][QUEUE] Exec user relay delivery hook failed', { + execId: session.execId, + error: error.message + }) + } + } + if (side === 'agent' && currentBridge.onAgentRelayDelivery) { + try { + currentBridge.onAgentRelayDelivery(body) + } catch (error) { + logger.warn('[AMQP][QUEUE] Exec agent relay delivery hook failed', { + execId: session.execId, + error: error.message + }) + } + } } catch 
(error) { logger.error('[AMQP][QUEUE] Failed to deliver message to socket', { execId: session.execId, @@ -482,6 +523,11 @@ class WebSocketQueueService { closeAck }) + if (closeAck) { + context.delivery.accept() + return + } + if (ws && ws.readyState === WebSocket.OPEN) { try { const reason = closeInitiator === 'agent' ? 'Agent closed connection' : 'User closed connection' @@ -504,11 +550,21 @@ class WebSocketQueueService { hasSocket: !!ws, socketState: ws ? ws.readyState : 'N/A' }) + if (bridge && bridge.cleanupCallback) { + try { + await bridge.cleanupCallback(execId) + } catch (error) { + logger.error('[AMQP][QUEUE] Error in cleanup callback during CLOSE handling', { + execId, + error: error.message + }) + } + } } context.delivery.accept() - if (!closeAck && this.execBridges.has(execId)) { + if (this.execBridges.has(execId)) { const ackSide = side === 'user' ? 'agent' : 'user' try { await this._send(execId, ackSide, body, { @@ -527,17 +583,6 @@ class WebSocketQueueService { }) } } - - if (bridge && bridge.cleanupCallback) { - try { - await bridge.cleanupCallback(execId) - } catch (error) { - logger.error('[AMQP][QUEUE] Error in cleanup callback during CLOSE handling', { - execId, - error: error.message - }) - } - } } async enableForLogSession (session, cleanupCallback) { @@ -777,6 +822,16 @@ class WebSocketQueueService { } ws.send(body, { binary: true }) context.delivery.accept() + if (currentBridge.onUserRelayDelivery) { + try { + currentBridge.onUserRelayDelivery(body) + } catch (error) { + logger.warn('[AMQP][QUEUE] Log user relay delivery hook failed', { + sessionId: session.sessionId, + error: error.message + }) + } + } } else { context.delivery.release() } diff --git a/src/websocket/exec-session-manager.js b/src/websocket/exec-session-manager.js index c4c11a3c..048f7bcd 100644 --- a/src/websocket/exec-session-manager.js +++ b/src/websocket/exec-session-manager.js @@ -16,6 +16,7 @@ class ExecSessionManager { this.execSessions = new Map() this.config = config 
this.cleanupInterval = null + this.expiredSessionHandler = null this.startCleanupInterval() logger.info('ExecSessionManager initialized with config:' + JSON.stringify({ execPendingTimeoutMs: config.session.execPendingTimeoutMs, @@ -47,7 +48,14 @@ class ExecSessionManager { createdAt: Date.now(), transaction, queueBridgeEnabled: false, - metricsActive: false + metricsActive: false, + activationSent: false, + remoteAgentPaired: false, + remoteUserPaired: false, + pendingPairingTimer: null, + pairingStartedAt: null, + pairingMetricsStarted: false, + pairingCompleted: false } this.execSessions.set(sessionId, session) return session @@ -67,6 +75,10 @@ class ExecSessionManager { return sessions } + setExpiredSessionHandler (handler) { + this.expiredSessionHandler = typeof handler === 'function' ? handler : null + } + updateLastActivity (sessionId) { const session = this.execSessions.get(sessionId) if (session) { @@ -74,9 +86,19 @@ class ExecSessionManager { } } + detachLocalExecSession (sessionId) { + const session = this.execSessions.get(sessionId) + if (!session || session.removing) { + return + } + this.execSessions.delete(sessionId) + } + async removeExecSession (sessionId, transaction) { const session = this.execSessions.get(sessionId) - if (!session) return + if (!session || session.removing) return + + session.removing = true if (session.agent && session.agent.readyState === WebSocket.OPEN) { session.agent.close() @@ -125,8 +147,12 @@ class ExecSessionManager { let isExpired = false - if (!session.agent && session.user) { + if (!session.agent && !session.remoteAgentPaired && session.user) { isExpired = timeSinceCreation > pendingTimeout + } else if (session.user && !session.agent && session.remoteAgentPaired) { + isExpired = timeSinceLastActivity > maxDuration + } else if (session.agent && !session.user && session.remoteUserPaired) { + isExpired = timeSinceLastActivity > maxDuration } else if (session.agent && !session.user) { isExpired = timeSinceLastActivity > 
pendingTimeout } else if (session.agent && session.user) { @@ -141,28 +167,36 @@ class ExecSessionManager { } for (const sessionId of expiredSessions) { - logger.info('Cleaning up expired exec session:' + JSON.stringify({ sessionId })) const session = this.execSessions.get(sessionId) - if (session && session.user && session.user.readyState === WebSocket.OPEN) { - try { - session.user.close(1008, session.agent ? 'Exec session max duration exceeded' : 'Timeout waiting for agent connection') - } catch (error) { - logger.warn('Failed to close expired exec user connection:' + error.message) - } - } - if (session && session.agent && session.agent.readyState === WebSocket.OPEN) { - try { - session.agent.close(1000, 'Exec session expired') - } catch (error) { - logger.warn('Failed to close expired exec agent connection:' + error.message) - } + if (this.expiredSessionHandler) { + await this.expiredSessionHandler(sessionId, session, transaction) + } else { + await this._removeExpiredExecSession(sessionId, session, transaction) } - await this.removeExecSession(sessionId, transaction) } return expiredSessions.length } + async _removeExpiredExecSession (sessionId, session, transaction) { + logger.info('Cleaning up expired exec session:' + JSON.stringify({ sessionId })) + if (session && session.user && session.user.readyState === WebSocket.OPEN) { + try { + session.user.close(1008, (session.agent || session.remoteAgentPaired) ? 
'Exec session max duration exceeded' : 'Timeout waiting for agent connection') + } catch (error) { + logger.warn('Failed to close expired exec user connection:' + error.message) + } + } + if (session && session.agent && session.agent.readyState === WebSocket.OPEN) { + try { + session.agent.close(1000, 'Exec session expired') + } catch (error) { + logger.warn('Failed to close expired exec agent connection:' + error.message) + } + } + await this.removeExecSession(sessionId, transaction) + } + startCleanupInterval () { const interval = this.config.session.cleanupInterval || 30000 this.cleanupInterval = setInterval(async () => { diff --git a/src/websocket/log-session-manager.js b/src/websocket/log-session-manager.js index 2f55d08a..9f10b5df 100644 --- a/src/websocket/log-session-manager.js +++ b/src/websocket/log-session-manager.js @@ -17,6 +17,7 @@ class LogSessionManager { this.logSessions = new Map() // Map this.config = config this.cleanupInterval = null + this.expiredSessionHandler = null this.startCleanupInterval() logger.info('LogSessionManager initialized with config:' + JSON.stringify({ logPendingTimeoutMs: config.session.logPendingTimeoutMs, @@ -47,7 +48,13 @@ class LogSessionManager { tailConfig, // Per-session tail configuration lastActivity: Date.now(), createdAt: Date.now(), - transaction + transaction, + remoteAgentPaired: false, + remoteUserPaired: false, + pendingPairingTimer: null, + pairingStartedAt: null, + pairingMetricsStarted: false, + pairingCompleted: false } this.logSessions.set(sessionId, session) return session @@ -68,6 +75,10 @@ class LogSessionManager { return sessions } + setExpiredSessionHandler (handler) { + this.expiredSessionHandler = typeof handler === 'function' ? 
handler : null + } + updateLastActivity (sessionId) { const session = this.logSessions.get(sessionId) if (session) { @@ -75,9 +86,19 @@ class LogSessionManager { } } + detachLocalLogSession (sessionId) { + const session = this.logSessions.get(sessionId) + if (!session || session.removing) { + return + } + this.logSessions.delete(sessionId) + } + async removeLogSession (sessionId, transaction) { const session = this.logSessions.get(sessionId) - if (!session) return + if (!session || session.removing) return + + session.removing = true // Close connections if (session.agent && session.agent.readyState === WebSocket.OPEN) { @@ -149,8 +170,12 @@ class LogSessionManager { let isExpired = false - if (!session.agent && session.user) { + if (!session.agent && !session.remoteAgentPaired && session.user) { isExpired = timeSinceCreation > pendingTimeout + } else if (session.user && !session.agent && session.remoteAgentPaired) { + isExpired = timeSinceLastActivity > idleTimeout + } else if (session.agent && !session.user && session.remoteUserPaired) { + isExpired = timeSinceLastActivity > idleTimeout } else if (session.agent && !session.user) { isExpired = timeSinceLastActivity > pendingTimeout } else if (session.agent && session.user) { @@ -165,28 +190,36 @@ class LogSessionManager { } for (const sessionId of expiredSessions) { - logger.info('Cleaning up expired log session:' + JSON.stringify({ sessionId })) const session = this.logSessions.get(sessionId) - if (session && session.user && session.user.readyState === WebSocket.OPEN) { - try { - session.user.close(1008, session.agent ? 
'Log session idle timeout' : 'Timeout waiting for agent connection') - } catch (error) { - logger.warn('Failed to close expired log user connection:' + error.message) - } - } - if (session && session.agent && session.agent.readyState === WebSocket.OPEN) { - try { - session.agent.close(1000, 'Log session expired') - } catch (error) { - logger.warn('Failed to close expired log agent connection:' + error.message) - } + if (this.expiredSessionHandler) { + await this.expiredSessionHandler(sessionId, session, transaction) + } else { + await this._removeExpiredLogSession(sessionId, session, transaction) } - await this.removeLogSession(sessionId, transaction) } return expiredSessions.length } + async _removeExpiredLogSession (sessionId, session, transaction) { + logger.info('Cleaning up expired log session:' + JSON.stringify({ sessionId })) + if (session && session.user && session.user.readyState === WebSocket.OPEN) { + try { + session.user.close(1008, (session.agent || session.remoteAgentPaired) ? 
'Log session idle timeout' : 'Timeout waiting for agent connection') + } catch (error) { + logger.warn('Failed to close expired log user connection:' + error.message) + } + } + if (session && session.agent && session.agent.readyState === WebSocket.OPEN) { + try { + session.agent.close(1000, 'Log session expired') + } catch (error) { + logger.warn('Failed to close expired log agent connection:' + error.message) + } + } + await this.removeLogSession(sessionId, transaction) + } + startCleanupInterval () { const interval = this.config.session.cleanupInterval || 30000 // Default 30 seconds this.cleanupInterval = setInterval(async () => { diff --git a/src/websocket/server.js b/src/websocket/server.js index 75f7af27..cab2466d 100644 --- a/src/websocket/server.js +++ b/src/websocket/server.js @@ -11,11 +11,15 @@ const MicroserviceStatusManager = require('../data/managers/microservice-status- const { microserviceState } = require('../enums/microservice-state') const AuthDecorator = require('../decorators/authorization-decorator') const TransactionDecorator = require('../decorators/transaction-decorator') +const transactionRunner = require('../helpers/transaction-runner') +const { PRIORITY_BACKGROUND } = transactionRunner const msgpack = require('@msgpack/msgpack') const { resolveTransport } = require('../services/ws-relay-transport-factory') const { recordExecSessionActive, - recordLogSessionActive + recordLogSessionActive, + recordPendingPairing, + recordPairingDurationMs } = require('./ws-metrics') const AppHelper = require('../helpers/app-helper') const MicroserviceLogStatusManager = require('../data/managers/microservice-log-status-manager') @@ -44,6 +48,9 @@ const DRAIN_CLOSE_CODE = 1001 const DRAIN_CLOSE_REASON = 'Server draining' // when user WS bufferedAmount exceeds this, drop LOG_LINE silently and emit LOG_ERROR once. const LOG_BACKPRESSURE_BUFFER_BYTES = 256 * 1024 +const EXEC_AGENT_READY_NOTICE = 'Agent connected. 
Interactive exec is ready.\n' +const LOG_AGENT_READY_NOTICE = 'Agent connected. Log streaming started.\n' +const LOG_AGENT_DISCONNECTED_NOTICE = 'Agent disconnected.\n' const EventService = require('../services/event-service') const { isAuthConfigured: isOidcAuthConfigured } = require('../config/oidc') @@ -183,6 +190,8 @@ class WebSocketServer { } }) this.pendingCloseTimeouts = new Map() // Track pending CLOSE messages in cross-replica scenarios + this._execCleanupInflight = new Map() + this._logCleanupInflight = new Map() this.haConfig = config.get('server.webSocket.ha') || {} this.isDraining = false this.drainPromise = null @@ -212,6 +221,11 @@ class WebSocketServer { } }) } + + this.execSessionManager.setExpiredSessionHandler((sessionId, session, transaction) => + this._handleExpiredExecSession(sessionId, session, transaction)) + this.logSessionManager.setExpiredSessionHandler((sessionId, session, transaction) => + this._handleExpiredLogSession(sessionId, session, transaction)) } // MessagePack encoding/decoding helpers with improved error handling @@ -354,81 +368,884 @@ class WebSocketServer { type: promise && promise.constructor ? 
promise.constructor.name : typeof promise } }) - // Don't let the error crash the process + // Don't let the error crash the process + }) + + processErrorHandlersRegistered = true + } + } + + getLogConcurrencyLimit () { + return this.sessionConfig.logMaxConcurrentPerResource || 3 + } + + getExecConcurrencyLimit () { + return this.sessionConfig.execMaxConcurrentPerResource || 3 + } + + getLogTailMaxLines () { + return this.sessionConfig.logTailMaxLines || 5000 + } + + getExecPendingTimeoutMs () { + return this.sessionConfig.execPendingTimeoutMs || 60000 + } + + getLogPendingTimeoutMs () { + return this.sessionConfig.logPendingTimeoutMs || 120000 + } + + getDrainTimeoutMs () { + return this.sessionConfig.drainTimeoutMs || 30000 + } + + isCrossReplicaSession (session) { + return !!(session && (!session.agent || !session.user)) + } + + async requireRelayForCrossReplica (ws) { + if (this.haConfig.failFastOnRouterUnavailable === false) { + return true + } + const available = await this.relayTransport.isAvailable() + if (!available) { + logger.warn('[RELAY] Relay backend unavailable for cross-replica session', { + transport: this.relayTransport.getTransport() + }) + if (ws && ws.readyState === WebSocket.OPEN) { + ws.close(RELAY_UNAVAILABLE_CLOSE_CODE, RELAY_UNAVAILABLE_CLOSE_REASON) + } + return false + } + return true + } + + _scheduleRelaySetupAfterCommit (label, setupFn) { + setImmediate(async () => { + try { + await setupFn() + } catch (error) { + logger.error(`Failed to ${label}:` + JSON.stringify({ + error: error.message, + stack: error.stack + })) + } + }) + } + + _runDedupedSessionCleanup (inflightMap, sessionId, label, cleanupFn) { + const existing = inflightMap.get(sessionId) + if (existing) { + return existing + } + + const promise = transactionRunner.runInTransaction( + cleanupFn, + { priority: PRIORITY_BACKGROUND, label } + ).finally(() => { + inflightMap.delete(sessionId) + }) + + inflightMap.set(sessionId, promise) + return promise + } + + async 
_cleanupLogSessionInTransaction (sessionId) { + return this._runDedupedSessionCleanup( + this._logCleanupInflight, + sessionId, + 'ws.log.cleanup', + (transaction) => this.cleanupLogSession(sessionId, transaction) + ) + } + + async _cleanupExecSessionInTransaction (sessionId) { + return this._runDedupedSessionCleanup( + this._execCleanupInflight, + sessionId, + 'ws.exec.cleanup', + (transaction) => this.cleanupExecSession(sessionId, transaction) + ) + } + + _isExecSessionAgentPaired (session) { + return !!(session && (session.agent || session.remoteAgentPaired)) + } + + _isExecSessionUserPaired (session) { + return !!(session && (session.user || session.remoteUserPaired)) + } + + _isLogSessionAgentPaired (session) { + return !!(session && (session.agent || session.remoteAgentPaired)) + } + + _isLogSessionUserPaired (session) { + return !!(session && (session.user || session.remoteUserPaired)) + } + + _clearPendingPairingTimer (session) { + if (session && session.pendingPairingTimer) { + clearTimeout(session.pendingPairingTimer) + session.pendingPairingTimer = null + } + } + + _startExecPendingPairingMetrics (session) { + if (!session || session.pairingMetricsStarted) { + return + } + session.pairingMetricsStarted = true + session.pairingStartedAt = Date.now() + recordPendingPairing(1) + } + + _startLogPendingPairingMetrics (session) { + if (!session || session.pairingMetricsStarted) { + return + } + session.pairingMetricsStarted = true + session.pairingStartedAt = Date.now() + recordPendingPairing(1) + } + + _recordPairingCompleted (session) { + if (!session || session.pairingCompleted || !session.pairingMetricsStarted) { + return + } + session.pairingCompleted = true + recordPendingPairing(-1) + if (session.pairingStartedAt != null) { + recordPairingDurationMs(Date.now() - session.pairingStartedAt) + } + } + + _abortPendingPairingMetrics (session) { + if (!session || session.pairingCompleted || !session.pairingMetricsStarted) { + return + } + 
session.pairingCompleted = true + recordPendingPairing(-1) + } + + _markExecAgentPaired (sessionId, { notifyUser = false, source = 'local' } = {}) { + const session = this.execSessionManager.getExecSession(sessionId) + if (!session) { + return false + } + const wasPaired = this._isExecSessionAgentPaired(session) + session.remoteAgentPaired = true + this._clearPendingPairingTimer(session) + if (!wasPaired) { + this._recordPairingCompleted(session) + } + if (notifyUser && session.user && session.user.readyState === WebSocket.OPEN) { + try { + const readyMsg = { + type: MESSAGE_TYPES.STDERR, + data: Buffer.from(EXEC_AGENT_READY_NOTICE), + sessionId, + microserviceUuid: session.microserviceUuid, + execId: sessionId, + timestamp: Date.now() + } + session.user.send(this.encodeMessage(readyMsg), { binary: true }) + } catch (error) { + logger.warn('Failed to notify user that exec agent connected:' + JSON.stringify({ + sessionId, + source, + error: error.message + })) + } + } + return true + } + + _markExecUserPaired (sessionId, { source = 'relay-notify' } = {}) { + const session = this.execSessionManager.getExecSession(sessionId) + if (!session) { + return false + } + session.remoteUserPaired = true + session.lastActivity = Date.now() + logger.info('Exec remote user paired:' + JSON.stringify({ sessionId, source })) + return true + } + + _markLogUserPaired (sessionId, { source = 'relay-notify' } = {}) { + const session = this.logSessionManager.getLogSession(sessionId) + if (!session) { + return false + } + session.remoteUserPaired = true + session.lastActivity = Date.now() + logger.info('Log remote user paired:' + JSON.stringify({ + sessionId, + source, + microserviceUuid: session.microserviceUuid, + fogUuid: session.fogUuid + })) + return true + } + + _markLogAgentPaired (sessionId, { notifyUser = false, source = 'local' } = {}) { + const session = this.logSessionManager.getLogSession(sessionId) + if (!session) { + return false + } + const wasPaired = 
this._isLogSessionAgentPaired(session) + session.remoteAgentPaired = true + this._clearPendingPairingTimer(session) + if (!wasPaired) { + this._recordPairingCompleted(session) + } + if (notifyUser && session.user && session.user.readyState === WebSocket.OPEN) { + try { + const agentConnectedMsg = { + type: MESSAGE_TYPES.LOG_LINE, + data: Buffer.from(LOG_AGENT_READY_NOTICE), + sessionId, + timestamp: Date.now(), + microserviceUuid: session.microserviceUuid || null, + iofogUuid: session.fogUuid || null + } + session.user.send(this.encodeMessage(agentConnectedMsg), { binary: true }) + logger.info('Notified user that agent connected for log session:' + JSON.stringify({ + sessionId, + source, + microserviceUuid: session.microserviceUuid, + fogUuid: session.fogUuid + })) + } catch (error) { + logger.warn('Failed to notify user that log agent connected:' + JSON.stringify({ + sessionId, + source, + error: error.message + })) + } + } + return true + } + + async _sendExecActivationViaRelay (sessionId, microserviceUuid) { + if (!this.relayTransport.shouldUseRelay(sessionId)) { + return false + } + const activationMsg = { + type: MESSAGE_TYPES.ACTIVATION, + data: Buffer.from(JSON.stringify({ + sessionId, + execId: sessionId, + microserviceUuid, + timestamp: Date.now() + })), + sessionId, + microserviceUuid, + execId: sessionId, + timestamp: Date.now() + } + try { + await this.relayTransport.publishToAgent( + sessionId, + this.encodeMessage(activationMsg), + { messageType: MESSAGE_TYPES.ACTIVATION } + ) + const session = this.execSessionManager.getExecSession(sessionId) + if (session) { + session.activationSent = true + } + logger.info('[RELAY] Exec activation published to agent via relay:' + JSON.stringify({ + sessionId, + microserviceUuid + })) + return true + } catch (error) { + logger.error('[RELAY] Failed to publish exec activation via relay:' + JSON.stringify({ + sessionId, + microserviceUuid, + error: error.message + })) + return false + } + } + + async 
_notifyExecUserViaRelay (sessionId, microserviceUuid) { + if (!this.relayTransport.shouldUseRelay(sessionId)) { + return false + } + const readyMsg = { + type: MESSAGE_TYPES.STDERR, + data: Buffer.from(EXEC_AGENT_READY_NOTICE), + sessionId, + microserviceUuid, + execId: sessionId, + timestamp: Date.now() + } + try { + await this.relayTransport.publishToUser( + sessionId, + this.encodeMessage(readyMsg), + { messageType: MESSAGE_TYPES.STDERR } + ) + logger.info('[RELAY] Exec user ready notice published via relay:' + JSON.stringify({ + sessionId, + microserviceUuid + })) + return true + } catch (error) { + logger.error('[RELAY] Failed to notify exec user via relay:' + JSON.stringify({ + sessionId, + microserviceUuid, + error: error.message + })) + return false + } + } + + async _notifyLogUserViaRelay (sessionId, { microserviceUuid, fogUuid, message }) { + if (!this.relayTransport.shouldUseRelayForLogs(sessionId)) { + return false + } + const notifyMsg = { + type: MESSAGE_TYPES.LOG_LINE, + data: Buffer.from(message), + sessionId, + timestamp: Date.now(), + microserviceUuid: microserviceUuid || null, + iofogUuid: fogUuid || null + } + try { + await this.relayTransport.publishLogToUser(sessionId, this.encodeMessage(notifyMsg)) + return true + } catch (error) { + logger.error('[RELAY] Failed to notify log user via relay:' + JSON.stringify({ + sessionId, + microserviceUuid, + fogUuid, + error: error.message + })) + return false + } + } + + _registerExecUserRelayPairingHook (sessionId) { + if (typeof this.relayTransport.setExecUserDeliveryHook !== 'function') { + return + } + this.relayTransport.setExecUserDeliveryHook(sessionId, (buffer) => { + this._onExecUserRelayDelivery(sessionId, buffer) + }) + } + + _registerExecAgentRelayActivityHook (sessionId) { + if (typeof this.relayTransport.setExecAgentDeliveryHook !== 'function') { + return + } + this.relayTransport.setExecAgentDeliveryHook(sessionId, (buffer) => { + this._onExecAgentRelayDelivery(sessionId, buffer) + }) + } 
+ + _registerLogUserRelayPairingHook (sessionId) { + if (typeof this.relayTransport.setLogUserDeliveryHook !== 'function') { + return + } + this.relayTransport.setLogUserDeliveryHook(sessionId, (buffer) => { + this._onLogUserRelayDelivery(sessionId, buffer) + }) + } + + _onExecUserRelayDelivery (sessionId, buffer) { + const session = this.execSessionManager.getExecSession(sessionId) + if (session) { + session.lastActivity = Date.now() + } + try { + const msg = this.decodeMessage(buffer) + if (msg.type === MESSAGE_TYPES.STDERR && msg.data) { + const text = msg.data.toString() + if (text.includes('Interactive exec is ready')) { + this._markExecAgentPaired(sessionId, { notifyUser: false, source: 'relay-notify' }) + } + } + } catch (error) { + logger.debug('Ignoring exec user relay delivery hook decode error:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + } + + _onExecAgentRelayDelivery (sessionId, buffer) { + const session = this.execSessionManager.getExecSession(sessionId) + if (session) { + session.lastActivity = Date.now() + } + try { + this.decodeMessage(buffer) + } catch (error) { + logger.debug('Ignoring exec agent relay delivery hook decode error:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + } + + _onLogUserRelayDelivery (sessionId, buffer) { + const session = this.logSessionManager.getLogSession(sessionId) + if (session) { + session.lastActivity = Date.now() + } + try { + const msg = this.decodeMessage(buffer) + if (msg.type === MESSAGE_TYPES.LOG_LINE && msg.data) { + const text = msg.data.toString() + if (text.includes('Log streaming started')) { + this._markLogAgentPaired(sessionId, { notifyUser: false, source: 'relay-notify' }) + } + } + } catch (error) { + logger.debug('Ignoring log user relay delivery hook decode error:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + } + + async _checkExecAgentPairedInDb (sessionId, transaction) { + const row = await 
MicroserviceExecSessionManager.findBySessionId(sessionId, transaction) + return !!(row && row.agentConnected) + } + + async _checkExecUserConnectedInDb (sessionId, transaction) { + const row = await MicroserviceExecSessionManager.findBySessionId(sessionId, transaction) + return !!(row && row.userConnected) + } + + async _checkLogAgentPairedInDb (sessionId, microserviceUuid, fogUuid, transaction) { + let row = null + if (microserviceUuid) { + row = await MicroserviceLogStatusManager.findOne({ sessionId }, transaction) + } else if (fogUuid) { + row = await FogLogStatusManager.findOne({ sessionId }, transaction) + } + return !!(row && row.agentConnected) + } + + async _notifyExecRemotePeerClose (sessionId, session, reason = 'Exec session expired') { + if (!session || !this.relayTransport.shouldUseRelay(sessionId)) { + return + } + + const closeMsg = { + type: MESSAGE_TYPES.CLOSE, + execId: sessionId, + sessionId, + microserviceUuid: session.microserviceUuid, + timestamp: Date.now(), + data: Buffer.from(reason) + } + const encoded = this.encodeMessage(closeMsg) + + try { + if (session.agent && !session.user && session.remoteUserPaired) { + await this.relayTransport.publishToUser(sessionId, encoded, { messageType: MESSAGE_TYPES.CLOSE }) + } else if (session.user && !session.agent && session.remoteAgentPaired) { + await this.relayTransport.publishToAgent(sessionId, encoded, { messageType: MESSAGE_TYPES.CLOSE }) + } + } catch (error) { + logger.error('[WS-CLOSE] Failed to notify remote exec peer via relay during session close', { + sessionId, + error: error.message + }) + } + } + + async _handleExpiredExecSession (sessionId, session, transaction) { + logger.info('Cleaning up expired exec session:' + JSON.stringify({ sessionId })) + if (session && session.user && session.user.readyState === WebSocket.OPEN) { + try { + session.user.close( + 1008, + (session.agent || session.remoteAgentPaired) + ? 
'Exec session max duration exceeded' + : 'Timeout waiting for agent connection' + ) + } catch (error) { + logger.warn('Failed to close expired exec user connection:' + error.message) + } + } + if (session && session.agent && session.agent.readyState === WebSocket.OPEN) { + try { + session.agent.close(1000, 'Exec session expired') + } catch (error) { + logger.warn('Failed to close expired exec agent connection:' + error.message) + } + } + + await this._notifyExecRemotePeerClose(sessionId, session, 'Exec session expired') + await this._cleanupExecSessionInTransaction(sessionId) + } + + async _handleExpiredLogSession (sessionId, session, transaction) { + logger.info('Cleaning up expired log session:' + JSON.stringify({ sessionId })) + if (session && session.user && session.user.readyState === WebSocket.OPEN) { + try { + session.user.close( + 1008, + (session.agent || session.remoteAgentPaired) + ? 'Log session idle timeout' + : 'Timeout waiting for agent connection' + ) + } catch (error) { + logger.warn('Failed to close expired log user connection:' + error.message) + } + } + if (session && session.agent && session.agent.readyState === WebSocket.OPEN) { + try { + session.agent.close(1000, 'Log session expired') + } catch (error) { + logger.warn('Failed to close expired log agent connection:' + error.message) + } + } + + if (session && this.relayTransport.shouldUseRelayForLogs(sessionId)) { + if (session.agent && !session.user && session.remoteUserPaired) { + await this._notifyLogUserViaRelay(sessionId, { + microserviceUuid: session.microserviceUuid, + fogUuid: session.fogUuid, + message: 'Log session ended.\n' + }).catch((error) => { + logger.error('[WS-CLOSE] Failed to notify remote log user via relay during session expiry', { + sessionId, + error: error.message + }) + }) + } + } + + await this._cleanupLogSessionInTransaction(sessionId) + } + + _scheduleExecPendingPairingTimeout (sessionId, userWs, microserviceUuid) { + const session = 
this.execSessionManager.getExecSession(sessionId) + if (!session) { + return + } + this._clearPendingPairingTimer(session) + const timeoutMs = this.getExecPendingTimeoutMs() + session.pendingPairingTimer = setTimeout(() => { + this._handleExecPendingTimeout(sessionId, userWs, microserviceUuid).catch((error) => { + logger.warn('Exec pending timeout handler failed:' + error.message) + }) + }, timeoutMs) + } + + _scheduleLogPendingPairingTimeout (sessionId, userWs, microserviceUuid, fogUuid) { + const session = this.logSessionManager.getLogSession(sessionId) + if (!session) { + return + } + this._clearPendingPairingTimer(session) + const timeoutMs = this.getLogPendingTimeoutMs() + session.pendingPairingTimer = setTimeout(() => { + this._handleLogPendingTimeout(sessionId, userWs, microserviceUuid, fogUuid).catch((error) => { + logger.warn('Log pending timeout handler failed:' + error.message) + }) + }, timeoutMs) + } + + async _handleExecPendingTimeout (sessionId, userWs, microserviceUuid) { + const session = this.execSessionManager.getExecSession(sessionId) + if (!session || this._isExecSessionAgentPaired(session)) { + return + } + + try { + const pairedInDb = await transactionRunner.runInTransaction( + (tx) => this._checkExecAgentPairedInDb(sessionId, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.exec.pending-db-check' } + ) + if (pairedInDb) { + this._markExecAgentPaired(sessionId, { notifyUser: false, source: 'db-fallback' }) + return + } + } catch (error) { + logger.warn('Exec pending timeout DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + + this._abortPendingPairingMetrics(session) + logger.warn('Exec session pending timeout:' + JSON.stringify({ + sessionId, + microserviceUuid, + timeout: this.getExecPendingTimeoutMs() + })) + try { + if (userWs.readyState === WebSocket.OPEN) { + const timeoutMsg = { + type: MESSAGE_TYPES.STDERR, + data: Buffer.from('Timeout waiting for agent connection.\n'), + sessionId, + 
microserviceUuid, + execId: sessionId, + timestamp: Date.now() + } + userWs.send(this.encodeMessage(timeoutMsg), { binary: true }) + userWs.close(1008, 'Timeout waiting for agent connection') + } + } catch (error) { + logger.warn('Failed to close exec session on pending timeout:' + error.message) + } + } + + async _handleLogPendingTimeout (sessionId, userWs, microserviceUuid, fogUuid) { + const session = this.logSessionManager.getLogSession(sessionId) + if (!session || this._isLogSessionAgentPaired(session)) { + return + } + + try { + const pairedInDb = await transactionRunner.runInTransaction( + (tx) => this._checkLogAgentPairedInDb(sessionId, microserviceUuid, fogUuid, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.log.pending-db-check' } + ) + if (pairedInDb) { + this._markLogAgentPaired(sessionId, { notifyUser: false, source: 'db-fallback' }) + return + } + } catch (error) { + logger.warn('Log pending timeout DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + + this._abortPendingPairingMetrics(session) + logger.warn('Log session pending timeout:' + JSON.stringify({ + sessionId, + microserviceUuid, + fogUuid, + timeout: this.getLogPendingTimeoutMs() + })) + try { + if (userWs.readyState === WebSocket.OPEN) { + const timeoutMsg = { + type: MESSAGE_TYPES.LOG_LINE, + data: Buffer.from('Timeout waiting for agent connection.\n'), + sessionId, + timestamp: Date.now(), + microserviceUuid: microserviceUuid || null, + iofogUuid: fogUuid || null + } + userWs.send(this.encodeMessage(timeoutMsg), { binary: true }) + userWs.close(1008, 'Timeout waiting for agent connection') + } + } catch (error) { + logger.warn('Failed to close log session on pending timeout:' + error.message) + } + } + + async _detachExecSessionLocal (sessionId) { + const session = this.execSessionManager.getExecSession(sessionId) + if (session) { + this._clearPendingPairingTimer(session) + if (session.metricsActive) { + recordExecSessionActive(-1) + 
session.metricsActive = false + } + } + this.execSessionManager.detachLocalExecSession(sessionId) + await this.relayTransport.cleanup(sessionId) + .catch((error) => { + logger.warn('[RELAY] Failed to cleanup exec relay bridge during local detach', { + sessionId, + error: error.message + }) + }) + } + + async _detachLogSessionLocal (sessionId) { + const session = this.logSessionManager.getLogSession(sessionId) + if (session) { + this._clearPendingPairingTimer(session) + if (session.metricsActive) { + recordLogSessionActive(-1) + session.metricsActive = false + } + } + this.logBackpressureNotified.delete(sessionId) + this.logSessionManager.detachLocalLogSession(sessionId) + await this.relayTransport.cleanupLogSession(sessionId) + .catch((error) => { + logger.warn('[RELAY] Failed to cleanup log relay bridge during local detach', { + sessionId, + error: error.message + }) }) + logger.info('Log session local detach complete:' + JSON.stringify({ + sessionId, + microserviceUuid: session ? session.microserviceUuid || null : null, + fogUuid: session ? 
session.fogUuid || null : null + })) + } - processErrorHandlersRegistered = true + async _handleAgentExecPartialDisconnect (sessionId, currentSession, fog) { + currentSession.agent = null + currentSession.activationSent = false + currentSession.remoteAgentPaired = false + currentSession.remoteUserPaired = false + currentSession.lastActivity = Date.now() + + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + await MicroserviceExecSessionManager.update( + { sessionId }, + { agentConnected: false }, + closeTransaction + ) + await ChangeTrackingService.update( + fog.uuid, + ChangeTrackingService.events.microserviceExecSessions, + closeTransaction + ) + })() + + const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) + if (relayEnabled) { + try { + const closeMsg = { + type: MESSAGE_TYPES.CLOSE, + execId: sessionId, + sessionId, + microserviceUuid: currentSession.microserviceUuid, + timestamp: Date.now(), + data: Buffer.from('Agent closed connection') + } + const encoded = this.encodeMessage(closeMsg) + await this.relayTransport.publishToUser(sessionId, encoded, { messageType: MESSAGE_TYPES.CLOSE }) + } catch (error) { + logger.error('[WS-CLOSE] Failed to send CLOSE to user via relay after agent exec disconnect', { + sessionId, + error: error.message + }) + } + } else if (currentSession.user && currentSession.user.readyState === WebSocket.OPEN) { + currentSession.user.close(1000, 'Agent closed connection') } - } - getLogConcurrencyLimit () { - return this.sessionConfig.logMaxConcurrentPerResource || 3 + if (!currentSession.user) { + await this._detachExecSessionLocal(sessionId) + } } - getExecConcurrencyLimit () { - return this.sessionConfig.execMaxConcurrentPerResource || 3 - } + async _handleUserExecPartialDisconnect (sessionId, currentSession, microserviceUuid) { + currentSession.user = null + currentSession.remoteAgentPaired = false + currentSession.lastActivity = Date.now() - getLogTailMaxLines () { - return 
this.sessionConfig.logTailMaxLines || 5000 - } + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + await MicroserviceExecSessionManager.update( + { sessionId }, + { userConnected: false }, + closeTransaction + ) + const microservice = await MicroserviceManager.findOne( + { uuid: microserviceUuid }, + closeTransaction + ) + if (microservice) { + const fog = await FogManager.findOne({ uuid: microservice.iofogUuid }, closeTransaction) + if (fog) { + await ChangeTrackingService.update( + fog.uuid, + ChangeTrackingService.events.microserviceExecSessions, + closeTransaction + ) + } + } + })() - getExecPendingTimeoutMs () { - return this.sessionConfig.execPendingTimeoutMs || 60000 - } + const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) + if (relayEnabled) { + try { + const closeMsg = { + type: MESSAGE_TYPES.CLOSE, + execId: sessionId, + sessionId, + microserviceUuid: currentSession.microserviceUuid, + timestamp: Date.now(), + data: Buffer.from('User closed connection') + } + const encoded = this.encodeMessage(closeMsg) + await this.relayTransport.publishToAgent(sessionId, encoded, { messageType: MESSAGE_TYPES.CLOSE }) + } catch (error) { + logger.error('[WS-CLOSE] Failed to send CLOSE to agent via relay after user exec disconnect', { + sessionId, + error: error.message + }) + } + } else if (currentSession.agent && currentSession.agent.readyState === WebSocket.OPEN) { + currentSession.agent.close(1000, 'User closed connection') + } - getLogPendingTimeoutMs () { - return this.sessionConfig.logPendingTimeoutMs || 120000 + if (!currentSession.agent) { + await this._detachExecSessionLocal(sessionId) + } } - getDrainTimeoutMs () { - return this.sessionConfig.drainTimeoutMs || 30000 - } + async _handleAgentLogPartialDisconnect (sessionId, session, { microserviceUuid, iofogUuid, logStatus }) { + session.agent = null + session.remoteAgentPaired = false + session.remoteUserPaired = false + session.lastActivity = Date.now() - 
isCrossReplicaSession (session) { - return !!(session && (!session.agent || !session.user)) - } + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + if (microserviceUuid) { + await MicroserviceLogStatusManager.update( + { sessionId }, + { agentConnected: false }, + closeTransaction + ) + } else if (iofogUuid) { + await FogLogStatusManager.update( + { sessionId }, + { agentConnected: false }, + closeTransaction + ) + } - async requireRelayForCrossReplica (ws) { - if (this.haConfig.failFastOnRouterUnavailable === false) { - return true - } - const available = await this.relayTransport.isAvailable() - if (!available) { - logger.warn('[RELAY] Relay backend unavailable for cross-replica session', { - transport: this.relayTransport.getTransport() - }) - if (ws && ws.readyState === WebSocket.OPEN) { - ws.close(RELAY_UNAVAILABLE_CLOSE_CODE, RELAY_UNAVAILABLE_CLOSE_REASON) + let fogUuidForTracking = iofogUuid || logStatus.iofogUuid + if (!fogUuidForTracking && logStatus.microserviceUuid) { + const microservice = await MicroserviceManager.findOne( + { uuid: logStatus.microserviceUuid }, + closeTransaction + ) + fogUuidForTracking = microservice ? microservice.iofogUuid : null } - return false - } - return true - } - _scheduleRelaySetupAfterCommit (label, setupFn) { - setImmediate(async () => { - try { - await setupFn() - } catch (error) { - logger.error(`Failed to ${label}:` + JSON.stringify({ - error: error.message, - stack: error.stack - })) + if (fogUuidForTracking) { + await ChangeTrackingService.update( + fogUuidForTracking, + iofogUuid ? 
ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, + closeTransaction + ) } - }) - } + })() - async _cleanupLogSessionInTransaction (sessionId) { - await TransactionDecorator.generateTransaction(async (transaction) => { - await this.cleanupLogSession(sessionId, transaction) - }, { label: 'ws.log.cleanup' })() - } + const relayEnabled = this.relayTransport.shouldUseRelayForLogs(sessionId) + if (relayEnabled) { + await this._notifyLogUserViaRelay(sessionId, { + microserviceUuid: session.microserviceUuid, + fogUuid: session.fogUuid, + message: LOG_AGENT_DISCONNECTED_NOTICE + }) + } - async _cleanupExecSessionInTransaction (sessionId) { - await TransactionDecorator.generateTransaction(async (transaction) => { - await this.cleanupExecSession(sessionId, transaction) - }, { label: 'ws.exec.cleanup' })() + if (!session.user) { + await this._detachLogSessionLocal(sessionId) + } } async countLogSessionsInDb (microserviceUuid, fogUuid, transaction) { @@ -890,6 +1707,7 @@ class WebSocketServer { ) execSession.metricsActive = true recordExecSessionActive(1) + this._startExecPendingPairingMetrics(execSession) const activationMsg = { type: MESSAGE_TYPES.ACTIVATION, @@ -927,41 +1745,7 @@ class WebSocketServer { () => this.setupExecMessageForwarding(sessionId) ) - const EXEC_PENDING_TIMEOUT = this.getExecPendingTimeoutMs() - const pendingTimer = setTimeout(async () => { - const session = this.execSessionManager.getExecSession(sessionId) - if (!session || session.agent) { - return - } - logger.warn('Exec session pending timeout:' + JSON.stringify({ - sessionId, - microserviceUuid, - timeout: EXEC_PENDING_TIMEOUT - })) - try { - if (ws.readyState === WebSocket.OPEN) { - const timeoutMsg = { - type: MESSAGE_TYPES.STDERR, - data: Buffer.from('Timeout waiting for agent connection.\n'), - sessionId, - microserviceUuid, - execId: sessionId, - timestamp: Date.now() - } - ws.send(this.encodeMessage(timeoutMsg), { binary: true }) - ws.close(1008, 'Timeout 
waiting for agent connection') - } - } catch (error) { - logger.warn('Failed to close exec session on pending timeout:' + error.message) - } - try { - await TransactionDecorator.generateTransaction(async (timeoutTransaction) => { - await this.cleanupExecSession(sessionId, timeoutTransaction) - })() - } catch (error) { - logger.error('Failed to remove exec session after pending timeout:' + error.message) - } - }, EXEC_PENDING_TIMEOUT) + this._scheduleExecPendingPairingTimeout(sessionId, ws, microserviceUuid) setImmediate(async () => { try { @@ -983,16 +1767,36 @@ class WebSocketServer { }) ws.on('close', async (code, reason) => { - clearTimeout(pendingTimer) const session = this.execSessionManager.getExecSession(sessionId) if (session) { + this._clearPendingPairingTimer(session) + if (session.pairingMetricsStarted && !session.pairingCompleted) { + this._abortPendingPairingMetrics(session) + } session.user = null session.lastActivity = Date.now() try { - await TransactionDecorator.generateTransaction(async (closeTransaction) => { - await this.cleanupExecSession(sessionId, closeTransaction) - })() + let agentStillConnected = session.agent != null || session.remoteAgentPaired + if (!agentStillConnected) { + try { + agentStillConnected = await transactionRunner.runInTransaction( + (tx) => this._checkExecAgentPairedInDb(sessionId, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.exec.user-disconnect-db-check' } + ) + } catch (error) { + logger.warn('Exec user disconnect DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + } + + if (agentStillConnected) { + await this._handleUserExecPartialDisconnect(sessionId, session, microserviceUuid) + } else { + await this._cleanupExecSessionInTransaction(sessionId) + } } catch (err) { logger.error('Failed to cleanup exec session on user disconnect:' + JSON.stringify({ error: err.message, @@ -1086,25 +1890,6 @@ class WebSocketServer { () => this.setupExecMessageForwarding(sessionId) ) - if 
(session.user && session.user.readyState === WebSocket.OPEN) { - try { - const readyMsg = { - type: MESSAGE_TYPES.STDERR, - data: Buffer.from('Agent connected. Interactive exec is ready.\n'), - sessionId, - microserviceUuid, - execId: sessionId, - timestamp: Date.now() - } - session.user.send(this.encodeMessage(readyMsg), { binary: true }) - } catch (error) { - logger.warn('Failed to notify user that exec agent connected:' + JSON.stringify({ - sessionId, - error: error.message - })) - } - } - this.scheduleAgentExecConnectEvent(req, microserviceUuid) setImmediate(async () => { @@ -1141,51 +1926,13 @@ class WebSocketServer { ws.on('close', async (code, reason) => { const currentSession = this.execSessionManager.getExecSession(sessionId) if (currentSession) { - currentSession.agent = null - currentSession.lastActivity = Date.now() - + const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) try { - await TransactionDecorator.generateTransaction(async (closeTransaction) => { - await MicroserviceExecSessionManager.update( - { sessionId }, - { agentConnected: false }, - closeTransaction - ) - - const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) - - if (!currentSession.user) { - await this.cleanupExecSession(sessionId, closeTransaction) - } else { - if (relayEnabled) { - try { - const closeMsg = { - type: MESSAGE_TYPES.CLOSE, - execId: sessionId, - sessionId, - microserviceUuid: currentSession.microserviceUuid, - timestamp: Date.now(), - data: Buffer.from('Agent closed connection') - } - const encoded = this.encodeMessage(closeMsg) - await this.relayTransport.publishToUser(sessionId, encoded, { messageType: MESSAGE_TYPES.CLOSE }) - } catch (error) { - logger.error('[WS-CLOSE] Failed to send CLOSE to user via queue after agent exec disconnect', { - sessionId, - error: error.message - }) - } - } else if (currentSession.user.readyState === WebSocket.OPEN) { - currentSession.user.close(1000, 'Agent closed connection') - } - - await 
ChangeTrackingService.update( - fog.uuid, - ChangeTrackingService.events.microserviceExecSessions, - closeTransaction - ) - } - })() + if (currentSession.user != null || relayEnabled) { + await this._handleAgentExecPartialDisconnect(sessionId, currentSession, fog) + } else { + await this._cleanupExecSessionInTransaction(sessionId) + } } catch (err) { logger.error('Failed to handle agent exec disconnect:' + JSON.stringify({ sessionId, @@ -1254,13 +2001,16 @@ class WebSocketServer { // } async sendExecActivationToExecSession (session, sessionId) { - if (!session.user || !session.agent) { - return false - } if (session.activationSent) { return true } + const hasLocalAgent = session.agent && session.agent.readyState === WebSocket.OPEN + const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) + if (!hasLocalAgent && !relayEnabled) { + return false + } + const activationMsg = { type: MESSAGE_TYPES.ACTIVATION, data: Buffer.from(JSON.stringify({ @@ -1469,9 +2219,7 @@ class WebSocketServer { for (const sessionId of execSessionIds) { cleanupTasks.push( - TransactionDecorator.generateTransaction(async (tx) => { - await this.cleanupExecSession(sessionId, tx) - })().catch((error) => { + this._cleanupExecSessionInTransaction(sessionId).catch((error) => { logger.warn('[WS-DRAIN] Exec session cleanup failed', { sessionId, error: error.message }) }) ) @@ -1479,9 +2227,7 @@ class WebSocketServer { for (const sessionId of logSessionIds) { cleanupTasks.push( - TransactionDecorator.generateTransaction(async (tx) => { - await this.cleanupLogSession(sessionId, tx) - })().catch((error) => { + this._cleanupLogSessionInTransaction(sessionId).catch((error) => { logger.warn('[WS-DRAIN] Log session cleanup failed', { sessionId, error: error.message }) }) ) @@ -1919,6 +2665,7 @@ class WebSocketServer { ) logSession.metricsActive = true recordLogSessionActive(1) + this._startLogPendingPairingMetrics(logSession) // 7. 
Send sessionId to user (MessagePack encoded) const sessionInfoMsg = { @@ -1961,43 +2708,7 @@ class WebSocketServer { () => this.setupLogMessageForwarding(sessionId) ) - // Pending timeout: close if agent does not connect within logPendingTimeoutMs - const LOG_PENDING_TIMEOUT = this.getLogPendingTimeoutMs() - const pendingTimer = setTimeout(async () => { - const session = this.logSessionManager.getLogSession(sessionId) - if (!session || session.agent) { - return - } - logger.warn('Log session pending timeout:' + JSON.stringify({ - sessionId, - microserviceUuid, - fogUuid, - timeout: LOG_PENDING_TIMEOUT - })) - try { - if (ws.readyState === WebSocket.OPEN) { - const timeoutMsg = { - type: MESSAGE_TYPES.LOG_LINE, - data: Buffer.from('Timeout waiting for agent connection.\n'), - sessionId, - timestamp: Date.now(), - microserviceUuid: microserviceUuid || null, - iofogUuid: fogUuid || null - } - ws.send(this.encodeMessage(timeoutMsg), { binary: true }) - ws.close(1008, 'Timeout waiting for agent connection') - } - } catch (error) { - logger.warn('Failed to close log session on pending timeout:' + error.message) - } - try { - await TransactionDecorator.generateTransaction(async (timeoutTransaction) => { - await this.logSessionManager.removeLogSession(sessionId, timeoutTransaction) - })() - } catch (error) { - logger.error('Failed to remove log session after pending timeout:' + error.message) - } - }, LOG_PENDING_TIMEOUT) + this._scheduleLogPendingPairingTimeout(sessionId, ws, microserviceUuid, fogUuid) // 10. 
Record WebSocket connection event (non-blocking) setImmediate(async () => { @@ -2022,31 +2733,48 @@ class WebSocketServer { // Handle user disconnect ws.on('close', async (code, reason) => { - clearTimeout(pendingTimer) const session = this.logSessionManager.getLogSession(sessionId) if (session) { + this._clearPendingPairingTimer(session) + if (session.pairingMetricsStarted && !session.pairingCompleted) { + this._abortPendingPairingMetrics(session) + } + const agentStillConnected = session.agent != null || session.remoteAgentPaired session.user = null session.lastActivity = Date.now() try { - await TransactionDecorator.generateTransaction(async (closeTransaction) => { - if (microserviceUuid) { - await MicroserviceLogStatusManager.update( - { sessionId }, - { userConnected: false }, - closeTransaction - ) - } else if (fogUuid) { - await FogLogStatusManager.update( - { sessionId }, - { userConnected: false }, - closeTransaction + let agentConnected = agentStillConnected + if (!agentConnected) { + try { + agentConnected = await transactionRunner.runInTransaction( + (tx) => this._checkLogAgentPairedInDb(sessionId, microserviceUuid, fogUuid, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.log.user-disconnect-db-check' } ) + } catch (error) { + logger.warn('Log user disconnect DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) } + } + + if (agentConnected) { + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + if (microserviceUuid) { + await MicroserviceLogStatusManager.update( + { sessionId }, + { userConnected: false }, + closeTransaction + ) + } else if (fogUuid) { + await FogLogStatusManager.update( + { sessionId }, + { userConnected: false }, + closeTransaction + ) + } - if (!session.agent) { - await this.logSessionManager.removeLogSession(sessionId, closeTransaction) - } else { const fogForTracking = await FogManager.findOne({ uuid: fogUuid || (await MicroserviceManager.findOne({ uuid: microserviceUuid }, 
closeTransaction)).iofogUuid }, closeTransaction) @@ -2055,8 +2783,26 @@ class WebSocketServer { fogUuid ? ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, closeTransaction ) + })() + logger.info('Log session user disconnected (agent still connected):' + JSON.stringify({ + sessionId, + microserviceUuid: microserviceUuid || null, + fogUuid: fogUuid || null, + closeCode: code + })) + session.remoteAgentPaired = false + if (!session.agent) { + await this._detachLogSessionLocal(sessionId) } - })() + } else { + logger.info('Log session user disconnected (full cleanup):' + JSON.stringify({ + sessionId, + microserviceUuid: microserviceUuid || null, + fogUuid: fogUuid || null, + closeCode: code + })) + await this._cleanupLogSessionInTransaction(sessionId) + } } catch (err) { logger.error('Failed to cleanup log session on user disconnect:' + JSON.stringify({ error: err.message, @@ -2236,30 +2982,9 @@ class WebSocketServer { } ws.send(this.encodeMessage(configMsg), { binary: true }) - // 7. Notify user that agent has connected and streaming has started + // 7. Notify user when agent connects (same-replica or relay in setupLogMessageForwarding) if (session.user && session.user.readyState === WebSocket.OPEN) { - try { - const agentConnectedMsg = { - type: MESSAGE_TYPES.LOG_START, - data: Buffer.from(JSON.stringify({ - sessionId, - message: 'Agent connected. Log streaming started.\n' - })), - sessionId, - timestamp: Date.now() - } - session.user.send(this.encodeMessage(agentConnectedMsg), { binary: true }) - logger.info('Notified user that agent connected for log session:' + JSON.stringify({ - sessionId, - microserviceUuid: logStatus.microserviceUuid, - iofogUuid: logStatus.iofogUuid - })) - } catch (error) { - logger.warn('Failed to notify user that agent connected:' + JSON.stringify({ - error: error.message, - sessionId - })) - } + this._markLogAgentPaired(sessionId, { notifyUser: true, source: 'same-replica' }) } // 8. 
Relay setup after DB transaction commits (NATS hub lookup uses background writes). @@ -2305,38 +3030,33 @@ class WebSocketServer { ws.on('close', async (code, reason) => { const session = this.logSessionManager.getLogSession(sessionId) if (session) { - session.agent = null - session.lastActivity = Date.now() + const relayEnabled = this.relayTransport.shouldUseRelayForLogs(sessionId) + const partialDisconnect = session.user != null || relayEnabled try { - await TransactionDecorator.generateTransaction(async (closeTransaction) => { - if (microserviceUuid) { - await MicroserviceLogStatusManager.update( - { sessionId }, - { agentConnected: false }, - closeTransaction - ) - } else if (iofogUuid) { - await FogLogStatusManager.update( - { sessionId }, - { agentConnected: false }, - closeTransaction - ) - } - - if (!session.user) { - await this.logSessionManager.removeLogSession(sessionId, closeTransaction) - } else { - const fog = await FogManager.findOne({ - uuid: iofogUuid || logStatus.iofogUuid || (await MicroserviceManager.findOne({ uuid: logStatus.microserviceUuid }, closeTransaction)).iofogUuid - }, closeTransaction) - await ChangeTrackingService.update( - fog.uuid, - iofogUuid ? 
ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, - closeTransaction - ) - } - })() + if (partialDisconnect) { + logger.info('Log session agent disconnected (partial detach):' + JSON.stringify({ + sessionId, + microserviceUuid: microserviceUuid || null, + fogUuid: iofogUuid || null, + userConnected: session.user != null, + relayEnabled, + closeCode: code + })) + await this._handleAgentLogPartialDisconnect(sessionId, session, { + microserviceUuid, + iofogUuid, + logStatus + }) + } else { + logger.info('Log session agent disconnected (full cleanup):' + JSON.stringify({ + sessionId, + microserviceUuid: microserviceUuid || null, + fogUuid: iofogUuid || null, + closeCode: code + })) + await this._cleanupLogSessionInTransaction(sessionId) + } } catch (err) { logger.error('Failed to cleanup log session on agent disconnect:' + JSON.stringify({ error: err.message, @@ -2399,6 +3119,24 @@ class WebSocketServer { this._cleanupLogSessionInTransaction(closedSessionId) }) + const relayEnabled = this.relayTransport.shouldUseRelayForLogs(sessionId) + if (session.user && !session.agent && relayEnabled) { + this._registerLogUserRelayPairingHook(sessionId) + } + if (session.agent && !session.user && relayEnabled) { + const notified = await this._notifyLogUserViaRelay(sessionId, { + microserviceUuid: session.microserviceUuid, + fogUuid: session.fogUuid, + message: LOG_AGENT_READY_NOTICE + }) + if (notified) { + this._markLogUserPaired(sessionId, { source: 'relay-notify' }) + } + } + if (session.user && session.agent) { + this._markLogAgentPaired(sessionId, { notifyUser: false, source: 'same-replica-setup' }) + } + // ONLY agent → user forwarding (unidirectional, one-to-one) // All messages from agent are MessagePack encoded (binary) if (session.agent) { @@ -2440,6 +3178,8 @@ class WebSocketServer { dataLength: msg.data ? 
msg.data.length : 0 })) + session.lastActivity = Date.now() + if (msg.type === MESSAGE_TYPES.LOG_LINE) { // Forward to user (one-to-one, like exec sessions) await this.forwardLogToUser(sessionId, buffer) @@ -2498,6 +3238,8 @@ class WebSocketServer { return } + session.lastActivity = Date.now() + // Buffer is already MessagePack encoded from agent // Following exec session pattern: Use queue for ALL scenarios (single and multi-replica) // One-to-one forwarding (agent → user) via queue @@ -2550,12 +3292,20 @@ class WebSocketServer { async cleanupLogSession (sessionId, transaction) { const session = this.logSessionManager.getLogSession(sessionId) + if (session) { + this._clearPendingPairingTimer(session) + } if (session && session.metricsActive) { recordLogSessionActive(-1) } this.logBackpressureNotified.delete(sessionId) await this.logSessionManager.removeLogSession(sessionId, transaction) await this.relayTransport.cleanupLogSession(sessionId) + logger.info('Log session cleanup complete:' + JSON.stringify({ + sessionId, + microserviceUuid: session ? session.microserviceUuid || null : null, + fogUuid: session ? 
session.fogUuid || null : null + })) } async setupExecMessageForwarding (sessionId) { @@ -2612,6 +3362,28 @@ class WebSocketServer { }) } + const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) + + if (user && !agent && relayEnabled) { + this._registerExecUserRelayPairingHook(sessionId) + } + + if (agent && !user && relayEnabled) { + this._registerExecAgentRelayActivityHook(sessionId) + const activated = await this._sendExecActivationViaRelay(sessionId, session.microserviceUuid) + if (!activated) { + logger.error('[RELAY] Cross-replica exec activation failed on agent pod', { + sessionId, + microserviceUuid: session.microserviceUuid + }) + return + } + const notified = await this._notifyExecUserViaRelay(sessionId, session.microserviceUuid) + if (notified) { + this._markExecUserPaired(sessionId, { source: 'relay-notify' }) + } + } + if (user && agent) { const activated = await this.sendExecActivationToExecSession(session, sessionId) if (!activated) { @@ -2621,6 +3393,7 @@ class WebSocketServer { }) return } + this._markExecAgentPaired(sessionId, { notifyUser: true, source: 'same-replica' }) } if (user) { @@ -2646,6 +3419,8 @@ class WebSocketServer { if (!sent && this.relayTransport.shouldUseRelay(execId)) { logger.error('[RELAY] Exec relay publish failed; closing session', { sessionId: execId }) await this._cleanupExecSessionInTransaction(execId) + } else { + session.lastActivity = Date.now() } return } @@ -2709,6 +3484,8 @@ class WebSocketServer { if (!sent && this.relayTransport.shouldUseRelay(execId)) { logger.error('[RELAY] Exec relay publish failed; closing session', { sessionId: execId }) await this._cleanupExecSessionInTransaction(execId) + } else { + session.lastActivity = Date.now() } } catch (error) { logger.error('[RELAY] Failed to process exec user message:' + JSON.stringify({ @@ -2752,6 +3529,7 @@ class WebSocketServer { if (relayEnabled) { try { await this.relayTransport.publishToUser(execId, buffer) + session.lastActivity = Date.now() } 
catch (error) { logger.error('[RELAY] Exec relay publish to user failed; closing session', { sessionId: execId, @@ -2771,6 +3549,7 @@ class WebSocketServer { timestamp: Date.now() } session.user.send(this.encodeMessage(userMsg), { binary: true }) + session.lastActivity = Date.now() } } else if (msg.type === MESSAGE_TYPES.CONTROL) { session.user.send(data, { binary: true }) @@ -2795,6 +3574,9 @@ class WebSocketServer { async cleanupExecSession (sessionId, transaction) { const session = this.execSessionManager.getExecSession(sessionId) + if (session) { + this._clearPendingPairingTimer(session) + } if (session && session.metricsActive) { recordExecSessionActive(-1) session.metricsActive = false @@ -2825,6 +3607,8 @@ class WebSocketServer { } } + await this._notifyExecRemotePeerClose(sessionId, session, 'Session closed') + await this.execSessionManager.removeExecSession(sessionId, transaction) await this.relayTransport.cleanup(sessionId) .catch(error => { diff --git a/test/src/services/nats-relay-transport.test.js b/test/src/services/nats-relay-transport.test.js index 2c872ad1..638c5c3d 100644 --- a/test/src/services/nats-relay-transport.test.js +++ b/test/src/services/nats-relay-transport.test.js @@ -2,6 +2,7 @@ const { expect } = require('chai') const sinon = require('sinon') const msgpack = require('@msgpack/msgpack') const WebSocket = require('ws') +const { headers: natsHeaders } = require('@nats-io/transport-node') const { NatsRelayTransportImpl, @@ -214,4 +215,56 @@ describe('NatsRelayTransportImpl', () => { await expect(slowTransport.publishToAgent(execId, Buffer.from('x'))) .to.be.rejectedWith(/not flushed within 20ms/) }) + + it('CLOSE relay closes open socket without invoking cleanupCallback', async () => { + const userWs = createMockWebSocket() + const cleanup = sinon.stub().resolves() + + await transport.enableForSession({ execId, user: userWs }, cleanup) + + const hdrs = natsHeaders() + hdrs.set('messageType', '4') + nc.publish(execUserSubject(execId), 
Buffer.from(''), { headers: hdrs }) + + await new Promise((resolve) => setImmediate(resolve)) + await new Promise((resolve) => setImmediate(resolve)) + + expect(userWs.close).to.have.been.calledOnce + expect(cleanup).to.not.have.been.called + }) + + it('CLOSE ack relay does not invoke cleanupCallback', async () => { + const userWs = createMockWebSocket() + const cleanup = sinon.stub().resolves() + + await transport.enableForSession({ execId, user: userWs }, cleanup) + + const hdrs = natsHeaders() + hdrs.set('messageType', '4') + hdrs.set('closeAck', 'true') + nc.publish(execUserSubject(execId), Buffer.from(''), { headers: hdrs }) + + await new Promise((resolve) => setImmediate(resolve)) + await new Promise((resolve) => setImmediate(resolve)) + + expect(cleanup).to.not.have.been.called + expect(userWs.close).to.not.have.been.called + }) + + it('CLOSE relay invokes cleanupCallback when socket is already closed', async () => { + const userWs = createMockWebSocket() + userWs.readyState = WebSocket.CLOSED + const cleanup = sinon.stub().resolves() + + await transport.enableForSession({ execId, user: userWs }, cleanup) + + const hdrs = natsHeaders() + hdrs.set('messageType', '4') + nc.publish(execUserSubject(execId), Buffer.from(''), { headers: hdrs }) + + await new Promise((resolve) => setImmediate(resolve)) + await new Promise((resolve) => setImmediate(resolve)) + + expect(cleanup).to.have.been.calledOnceWith(execId) + }) }) diff --git a/test/src/websocket/ws-cross-replica-split.test.js b/test/src/websocket/ws-cross-replica-split.test.js new file mode 100644 index 00000000..8409b6fc --- /dev/null +++ b/test/src/websocket/ws-cross-replica-split.test.js @@ -0,0 +1,443 @@ +const { expect } = require('chai') +const sinon = require('sinon') +const WebSocket = require('ws') + +const WebSocketServerClass = require('../../../src/websocket/server') +const MicroserviceExecSessionManager = require('../../../src/data/managers/microservice-exec-session-manager') +const 
MicroserviceLogStatusManager = require('../../../src/data/managers/microservice-log-status-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const FogManager = require('../../../src/data/managers/iofog-manager') +const ChangeTrackingService = require('../../../src/services/change-tracking-service') +const AppHelper = require('../../../src/helpers/app-helper') +const EventService = require('../../../src/services/event-service') +const { + MESSAGE_TYPES, + createMockWebSocket, + createMockRequest, + buildExecFrame, + decodeExecMessage, + createMockNatsRelayTransport, + resetWebSocketServerSingleton, + newTestIds, + waitForSent, + delay +} = require('../../support/ws-session-harness') +const { resetTransportForTests } = require('../../../src/services/ws-relay-transport-factory') + +function lastSent (ws) { + return ws._sentMessages[ws._sentMessages.length - 1].data +} + +function hasSentMessageType (ws, type) { + return ws._sentMessages.some((entry) => { + try { + return decodeExecMessage(entry.data).type === type + } catch (e) { + return false + } + }) +} + +function hasSentText (ws, needle) { + return ws._sentMessages.some((entry) => { + try { + const msg = decodeExecMessage(entry.data) + return msg.data && msg.data.toString().includes(needle) + } catch (e) { + return false + } + }) +} + +describe('WebSocket exec/log — split replica pairing', () => { + def('sandbox', () => sinon.createSandbox()) + def('ids', () => newTestIds()) + + let serverA + let serverB + let sharedRelay + let transaction + let execRow + + beforeEach(() => { + resetTransportForTests() + resetWebSocketServerSingleton(WebSocketServerClass) + + sharedRelay = createMockNatsRelayTransport() + serverA = new WebSocketServerClass() + serverB = new WebSocketServerClass() + serverA.relayTransport = sharedRelay + serverB.relayTransport = sharedRelay + serverA.sessionConfig.execPendingTimeoutMs = 500 + serverA.sessionConfig.logPendingTimeoutMs = 500 + 
serverB.sessionConfig.execPendingTimeoutMs = 500 + serverB.sessionConfig.logPendingTimeoutMs = 500 + + transaction = { fakeTransaction: true } + execRow = { + sessionId: $ids.sessionId, + microserviceUuid: $ids.microserviceUuid, + status: 'PENDING', + userConnected: true, + agentConnected: false + } + + $sandbox.stub(MicroserviceManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid }) + $sandbox.stub(FogManager, 'findOne').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(EventService, 'createWsConnectEvent').resolves() + $sandbox.stub(EventService, 'createWsDisconnectEvent').resolves() + $sandbox.stub(ChangeTrackingService, 'update').resolves() + $sandbox.stub(MicroserviceExecSessionManager, 'create').resolves() + $sandbox.stub(MicroserviceExecSessionManager, 'update').callsFake(async (_where, patch) => { + Object.assign(execRow, patch) + }) + $sandbox.stub(MicroserviceExecSessionManager, 'deleteBySessionId').resolves() + $sandbox.stub(MicroserviceExecSessionManager, 'findAll').resolves([]) + $sandbox.stub(MicroserviceExecSessionManager, 'findBySessionId').callsFake(async () => ({ ...execRow })) + $sandbox.stub(serverA, 'validateUserConnection').resolves({ uuid: $ids.microserviceUuid }) + $sandbox.stub(serverB, 'validateAgentExecConnection').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(AppHelper, 'generateUUID').returns($ids.sessionId) + $sandbox.stub(serverA, 'countExecSessionsInDb').resolves(0) + $sandbox.stub(serverB, 'countExecSessionsInDb').resolves(0) + }) + + afterEach(() => { + $sandbox.restore() + resetTransportForTests() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + it('relays ACTIVATION when each replica has its own relay bridge map', async () => { + serverA.relayTransport = createMockNatsRelayTransport() + serverB.relayTransport = createMockNatsRelayTransport() + + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const userReq = 
createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + userReq.headers.authorization = 'Bearer user-jwt' + + await serverA.handleUserExecConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + false, + transaction + ) + await delay(50) + + const agentReq = createMockRequest( + `/api/v3/agent/exec/microservice/${$ids.microserviceUuid}/${$ids.sessionId}`, + '127.0.0.2' + ) + agentReq.headers.authorization = 'Bearer fog-token' + await serverB.handleAgentExecConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + $ids.sessionId, + transaction + ) + await delay(50) + + expect(userWs.readyState).to.equal(WebSocket.OPEN) + expect(hasSentMessageType(agentWs, MESSAGE_TYPES.ACTIVATION)).to.equal(true) + expect(serverB.relayTransport.shouldUseRelay($ids.sessionId)).to.equal(true) + }) + + it('keeps exec user open when agent connects on replica B and relays ACTIVATION', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const userReq = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + userReq.headers.authorization = 'Bearer user-jwt' + + await serverA.handleUserExecConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + false, + transaction + ) + await delay(50) + + const agentReq = createMockRequest( + `/api/v3/agent/exec/microservice/${$ids.microserviceUuid}/${$ids.sessionId}`, + '127.0.0.2' + ) + agentReq.headers.authorization = 'Bearer fog-token' + await serverB.handleAgentExecConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + $ids.sessionId, + transaction + ) + await delay(50) + + expect(userWs.readyState).to.equal(WebSocket.OPEN) + expect(hasSentMessageType(agentWs, MESSAGE_TYPES.ACTIVATION)).to.equal(true) + expect(hasSentText(userWs, 'Interactive exec is ready')).to.equal(true) + + const stdinFrame = buildExecFrame( + MESSAGE_TYPES.STDIN, + $ids.sessionId, + 
$ids.microserviceUuid, + 'echo hi\n' + ) + userWs.emit('message', stdinFrame, true) + await waitForSent(agentWs, 1) + + const agentReceived = decodeExecMessage(lastSent(agentWs)) + expect(agentReceived.type).to.equal(MESSAGE_TYPES.STDIN) + }) + + it('uses DB fallback for exec pending timeout when agentConnected is true', async () => { + execRow.agentConnected = true + const userWs = createMockWebSocket() + const userReq = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + userReq.headers.authorization = 'Bearer user-jwt' + + await serverA.handleUserExecConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + false, + transaction + ) + + await delay(700) + expect(userWs.readyState).to.equal(WebSocket.OPEN) + const session = serverA.execSessionManager.getExecSession($ids.sessionId) + expect(session.remoteAgentPaired).to.equal(true) + }) + + it('relays log agent-ready LOG_LINE to user on replica A', async () => { + const logRow = { + sessionId: $ids.sessionId, + microserviceUuid: $ids.microserviceUuid, + tailConfig: JSON.stringify({ lines: 100, follow: true, since: null, until: null }), + agentConnected: false, + userConnected: true + } + + $sandbox.stub(MicroserviceLogStatusManager, 'create').resolves() + $sandbox.stub(MicroserviceLogStatusManager, 'update').callsFake(async (_where, patch) => { + Object.assign(logRow, patch) + }) + $sandbox.stub(MicroserviceLogStatusManager, 'delete').resolves() + $sandbox.stub(MicroserviceLogStatusManager, 'findOne').callsFake(async () => ({ ...logRow })) + $sandbox.stub(serverA, 'validateUserLogsConnection').resolves() + $sandbox.stub(serverB, 'validateAgentLogsConnection').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(serverA, 'countLogSessionsInDb').resolves(0) + + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const userReq = createMockRequest(`/api/v3/microservices/logs/${$ids.microserviceUuid}?tail=100`) + userReq.headers.authorization = 'Bearer 
user-jwt' + + await serverA.handleUserLogsConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + null, + false, + transaction + ) + await delay(50) + + const agentReq = createMockRequest( + `/api/v3/agent/logs/microservice/${$ids.microserviceUuid}/${$ids.sessionId}`, + '127.0.0.2' + ) + agentReq.headers.authorization = 'Bearer fog-token' + await serverB.handleAgentLogsConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + null, + $ids.sessionId, + transaction + ) + await delay(50) + + expect(userWs.readyState).to.equal(WebSocket.OPEN) + expect(hasSentMessageType(agentWs, MESSAGE_TYPES.LOG_START)).to.equal(true) + expect(hasSentText(userWs, 'Log streaming started')).to.equal(true) + expect(hasSentMessageType(userWs, MESSAGE_TYPES.LOG_LINE)).to.equal(true) + }) + + it('notifies user and preserves DB row when agent disconnects on replica B', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const userReq = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + userReq.headers.authorization = 'Bearer user-jwt' + + await serverA.handleUserExecConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + false, + transaction + ) + await delay(50) + + const agentReq = createMockRequest( + `/api/v3/agent/exec/microservice/${$ids.microserviceUuid}/${$ids.sessionId}`, + '127.0.0.2' + ) + agentReq.headers.authorization = 'Bearer fog-token' + await serverB.handleAgentExecConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + $ids.sessionId, + transaction + ) + await delay(50) + + execRow.agentConnected = true + MicroserviceExecSessionManager.deleteBySessionId.resetHistory() + + agentWs.close() + await delay(50) + + expect(MicroserviceExecSessionManager.deleteBySessionId).to.not.have.been.called + expect(execRow.agentConnected).to.equal(false) + expect(hasSentMessageType(userWs, 
MESSAGE_TYPES.CLOSE)).to.equal(true) + expect(serverB.execSessionManager.getExecSession($ids.sessionId)).to.equal(null) + expect(serverA.execSessionManager.getExecSession($ids.sessionId)).to.not.equal(null) + }) + + it('keeps paired exec user open past pending timeout window', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const userReq = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + userReq.headers.authorization = 'Bearer user-jwt' + + await serverA.handleUserExecConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + false, + transaction + ) + await delay(50) + + const agentReq = createMockRequest( + `/api/v3/agent/exec/microservice/${$ids.microserviceUuid}/${$ids.sessionId}`, + '127.0.0.2' + ) + agentReq.headers.authorization = 'Bearer fog-token' + await serverB.handleAgentExecConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + $ids.sessionId, + transaction + ) + await delay(100) + + expect(hasSentText(userWs, 'Interactive exec is ready')).to.equal(true) + + await delay(700) + expect(userWs.readyState).to.equal(WebSocket.OPEN) + }) + + it('keeps paired exec agent on replica B past pending timeout window', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const userReq = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + userReq.headers.authorization = 'Bearer user-jwt' + + await serverA.handleUserExecConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + false, + transaction + ) + await delay(50) + + const agentReq = createMockRequest( + `/api/v3/agent/exec/microservice/${$ids.microserviceUuid}/${$ids.sessionId}`, + '127.0.0.2' + ) + agentReq.headers.authorization = 'Bearer fog-token' + await serverB.handleAgentExecConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + $ids.sessionId, + transaction + ) + 
await delay(100) + + const agentSession = serverB.execSessionManager.getExecSession($ids.sessionId) + expect(agentSession.remoteUserPaired).to.equal(true) + + MicroserviceExecSessionManager.deleteBySessionId.resetHistory() + await delay(700) + + expect(agentWs.readyState).to.equal(WebSocket.OPEN) + expect(MicroserviceExecSessionManager.deleteBySessionId).to.not.have.been.called + expect(serverB.execSessionManager.getExecSession($ids.sessionId)).to.not.equal(null) + }) + + it('preserves DB row when user disconnects on replica A with agent on replica B', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const userReq = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + userReq.headers.authorization = 'Bearer user-jwt' + + await serverA.handleUserExecConnection( + userWs, + userReq, + 'Bearer user-jwt', + $ids.microserviceUuid, + false, + transaction + ) + await delay(50) + + const agentReq = createMockRequest( + `/api/v3/agent/exec/microservice/${$ids.microserviceUuid}/${$ids.sessionId}`, + '127.0.0.2' + ) + agentReq.headers.authorization = 'Bearer fog-token' + await serverB.handleAgentExecConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + $ids.sessionId, + transaction + ) + await delay(50) + + execRow.agentConnected = true + MicroserviceExecSessionManager.deleteBySessionId.resetHistory() + + userWs.close() + await delay(50) + + expect(MicroserviceExecSessionManager.deleteBySessionId).to.not.have.been.called + expect(execRow.userConnected).to.equal(false) + expect(execRow.agentConnected).to.equal(true) + expect(serverA.execSessionManager.getExecSession($ids.sessionId)).to.equal(null) + expect(serverB.execSessionManager.getExecSession($ids.sessionId)).to.not.equal(null) + expect(hasSentMessageType(agentWs, MESSAGE_TYPES.CLOSE)).to.equal(true) + }) +}) diff --git a/test/src/websocket/ws-session-cleanup.test.js b/test/src/websocket/ws-session-cleanup.test.js new file 
mode 100644 index 00000000..baca6799 --- /dev/null +++ b/test/src/websocket/ws-session-cleanup.test.js @@ -0,0 +1,73 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const WebSocketServerClass = require('../../../src/websocket/server') +const transactionRunner = require('../../../src/helpers/transaction-runner') +const { PRIORITY_BACKGROUND } = transactionRunner +const { resetWebSocketServerSingleton } = require('../../support/ws-session-harness') + +describe('WebSocket session cleanup dedupe', () => { + def('sandbox', () => sinon.createSandbox()) + + let wsServer + + beforeEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + }) + + afterEach(() => { + $sandbox.restore() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + it('dedupes concurrent exec cleanups for the same sessionId', async () => { + let resolveCleanup + const gate = new Promise((resolve) => { + resolveCleanup = resolve + }) + + $sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn, options) => { + expect(options).to.include({ priority: PRIORITY_BACKGROUND, label: 'ws.exec.cleanup' }) + await gate + return fn({ id: 'tx-exec' }) + }) + $sandbox.stub(wsServer, 'cleanupExecSession').resolves() + + const first = wsServer._cleanupExecSessionInTransaction('session-1') + const second = wsServer._cleanupExecSessionInTransaction('session-1') + + expect(transactionRunner.runInTransaction).to.have.been.calledOnce + + resolveCleanup() + await Promise.all([first, second]) + + expect(wsServer.cleanupExecSession).to.have.been.calledOnceWith('session-1', { id: 'tx-exec' }) + expect(wsServer._execCleanupInflight.has('session-1')).to.equal(false) + }) + + it('dedupes concurrent log cleanups for the same sessionId', async () => { + let resolveCleanup + const gate = new Promise((resolve) => { + resolveCleanup = resolve + }) + + $sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn, options) 
=> { + expect(options).to.include({ priority: PRIORITY_BACKGROUND, label: 'ws.log.cleanup' }) + await gate + return fn({ id: 'tx-log' }) + }) + $sandbox.stub(wsServer, 'cleanupLogSession').resolves() + + const first = wsServer._cleanupLogSessionInTransaction('log-session-1') + const second = wsServer._cleanupLogSessionInTransaction('log-session-1') + + expect(transactionRunner.runInTransaction).to.have.been.calledOnce + + resolveCleanup() + await Promise.all([first, second]) + + expect(wsServer.cleanupLogSession).to.have.been.calledOnceWith('log-session-1', { id: 'tx-log' }) + expect(wsServer._logCleanupInflight.has('log-session-1')).to.equal(false) + }) +}) diff --git a/test/src/websocket/ws-session-expiry.test.js b/test/src/websocket/ws-session-expiry.test.js new file mode 100644 index 00000000..f11f726f --- /dev/null +++ b/test/src/websocket/ws-session-expiry.test.js @@ -0,0 +1,160 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ExecSessionManager = require('../../../src/websocket/exec-session-manager') +const LogSessionManager = require('../../../src/websocket/log-session-manager') +const MicroserviceExecSessionManager = require('../../../src/data/managers/microservice-exec-session-manager') +const MicroserviceLogStatusManager = require('../../../src/data/managers/microservice-log-status-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const FogManager = require('../../../src/data/managers/iofog-manager') +const ChangeTrackingService = require('../../../src/services/change-tracking-service') + +describe('WebSocket session expiry — remoteAgentPaired / remoteUserPaired', () => { + def('sandbox', () => sinon.createSandbox()) + + beforeEach(() => { + $sandbox.stub(MicroserviceExecSessionManager, 'deleteBySessionId').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: 'fog-1' }) + $sandbox.stub(FogManager, 'findOne').resolves({ uuid: 'fog-1' }) + 
$sandbox.stub(ChangeTrackingService, 'update').resolves() + $sandbox.stub(MicroserviceLogStatusManager, 'delete').resolves() + }) + + afterEach(() => { + $sandbox.restore() + }) + + it('does not expire cross-replica paired exec user before maxDuration', async () => { + const manager = new ExecSessionManager({ + session: { + execPendingTimeoutMs: 60000, + execMaxDurationMs: 28800000, + cleanupInterval: 30000 + } + }) + manager.stopCleanupInterval() + + const now = Date.now() + manager.execSessions.set('exec-1', { + sessionId: 'exec-1', + microserviceUuid: 'ms-1', + user: { readyState: 1, close: () => {} }, + agent: null, + remoteAgentPaired: true, + createdAt: now - 120000, + lastActivity: now - 1000 + }) + + const expired = await manager.cleanupExpiredSessions({}) + expect(expired).to.equal(0) + expect(manager.execSessions.has('exec-1')).to.equal(true) + expect(MicroserviceExecSessionManager.deleteBySessionId).to.not.have.been.called + }) + + it('expires cross-replica paired exec user after maxDuration idle', async () => { + const manager = new ExecSessionManager({ + session: { + execPendingTimeoutMs: 60000, + execMaxDurationMs: 1000, + cleanupInterval: 30000 + } + }) + manager.stopCleanupInterval() + + const now = Date.now() + manager.execSessions.set('exec-1', { + sessionId: 'exec-1', + microserviceUuid: 'ms-1', + user: { readyState: 1, close: $sandbox.spy() }, + agent: null, + remoteAgentPaired: true, + createdAt: now - 5000, + lastActivity: now - 2000 + }) + + const expired = await manager.cleanupExpiredSessions({}) + expect(expired).to.equal(1) + expect(manager.execSessions.has('exec-1')).to.equal(false) + }) + + it('does not expire cross-replica paired log user before idleTimeout', async () => { + const manager = new LogSessionManager({ + session: { + logPendingTimeoutMs: 120000, + logIdleTimeoutMs: 7200000, + cleanupInterval: 30000 + } + }) + manager.stopCleanupInterval() + + const now = Date.now() + manager.logSessions.set('log-1', { + sessionId: 'log-1', + 
microserviceUuid: 'ms-1', + fogUuid: null, + user: { readyState: 1, close: () => {} }, + agent: null, + remoteAgentPaired: true, + createdAt: now - 180000, + lastActivity: now - 1000 + }) + + const expired = await manager.cleanupExpiredSessions({}) + expect(expired).to.equal(0) + expect(manager.logSessions.has('log-1')).to.equal(true) + }) + + it('does not expire cross-replica paired exec agent before maxDuration', async () => { + const manager = new ExecSessionManager({ + session: { + execPendingTimeoutMs: 60000, + execMaxDurationMs: 28800000, + cleanupInterval: 30000 + } + }) + manager.stopCleanupInterval() + + const now = Date.now() + manager.execSessions.set('exec-1', { + sessionId: 'exec-1', + microserviceUuid: 'ms-1', + user: null, + agent: { readyState: 1, close: () => {} }, + remoteUserPaired: true, + createdAt: now - 120000, + lastActivity: now - 1000 + }) + + const expired = await manager.cleanupExpiredSessions({}) + expect(expired).to.equal(0) + expect(manager.execSessions.has('exec-1')).to.equal(true) + expect(MicroserviceExecSessionManager.deleteBySessionId).to.not.have.been.called + }) + + it('does not expire cross-replica paired log agent before idleTimeout', async () => { + const manager = new LogSessionManager({ + session: { + logPendingTimeoutMs: 120000, + logIdleTimeoutMs: 7200000, + cleanupInterval: 30000 + } + }) + manager.stopCleanupInterval() + + const now = Date.now() + manager.logSessions.set('log-1', { + sessionId: 'log-1', + microserviceUuid: 'ms-1', + fogUuid: null, + user: null, + agent: { readyState: 1, close: () => {} }, + remoteUserPaired: true, + createdAt: now - 180000, + lastActivity: now - 1000 + }) + + const expired = await manager.cleanupExpiredSessions({}) + expect(expired).to.equal(0) + expect(manager.logSessions.has('log-1')).to.equal(true) + }) +}) diff --git a/test/support/ws-session-harness.js b/test/support/ws-session-harness.js index b2e0e26e..adf209bc 100644 --- a/test/support/ws-session-harness.js +++ 
b/test/support/ws-session-harness.js @@ -85,6 +85,37 @@ function buildExecFrame (type, execId, microserviceUuid, data) { }) } +function mergeExecBridgeSession (existing, incoming) { + if (!existing) { + return incoming + } + return { + ...existing, + ...incoming, + execId: incoming.execId || existing.execId, + sessionId: incoming.sessionId || existing.sessionId, + microserviceUuid: incoming.microserviceUuid || existing.microserviceUuid, + user: incoming.user || existing.user, + agent: incoming.agent || existing.agent + } +} + +function mergeLogBridgeSession (existing, incoming) { + if (!existing) { + return incoming + } + return { + ...existing, + ...incoming, + sessionId: incoming.sessionId || existing.sessionId, + microserviceUuid: incoming.microserviceUuid || existing.microserviceUuid, + fogUuid: incoming.fogUuid || existing.fogUuid, + user: incoming.user || existing.user, + agent: incoming.agent || existing.agent, + tailConfig: incoming.tailConfig || existing.tailConfig + } +} + /** * In-memory relay stub for cross-replica exec/log tests. 
* @param {'amqp'|'nats'} [transport='amqp'] @@ -110,7 +141,7 @@ function createMockRelayTransport (transport = 'amqp') { if (!execId) return false const existing = execBridges.get(execId) if (existing) { - existing.session = session + existing.session = mergeExecBridgeSession(existing.session, session) if (cleanupCallback) { existing.cleanupCallback = cleanupCallback } @@ -124,10 +155,34 @@ function createMockRelayTransport (transport = 'amqp') { return execBridges.has(execId) }, + setExecUserDeliveryHook (execId, hook) { + const bridge = execBridges.get(execId) + if (bridge) { + bridge.onUserRelayDelivery = hook + } + }, + + setExecAgentDeliveryHook (execId, hook) { + const bridge = execBridges.get(execId) + if (bridge) { + bridge.onAgentRelayDelivery = hook + } + }, + + setLogUserDeliveryHook (sessionId, hook) { + const bridge = logBridges.get(sessionId) + if (bridge) { + bridge.onUserRelayDelivery = hook + } + }, + async publishToAgent (execId, buffer) { const bridge = execBridges.get(execId) if (bridge && bridge.session.agent && bridge.session.agent.readyState === WebSocket.OPEN) { bridge.session.agent.send(buffer, { binary: true }) + if (bridge.onAgentRelayDelivery) { + bridge.onAgentRelayDelivery(buffer) + } } }, @@ -135,6 +190,9 @@ function createMockRelayTransport (transport = 'amqp') { const bridge = execBridges.get(execId) if (bridge && bridge.session.user && bridge.session.user.readyState === WebSocket.OPEN) { bridge.session.user.send(buffer, { binary: true }) + if (bridge.onUserRelayDelivery) { + bridge.onUserRelayDelivery(buffer) + } } }, @@ -144,7 +202,15 @@ function createMockRelayTransport (transport = 'amqp') { async enableForLogSession (session, cleanupCallback) { const sessionId = session.sessionId - logBridges.set(sessionId, { session, cleanupCallback }) + const existing = logBridges.get(sessionId) + if (existing) { + existing.session = mergeLogBridgeSession(existing.session, session) + if (cleanupCallback) { + existing.cleanupCallback = 
cleanupCallback + } + } else { + logBridges.set(sessionId, { session, cleanupCallback }) + } return true }, @@ -155,7 +221,10 @@ function createMockRelayTransport (transport = 'amqp') { async publishLogToUser (sessionId, buffer) { const bridge = logBridges.get(sessionId) if (bridge && bridge.session.user && bridge.session.user.readyState === WebSocket.OPEN) { - bridge.session.user.emit('message', buffer, true) + bridge.session.user.send(buffer, { binary: true }) + if (bridge.onUserRelayDelivery) { + bridge.onUserRelayDelivery(buffer) + } } }, From 8d7b70683a92c7257ecb0ac93c31802f7e3f6145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 2 Jul 2026 10:41:06 +0300 Subject: [PATCH 16/32] Document WebSocket HA relay session fixes in the unreleased changelog. Covers cleanup race fixes, cross-replica pairing, activation ordering, and split-session lifecycle behavior. --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09a1b399..3cfc6032 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -124,6 +124,9 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - **Agent fog-token auth hang (SQLite)** — `checkFogToken` updated `lastActive` via `FogManager.updateLastActive` without passing the open Sequelize transaction on a single-connection pool (`pool.max: 1`), deadlocking the write queue after provision when Edgelet first called JWT-authenticated routes (`PATCH /agent/config`, `GET /agent/registries`, etc.). - **WebSocket audit event logging (SQLite)** — `persistAuditEvent` (`PRIORITY_BACKGROUND`) reused a committed parent transaction from AsyncLocalStorage when `createWsConnectEvent` ran in `setImmediate` after the log-session handler committed, causing `commit has been called on this transaction` errors. Background `runInTransaction` on SQLite now always enqueues a fresh transaction. 
- **WebSocket log/exec session deadlock (SQLite)** — log and exec handlers awaited NATS relay setup inside the open interactive transaction; relay hub lookup enqueues a background transaction on the single SQLite connection and deadlocked. Relay setup now runs in `setImmediate` after DB work commits; relay cleanup callbacks open fresh transactions instead of capturing the handler transaction. +- **WebSocket exec/log session cleanup race (PostgreSQL / NATS relay)** — concurrent teardown paths (pending timeout + disconnect, NATS CLOSE + CLOSE ack, relay callback + `ws.on('close')`) reused one Sequelize transaction via AsyncLocalStorage, causing `commit has been called on this transaction` on session row delete. Exec and log cleanup are now deduplicated per `sessionId` and run in fresh background transactions; pending timeouts only close sockets, and relay CLOSE acks no longer trigger DB teardown. +- **WebSocket exec/log cross-replica pairing** — pending timeouts no longer require a local `session.agent`; user pods mark `remoteAgentPaired` via relay delivery hooks and DB fallback (`agentConnected`). Agent pods publish **ACTIVATION** (exec) and **LOG_LINE** user notifications via NATS/AMQP relay when the user is on another replica. Same-replica log “agent connected” notify uses **LOG_LINE** (not `LOG_START` + embedded message). **`ws_pending_pairings`** and **`ws_pairing_duration_ms`** metrics are recorded from user connect through pairing completion or timeout. Cross-replica paired sessions use **max/idle duration** (not pending timeout) in periodic cleanup; agent disconnect on an agent-only pod relays **CLOSE** (exec) or **LOG_LINE** (log) to the user pod and detaches local state without deleting the DB row. +- **WebSocket cross-replica exec activation** — `setupExecMessageForwarding` read `shouldUseRelay` before `enableForSession`, so agent-only pods skipped relay **ACTIVATION** and user notify on first connect (log setup was already correct).
Info logs added for log session user/agent disconnect, full cleanup, and local detach. - **Volume mount manager transaction propagation** — `VolumeMountingManager.findOne` / `findAll` passed `transaction` as a second Sequelize argument instead of inside the options object, so NATS fog reconcile could create a volume mount in an open transaction then fail to link it (`nats-server-conf-* not found`). Reads now honor the parent transaction like `BaseManager`. - **Volume mount service transaction propagation** — `VolumeMountService.linkVolumeMountEndpoint` / `unlinkVolumeMountEndpoint` passed `transaction` as a second Sequelize argument to `getFogs` / `addVolumeMount` / `removeVolumeMount` instead of inside the options object, causing NATS fog reconcile to hang when linking volume mounts after auth bootstrap. - **Fog platform reconcile stale errors** — `reconcileFogPrepare` clears `lastError` when entering `Progressing` so prior `SQLITE_BUSY` does not mask current reconcile state. From 153fc6e510ba3a8aa0410bceb49dc0f762118e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 09:34:29 +0300 Subject: [PATCH 17/32] Run NATS auth reissue in post-commit background transactions. Defer rule and application orchestration until after API commit, enqueue resolver reconcile only after reissue completes, and gate microservice natsAccess patches on explicit field presence. 
--- src/data/managers/reconcile-outbox-manager.js | 4 + src/helpers/reconcile-outbox-keys.js | 18 +- src/helpers/transaction-runner.js | 16 +- src/services/application-service.js | 25 ++- src/services/microservices-service.js | 22 +-- src/services/nats-auth-service.js | 158 ++++++++++++++---- test/src/data/reconcile-outbox.test.js | 2 +- .../src/helpers/reconcile-outbox-keys.test.js | 20 ++- .../services/microservices-service.test.js | 40 +++++ .../services/nats-auth-orchestration.test.js | 33 ++++ 10 files changed, 280 insertions(+), 58 deletions(-) create mode 100644 test/src/services/nats-auth-orchestration.test.js diff --git a/src/data/managers/reconcile-outbox-manager.js b/src/data/managers/reconcile-outbox-manager.js index 5b156402..78fd21f3 100644 --- a/src/data/managers/reconcile-outbox-manager.js +++ b/src/data/managers/reconcile-outbox-manager.js @@ -34,6 +34,10 @@ class ReconcileOutboxManager extends BaseManager { async _resolveExistingEnqueue (existing, kind, serializedPayload, transaction) { if (existing.processedAt == null) { + if (existing.payload !== serializedPayload) { + await this.update({ id: existing.id }, { payload: serializedPayload }, transaction) + return this.findOne({ id: existing.id }, transaction) + } return existing } return this._reopenProcessedRow(existing, kind, serializedPayload, transaction) diff --git a/src/helpers/reconcile-outbox-keys.js b/src/helpers/reconcile-outbox-keys.js index 43f53467..e7a01ff5 100644 --- a/src/helpers/reconcile-outbox-keys.js +++ b/src/helpers/reconcile-outbox-keys.js @@ -21,15 +21,27 @@ function buildNatsIdempotencyKey (payload = {}) { applicationId, accountRuleId, userRuleId, - fogUuids + fogUuids, + microserviceUuid, + mutationKind, + authGeneration } = payload + const scopeSuffix = [ + applicationId ?? 'null', + accountRuleId ?? 'null', + userRuleId ?? 'null', + microserviceUuid ?? 'null', + mutationKind ?? 'null', + authGeneration ?? 
'null' + ].join(':') + if (Array.isArray(fogUuids) && fogUuids.length > 0) { const sorted = [...fogUuids].sort().join(',') - return `nats:${reason}:${applicationId ?? 'null'}:${accountRuleId ?? 'null'}:${userRuleId ?? 'null'}:${sorted}` + return `nats:${reason}:${scopeSuffix}:${sorted}` } - return `nats:${reason}:${applicationId ?? 'null'}:${accountRuleId ?? 'null'}:${userRuleId ?? 'null'}` + return `nats:${reason}:${scopeSuffix}` } function buildIdempotencyKey (kind, payload = {}) { diff --git a/src/helpers/transaction-runner.js b/src/helpers/transaction-runner.js index 9005c533..a9717edf 100644 --- a/src/helpers/transaction-runner.js +++ b/src/helpers/transaction-runner.js @@ -216,6 +216,19 @@ async function runWithTransactionContext (transaction, priority, fn) { return activeTransactionStore.run({ transaction, priority: effectivePriority }, () => fn(transaction)) } +/** + * Defer work until after the current API tick so a committed ALS parent tx cannot + * be reused. Always runs fn inside a fresh PRIORITY_BACKGROUND transaction (R138). 
+ * + * @param {string} label - transaction-runner label for metrics/logging + * @param {Function} fn - async (transaction) => result + */ +function schedulePostCommitBackground (label, fn) { + setImmediate(async () => { + await runInTransaction(fn, { priority: PRIORITY_BACKGROUND, label }) + }) +} + function _resetQueueForTests () { interactiveLane.length = 0 backgroundLane.length = 0 @@ -236,5 +249,6 @@ module.exports = { getWriteQueueMaxDepth, isSqliteProvider, runInTransaction, - runWithTransactionContext + runWithTransactionContext, + schedulePostCommitBackground } diff --git a/src/services/application-service.js b/src/services/application-service.js index 4a2af20e..c55254e9 100644 --- a/src/services/application-service.js +++ b/src/services/application-service.js @@ -15,21 +15,34 @@ const NatsAccountRuleManager = require('../data/managers/nats-account-rule-manag const NatsRuleJwtValidation = require('../helpers/nats-rule-jwt-validation') const NatsAuthService = require('./nats-auth-service') const logger = require('../logger') +const { schedulePostCommitBackground } = require('../helpers/transaction-runner') const onlyUnique = (value, index, self) => self.indexOf(value) === index function _scheduleApplicationNatsOrchestration (applicationId, reason) { - setImmediate(async () => { + schedulePostCommitBackground(`app-nats-orchestration-${applicationId}`, async (transaction) => { try { logger.info(`Starting background app NATS orchestration for app ${applicationId}: ${reason}`) + const suppressOutbox = { triggerReconcile: false } if (reason === 'nats-access-disabled') { - await MicroserviceService.reconcileNatsForApplication(applicationId) - await NatsAuthService.deleteAccountForApplication(applicationId) + await MicroserviceService.reconcileNatsForApplication(applicationId, transaction) + await NatsAuthService.deleteAccountForApplication(applicationId, transaction, suppressOutbox) } else { - await NatsAuthService.ensureAccountForApplication(applicationId) - 
await NatsAuthService.reissueAccountForApplication(applicationId) - await MicroserviceService.reconcileNatsForApplication(applicationId) + await NatsAuthService.ensureAccountForApplication(applicationId, transaction, suppressOutbox) + await NatsAuthService.reissueAccountForApplication(applicationId, transaction, suppressOutbox) + await MicroserviceService.reconcileNatsForApplication(applicationId, transaction) } + const mutationKind = reason === 'nats-access-disabled' + ? 'access-disable' + : reason === 'nats-access-enabled' + ? 'access-enable' + : 'rule-change' + const outboxReason = reason === 'nats-access-disabled' ? 'account-deleted' : 'account-created' + await NatsAuthService.enqueueNatsReconcileOutbox({ + reason: outboxReason, + applicationId, + mutationKind + }, transaction) logger.info(`Completed background app NATS orchestration for app ${applicationId}: ${reason}`) } catch (error) { logger.error(`Background app NATS orchestration failed for app ${applicationId}: ${error.message}`) diff --git a/src/services/microservices-service.js b/src/services/microservices-service.js index fa35524a..7245327e 100644 --- a/src/services/microservices-service.js +++ b/src/services/microservices-service.js @@ -1424,19 +1424,21 @@ async function updateMicroserviceEndPoint (microserviceUuid, microserviceData, i throw error } - const shouldEnableNats = microserviceData.natsAccess === true - const shouldDisableNats = microserviceData.natsAccess === false && microservice.natsAccess + const natsAccessInPatch = Object.prototype.hasOwnProperty.call(microserviceData, 'natsAccess') + const shouldEnableNats = natsAccessInPatch && microserviceData.natsAccess === true && !microservice.natsAccess + const shouldDisableNats = natsAccessInPatch && microserviceData.natsAccess === false && microservice.natsAccess const natsRuleChanged = Object.prototype.hasOwnProperty.call(microserviceData, 'natsRuleId') && microserviceData.natsRuleId !== microservice.natsRuleId - if (shouldEnableNats) { - if 
(natsRuleChanged) { - await NatsAuthService.reissueUserForMicroservice(updatedMicroservice.uuid, transaction) - } - await _ensureNatsCredsForMicroservice(updatedMicroservice, transaction) - } else if (shouldDisableNats) { + if (shouldDisableNats) { await _detachNatsCredsForMicroservice(microservice, transaction) await NatsAuthService.revokeMicroserviceUser(microservice.uuid, transaction) + } else if (microservice.natsAccess || shouldEnableNats) { + if (natsRuleChanged || shouldEnableNats) { + const mutationKind = shouldEnableNats ? 'access-enable' : 'rule-change' + await NatsAuthService.reissueUserForMicroservice(updatedMicroservice.uuid, transaction, { mutationKind }) + } + await _ensureNatsCredsForMicroservice(updatedMicroservice, transaction) } if (changeTrackingEnabled) { @@ -2703,15 +2705,15 @@ async function reconcileNatsForApplication (applicationId, transaction) { return } const microservices = await MicroserviceManager.findAll({ applicationId }, transaction) + const reconcileTriggerOptions = { triggerReconcile: false } for (const microservice of microservices) { if (!application.natsAccess || !microservice.natsAccess) { if (microservice.natsUserId || microservice.natsCredsSecretName || microservice.natsAccess) { - await NatsAuthService.revokeMicroserviceUser(microservice.uuid, transaction) + await NatsAuthService.revokeMicroserviceUser(microservice.uuid, transaction, reconcileTriggerOptions) await _detachNatsCredsForMicroservice(microservice, transaction) } continue } - const reconcileTriggerOptions = { triggerReconcile: false } await NatsAuthService.reissueUserForMicroservice(microservice.uuid, transaction, reconcileTriggerOptions) const refreshed = await MicroserviceManager.findOne({ uuid: microservice.uuid }, transaction) await _ensureNatsCredsForMicroservice(refreshed || microservice, transaction) diff --git a/src/services/nats-auth-service.js b/src/services/nats-auth-service.js index fc4c2d97..4b4e5543 100644 --- a/src/services/nats-auth-service.js 
+++ b/src/services/nats-auth-service.js @@ -13,7 +13,7 @@ const NatsUserRuleManager = require('../data/managers/nats-user-rule-manager') const MicroserviceManager = require('../data/managers/microservice-manager') const TransactionDecorator = require('../decorators/transaction-decorator') const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') -const { runInTransaction, PRIORITY_BACKGROUND } = require('../helpers/transaction-runner') +const { runInTransaction, PRIORITY_BACKGROUND, schedulePostCommitBackground } = require('../helpers/transaction-runner') const logger = require('../logger') const NatsSystemRules = require('../config/nats-system-rules') const { slugifyName } = require('../helpers/system-naming') @@ -246,10 +246,10 @@ async function _enqueueNatsReconcileOutbox (triggerOptions = {}, transaction) { } function _runBackgroundTask (label, task) { - setImmediate(async () => { + schedulePostCommitBackground(label, async (transaction) => { try { logger.info(`Starting background NATS task: ${label}`) - await task() + await task(transaction) logger.info(`Completed background NATS task: ${label}`) } catch (error) { logger.error(`Background NATS task failed (${label}): ${error.message}`) @@ -606,7 +606,8 @@ async function ensureControllerNatsAccount (transaction, ...rest) { return result } -async function ensureAccountForApplication (applicationId, transaction) { +async function ensureAccountForApplication (applicationId, transaction, ...rest) { + const options = _triggerOptionsFromArgs(rest) await ensureDefaultRules(transaction) const existing = await NatsAccountManager.findOne({ applicationId }, transaction) if (existing) { @@ -643,7 +644,14 @@ async function ensureAccountForApplication (applicationId, transaction) { isSystem: false, isLeafSystem: false }, transaction) - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: application.id }, transaction) + if (options.triggerReconcile !== false) { + await 
_enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId: application.id, + mutationKind: options.mutationKind || 'access-enable', + ...options + }, transaction) + } return created } @@ -841,7 +849,8 @@ async function createUserForAccount (accountId, userName, expiresIn, natsRuleNam return { account, user: natsUser } } -async function reissueAccountForApplication (applicationId, transaction) { +async function reissueAccountForApplication (applicationId, transaction, ...rest) { + const options = _triggerOptionsFromArgs(rest) await ensureDefaultRules(transaction) const application = await ApplicationManager.findOne({ id: applicationId }, transaction) if (!application) { @@ -870,7 +879,14 @@ async function reissueAccountForApplication (applicationId, transaction) { account.jwt ) await NatsAccountManager.update({ id: account.id }, { jwt: accountJwt }, transaction) - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId }, transaction) + if (options.triggerReconcile !== false) { + await _enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId, + mutationKind: options.mutationKind || 'rule-change', + ...options + }, transaction) + } return NatsAccountManager.findOne({ id: account.id }, transaction) } @@ -914,7 +930,13 @@ async function reissueUserForMicroservice (microserviceUuid, transaction, ...res microserviceUuid: microservice.uuid, natsUserRuleId: currentRuleId }, transaction) - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) + await _enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId: microservice.applicationId, + microserviceUuid: microservice.uuid, + mutationKind: options.mutationKind || 'access-enable', + ...options + }, transaction) return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } @@ -923,7 +945,13 @@ async function reissueUserForMicroservice 
(microserviceUuid, transaction, ...res if (oldAccount) { await _addRevocationToAccount(oldAccount, existingUser.publicKey, transaction) if (options.triggerReconcile !== false && oldAccount.applicationId != null) { - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: oldAccount.applicationId, ...options }, transaction) + await _enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId: oldAccount.applicationId, + microserviceUuid: microservice.uuid, + mutationKind: options.mutationKind || 'rule-change', + ...options + }, transaction) } } const accountSeed = await _loadSeedFromSecret(account.seedSecretName, transaction) @@ -945,7 +973,13 @@ async function reissueUserForMicroservice (microserviceUuid, transaction, ...res accountId: account.id, natsUserRuleId: currentRuleId }, transaction) - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) + await _enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId: microservice.applicationId, + microserviceUuid: microservice.uuid, + mutationKind: options.mutationKind || 'rule-change', + ...options + }, transaction) return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } @@ -955,11 +989,25 @@ async function reissueUserForMicroservice (microserviceUuid, transaction, ...res const operatorSeed = await _loadSeedFromSecret(operator.seedSecretName, transaction) const operatorKp = fromSeed(new TextEncoder().encode(operatorSeed)) await _reissueOneUserForRule(existingUser, userRule.id, operatorKp, transaction) - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) + await _enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId: microservice.applicationId, + microserviceUuid: microservice.uuid, + mutationKind: options.mutationKind || 'rule-change', + ...options + }, 
transaction) return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: microservice.applicationId, ...options }, transaction) + if (options.triggerReconcile !== false) { + await _enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId: microservice.applicationId, + microserviceUuid: microservice.uuid, + mutationKind: options.mutationKind, + ...options + }, transaction) + } return NatsUserManager.findOne({ microserviceUuid: microservice.uuid }, transaction) } @@ -992,6 +1040,16 @@ async function ensureLeafUserForAccount (accountId, fogName, transaction, natsIn async function reissueForAccountRule (accountRuleId, transaction) { const rule = await NatsAccountRuleManager.findOne({ id: accountRuleId }, transaction) const applications = await ApplicationManager.findAll({ natsRuleId: accountRuleId }, transaction) + if (rule && rule.name === NatsSystemRules.APPLICATION_ACCOUNT_RULE_NAME) { + const defaultRuleApps = await ApplicationManager.findAll({ natsAccess: true, natsRuleId: null }, transaction) + const seenAppIds = new Set((applications || []).map((app) => app.id)) + for (const app of defaultRuleApps || []) { + if (!seenAppIds.has(app.id)) { + applications.push(app) + seenAppIds.add(app.id) + } + } + } logger.info(`Reissuing account JWTs for rule ${accountRuleId}`) for (const app of applications) { const account = await NatsAccountManager.findOne({ applicationId: app.id }, transaction) @@ -1035,7 +1093,11 @@ async function reissueForAccountRule (accountRuleId, transaction) { await NatsAccountManager.update({ id: relayAccount.id }, { jwt: accountJwt }, transaction) } } - await _enqueueNatsReconcileOutbox({ reason: 'account-rule-updated', accountRuleId }, transaction) + await _enqueueNatsReconcileOutbox({ + reason: 'account-rule-updated', + accountRuleId, + mutationKind: 'rule-content-update' + }, transaction) } /** @@ -1091,7 +1153,18 @@ async 
function _reissueOneUserForRule (user, userRuleId, operatorKp, transaction } async function reissueForUserRule (userRuleId, transaction) { + const userRule = await NatsUserRuleManager.findOne({ id: userRuleId }, transaction) const microservices = await MicroserviceManager.findAll({ natsRuleId: userRuleId }, transaction) + if (userRule && userRule.name === NatsSystemRules.MICROSERVICE_USER_RULE_NAME) { + const defaultRuleMicroservices = await MicroserviceManager.findAll({ natsAccess: true, natsRuleId: null }, transaction) + const seenMsUuids = new Set((microservices || []).map((ms) => ms.uuid)) + for (const ms of defaultRuleMicroservices || []) { + if (!seenMsUuids.has(ms.uuid)) { + microservices.push(ms) + seenMsUuids.add(ms.uuid) + } + } + } logger.info(`Reissuing user JWTs for rule ${userRuleId}`) const operator = await ensureOperator(transaction) const operatorSeed = await _loadSeedFromSecret(operator.seedSecretName, transaction) @@ -1111,10 +1184,15 @@ async function reissueForUserRule (userRuleId, transaction) { await _reissueOneUserForRule(user, userRuleId, operatorKp, transaction) processedUserIds.add(user.id) } - await _enqueueNatsReconcileOutbox({ reason: 'user-rule-updated', userRuleId }, transaction) + await _enqueueNatsReconcileOutbox({ + reason: 'user-rule-updated', + userRuleId, + mutationKind: 'rule-content-update' + }, transaction) } -async function revokeMicroserviceUser (microserviceUuid, transaction) { +async function revokeMicroserviceUser (microserviceUuid, transaction, ...rest) { + const options = _triggerOptionsFromArgs(rest) const user = await NatsUserManager.findOne({ microserviceUuid }, transaction) if (!user) { return @@ -1152,10 +1230,18 @@ async function revokeMicroserviceUser (microserviceUuid, transaction) { // best-effort secret cleanup } await NatsUserManager.delete({ id: user.id }, transaction) - await _enqueueNatsReconcileOutbox({ reason: 'account-created', applicationId: account.applicationId }, transaction) + if 
(options.triggerReconcile !== false) { + await _enqueueNatsReconcileOutbox({ + reason: 'account-created', + applicationId: account.applicationId, + microserviceUuid, + mutationKind: 'access-disable' + }, transaction) + } } -async function deleteAccountForApplication (applicationId, transaction) { +async function deleteAccountForApplication (applicationId, transaction, ...rest) { + const options = _triggerOptionsFromArgs(rest) const account = await NatsAccountManager.findOne({ applicationId }, transaction) if (!account) { return @@ -1177,7 +1263,14 @@ async function deleteAccountForApplication (applicationId, transaction) { // best-effort cleanup } await NatsAccountManager.delete({ id: account.id }, transaction) - await _enqueueNatsReconcileOutbox({ reason: 'account-deleted', applicationId }, transaction) + if (options.triggerReconcile !== false) { + await _enqueueNatsReconcileOutbox({ + reason: 'account-deleted', + applicationId, + mutationKind: options.mutationKind || 'access-disable', + ...options + }, transaction) + } } async function revokeUserByAccountAndName (accountId, userName, transaction) { @@ -1241,46 +1334,40 @@ async function revokeUserByAccountAndName (accountId, userName, transaction) { } function scheduleRotateOperator () { - _runBackgroundTask('rotate-operator', async () => { - await module.exports.rotateOperator() + _runBackgroundTask('rotate-operator', async (transaction) => { + await module.exports.rotateOperator(transaction) }) return { scheduled: true } } function scheduleReissueForAccountRule (accountRuleId) { - _runBackgroundTask(`reissue-account-rule-${accountRuleId}`, async () => { - await module.exports.reissueForAccountRule(accountRuleId) - }) - _enqueueNatsReconcileOutbox({ reason: 'account-rule-updated', accountRuleId }).catch((err) => { - logger.error(`NATS reconcile outbox enqueue failed: ${err.message}`) + _runBackgroundTask(`reissue-account-rule-${accountRuleId}`, async (transaction) => { + await 
module.exports.reissueForAccountRule(accountRuleId, transaction) }) return { scheduled: true } } function scheduleReissueForUserRule (userRuleId) { - _runBackgroundTask(`reissue-user-rule-${userRuleId}`, async () => { - await module.exports.reissueForUserRule(userRuleId) - }) - _enqueueNatsReconcileOutbox({ reason: 'user-rule-updated', userRuleId }).catch((err) => { - logger.error(`NATS reconcile outbox enqueue failed: ${err.message}`) + _runBackgroundTask(`reissue-user-rule-${userRuleId}`, async (transaction) => { + await module.exports.reissueForUserRule(userRuleId, transaction) }) return { scheduled: true } } function scheduleReissueAccountsForApplications (applicationIds = []) { - _runBackgroundTask(`reissue-accounts-${applicationIds.length}`, async () => { + _runBackgroundTask(`reissue-accounts-${applicationIds.length}`, async (transaction) => { for (const applicationId of applicationIds) { - await module.exports.reissueAccountForApplication(applicationId) + await module.exports.reissueAccountForApplication(applicationId, transaction) } }) return { scheduled: true } } function scheduleReissueUsersForMicroservices (microserviceUuids = []) { - _runBackgroundTask(`reissue-users-${microserviceUuids.length}`, async () => { + _runBackgroundTask(`reissue-users-${microserviceUuids.length}`, async (transaction) => { for (const microserviceUuid of microserviceUuids) { const reconcileTriggerOptions = { triggerReconcile: false } - await module.exports.reissueUserForMicroservice(microserviceUuid, reconcileTriggerOptions) + await module.exports.reissueUserForMicroservice(microserviceUuid, transaction, reconcileTriggerOptions) } }) return { scheduled: true } @@ -1322,5 +1409,6 @@ module.exports = { scheduleReissueForAccountRule, scheduleReissueForUserRule, scheduleReissueAccountsForApplications, - scheduleReissueUsersForMicroservices + scheduleReissueUsersForMicroservices, + enqueueNatsReconcileOutbox: _enqueueNatsReconcileOutbox } diff --git 
a/test/src/data/reconcile-outbox.test.js b/test/src/data/reconcile-outbox.test.js index d1c498d3..acfe3839 100644 --- a/test/src/data/reconcile-outbox.test.js +++ b/test/src/data/reconcile-outbox.test.js @@ -122,7 +122,7 @@ describe('reconcile-outbox', () => { await runInTransaction(async (transaction) => { const row = await ReconcileOutbox.findOne({ - where: { idempotencyKey: 'nats:cluster-routes-changed:null:null:null:fog-other' } + where: { idempotencyKey: 'nats:cluster-routes-changed:null:null:null:null:null:null:fog-other' } }, transaction) await ReconcileOutboxManager.markProcessed(row.id, transaction) }) diff --git a/test/src/helpers/reconcile-outbox-keys.test.js b/test/src/helpers/reconcile-outbox-keys.test.js index 1ec8b5e4..ac7cfa79 100644 --- a/test/src/helpers/reconcile-outbox-keys.test.js +++ b/test/src/helpers/reconcile-outbox-keys.test.js @@ -36,7 +36,7 @@ describe('reconcile-outbox-keys', () => { userRuleId: null, fogUuids: ['b', 'a'] }) - expect(key).to.equal('nats:cluster-routes-changed:null:null:null:a,b') + expect(key).to.equal('nats:cluster-routes-changed:null:null:null:null:null:null:a,b') }) it('builds nats keys without fog uuids from scope fields', () => { @@ -46,7 +46,23 @@ describe('reconcile-outbox-keys', () => { accountRuleId: null, userRuleId: null }) - expect(key).to.equal('nats:account-created:42:null:null') + expect(key).to.equal('nats:account-created:42:null:null:null:null:null') + }) + + it('distinguishes distinct mutations via microserviceUuid and mutationKind', () => { + const enableKey = buildNatsIdempotencyKey({ + reason: 'account-created', + applicationId: 42, + microserviceUuid: 'ms-a', + mutationKind: 'access-enable' + }) + const ruleKey = buildNatsIdempotencyKey({ + reason: 'account-created', + applicationId: 42, + microserviceUuid: 'ms-a', + mutationKind: 'rule-change' + }) + expect(enableKey).to.not.equal(ruleKey) }) it('routes buildIdempotencyKey by kind', () => { diff --git 
a/test/src/services/microservices-service.test.js b/test/src/services/microservices-service.test.js index 6d8a8ac7..920b1d0b 100644 --- a/test/src/services/microservices-service.test.js +++ b/test/src/services/microservices-service.test.js @@ -22,6 +22,7 @@ const MicroserviceCdiDevManager = require('../../../src/data/managers/microservi const MicroserviceCapAddManager = require('../../../src/data/managers/microservice-cap-add-manager') const MicroserviceCapDropManager = require('../../../src/data/managers/microservice-cap-drop-manager') const MicroserviceHealthCheckManager = require('../../../src/data/managers/microservice-healthcheck-manager') +const VolumeMountService = require('../../../src/services/volume-mount-service') const RbacRoleManager = require('../../../src/data/managers/rbac-role-manager') const RbacServiceAccountManager = require('../../../src/data/managers/rbac-service-account-manager') const NatsAuthService = require('../../../src/services/nats-auth-service') @@ -343,6 +344,45 @@ describe('Microservices Service', () => { it('rejects updates', () => expect($subject).to.be.rejectedWith(Errors.ValidationError)) }) + context('when disabling natsAccess via natsConfig', () => { + const natsEnabled = buildMicroserviceRecord({ + uuid: msvcUuid, + name: 'immutable-name', + catalogItem: null, + natsAccess: true, + natsCredsSecretName: 'nats-creds-msvc' + }) + + def('updateData', () => ({ natsConfig: { natsAccess: false } })) + + beforeEach(() => { + MicroserviceManager.findOne.resolves(natsEnabled) + MicroserviceManager.findOneWithCategory.resolves({ + ...natsEnabled, + catalogItem: null, + getPorts: () => Promise.resolve([]), + getImages: () => Promise.resolve([]) + }) + MicroserviceManager.updateAndFind.resolves({ ...natsEnabled, natsAccess: false }) + $sandbox.stub(NatsAuthService, 'revokeMicroserviceUser').resolves() + $sandbox.stub(NatsAuthService, 'reissueUserForMicroservice').resolves() + $sandbox.stub(NatsAuthService, 
'ensureUserForMicroservice').resolves() + $sandbox.stub(MicroserviceEnvManager, 'delete').resolves() + $sandbox.stub(VolumeMountService, 'unlinkVolumeMountEndpoint').resolves() + $sandbox.stub(MicroserviceManager, 'update').resolves() + }) + + it('revokes credentials and does not reissue when disabling', async () => { + await $subject + + expect(NatsAuthService.revokeMicroserviceUser).to.have.been.calledOnceWith(msvcUuid, transaction) + expect(NatsAuthService.reissueUserForMicroservice).to.not.have.been.called + expect(NatsAuthService.ensureUserForMicroservice).to.not.have.been.called + expect(MicroserviceEnvManager.delete).to.have.been.calledOnce + expect(VolumeMappingManager.delete).to.have.been.calledOnce + }) + }) + context('when volumeMappings include a system serviceAccount volume', () => { const userVolume = { hostDestination: 'nats-creds-data', diff --git a/test/src/services/nats-auth-orchestration.test.js b/test/src/services/nats-auth-orchestration.test.js new file mode 100644 index 00000000..ae0970f1 --- /dev/null +++ b/test/src/services/nats-auth-orchestration.test.js @@ -0,0 +1,33 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ReconcileOutboxManager = require('../../../src/data/managers/reconcile-outbox-manager') +const NatsAuthService = require('../../../src/services/nats-auth-service') + +describe('NATS auth orchestration', () => { + def('sandbox', () => sinon.createSandbox()) + + afterEach(() => { + $sandbox.restore() + }) + + describe('scheduleReissueForAccountRule', () => { + it('does not enqueue reconcile outbox synchronously when scheduling reissue', () => { + const enqueueSpy = $sandbox.spy(ReconcileOutboxManager, 'enqueueNats') + + NatsAuthService.scheduleReissueForAccountRule(7) + + expect(enqueueSpy).to.not.have.been.called + }) + }) + + describe('scheduleReissueForUserRule', () => { + it('does not enqueue reconcile outbox synchronously when scheduling reissue', () => { + const enqueueSpy = 
$sandbox.spy(ReconcileOutboxManager, 'enqueueNats') + + NatsAuthService.scheduleReissueForUserRule(9) + + expect(enqueueSpy).to.not.have.been.called + }) + }) +}) From 7abb8e49bbb4a89665b1ef2364737d1c4eb31f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 09:34:33 +0300 Subject: [PATCH 18/32] Rebuild NATS resolver bundles from fresh auth state after reissue. Refresh account JWT lookups before leaf bundle writes, include default-rule consumers in fan-out, and extend outbox idempotency keys for microservice and mutation scope. --- src/services/nats-service.js | 59 ++++++++++- .../src/services/nats-resolver-bundle.test.js | 100 ++++++++++++++++++ test/src/services/nats-service.test.js | 2 + 3 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 test/src/services/nats-resolver-bundle.test.js diff --git a/src/services/nats-service.js b/src/services/nats-service.js index 2147bee8..7a52447c 100644 --- a/src/services/nats-service.js +++ b/src/services/nats-service.js @@ -26,7 +26,10 @@ const NatsReconcileTaskManager = require('../data/managers/nats-reconcile-task-m const ReconcileOutboxManager = require('../data/managers/reconcile-outbox-manager') const NatsUserManager = require('../data/managers/nats-user-manager') const ApplicationManager = require('../data/managers/application-manager') +const NatsAccountRuleManager = require('../data/managers/nats-account-rule-manager') +const NatsUserRuleManager = require('../data/managers/nats-user-rule-manager') const NatsAuthService = require('./nats-auth-service') +const NatsSystemRules = require('../config/nats-system-rules') const ChangeTrackingService = require('./change-tracking-service') const MicroservicesService = require('./microservices-service') const FogManager = require('../data/managers/iofog-manager') @@ -1499,10 +1502,14 @@ function _getAffectedFogUuidsForApplication (applicationId, natsInstanceByFog, m return out } -function _getAffectedFogUuidsForAccountRule 
(accountRuleId, natsInstanceByFog, microservicesByFog, applicationsWithNatsById) { +function _getAffectedFogUuidsForAccountRule (accountRuleId, natsInstanceByFog, microservicesByFog, applicationsWithNatsById, defaultAccountRuleId) { const appIds = [] for (const [appId, app] of applicationsWithNatsById) { - if (app.natsRuleId === accountRuleId) appIds.push(appId) + if (app.natsRuleId === accountRuleId) { + appIds.push(appId) + } else if (defaultAccountRuleId && accountRuleId === defaultAccountRuleId && app.natsRuleId == null && app.natsAccess) { + appIds.push(appId) + } } const out = new Set() for (const [fogUuid, ni] of natsInstanceByFog) { @@ -1519,7 +1526,19 @@ async function _getAffectedFogUuidsForUserRule (userRuleId, natsInstanceByFog, t for (const [fogUuid, ni] of natsInstanceByFog) { if (!ni.isLeaf) out.add(fogUuid) } - const microservicesWithRule = await MicroserviceManager.findAll({ natsRuleId: userRuleId }, transaction) + const userRule = await NatsUserRuleManager.findOne({ id: userRuleId }, transaction) + let microservicesWithRule = await MicroserviceManager.findAll({ natsRuleId: userRuleId }, transaction) + if (userRule && userRule.name === NatsSystemRules.MICROSERVICE_USER_RULE_NAME) { + const defaultRuleMicroservices = await MicroserviceManager.findAll({ natsAccess: true, natsRuleId: null }, transaction) + const seenMsUuids = new Set((microservicesWithRule || []).map((ms) => ms.uuid)) + microservicesWithRule = [...(microservicesWithRule || [])] + for (const ms of defaultRuleMicroservices || []) { + if (!seenMsUuids.has(ms.uuid)) { + microservicesWithRule.push(ms) + seenMsUuids.add(ms.uuid) + } + } + } for (const ms of microservicesWithRule || []) { if (ms.iofogUuid) out.add(ms.iofogUuid) } @@ -1568,13 +1587,23 @@ async function _reconcileResolverArtifactsOnceDb (options = {}, transaction) { let candidateFogs const fogFilter = Array.isArray(options.fogUuids) && options.fogUuids.length > 0 ? 
new Set(options.fogUuids) : null const reason = options.reason || 'auth-mutation' + const defaultAccountRule = await NatsAccountRuleManager.findOne({ + name: NatsSystemRules.APPLICATION_ACCOUNT_RULE_NAME + }, transaction) + const defaultAccountRuleId = defaultAccountRule ? defaultAccountRule.id : null if (fogFilter) { candidateFogs = fogs.filter((fog) => fogFilter.has(fog.uuid)) } else if ((reason === 'account-created' || reason === 'account-deleted') && options.applicationId != null) { const affected = _getAffectedFogUuidsForApplication(options.applicationId, natsInstanceByFog, microservicesByFog) candidateFogs = fogs.filter((f) => affected.has(f.uuid)) } else if (reason === 'account-rule-updated' && options.accountRuleId != null) { - const affected = _getAffectedFogUuidsForAccountRule(options.accountRuleId, natsInstanceByFog, microservicesByFog, applicationsWithNatsById) + const affected = _getAffectedFogUuidsForAccountRule( + options.accountRuleId, + natsInstanceByFog, + microservicesByFog, + applicationsWithNatsById, + defaultAccountRuleId + ) candidateFogs = fogs.filter((f) => affected.has(f.uuid)) } else if (reason === 'user-rule-updated' && options.userRuleId != null) { const affected = await _getAffectedFogUuidsForUserRule(options.userRuleId, natsInstanceByFog, transaction) @@ -1633,12 +1662,22 @@ async function _reconcileResolverArtifactsOnceDb (options = {}, transaction) { const fogMicroservices = microservicesByFog.get(fog.uuid) || [] if (!skipReissueForAccountDeleted) { + const affectedAppIds = new Set() for (const microservice of fogMicroservices) { if (!microservice.natsAccess || !microservice.applicationId) continue const app = applicationsWithNatsById.get(microservice.applicationId) if (!app || !app.natsAccess) continue + affectedAppIds.add(microservice.applicationId) await NatsAuthServiceRuntime.reissueUserForMicroservice(microservice.uuid, transaction, reconcileTriggerOptions) } + if (affectedAppIds.size > 0) { + const refreshedAccounts = await 
NatsAccountManager.findAll({ + applicationId: { [Op.in]: [...affectedAppIds] } + }, transaction) + for (const account of refreshedAccounts || []) { + accountByAppId.set(account.applicationId, account) + } + } } const natsInstance = natsInstanceByFog.get(fog.uuid) @@ -1724,6 +1763,10 @@ async function _computeAffectedFogUuidsForEnqueue (options, transaction) { } } const fogUuids = fogs.map((f) => f.uuid) + const defaultAccountRule = await NatsAccountRuleManager.findOne({ + name: NatsSystemRules.APPLICATION_ACCOUNT_RULE_NAME + }, transaction) + const defaultAccountRuleId = defaultAccountRule ? defaultAccountRule.id : null if (reason === 'server-deleted' || reason === 'cluster-routes-changed') { return [] } @@ -1742,7 +1785,13 @@ async function _computeAffectedFogUuidsForEnqueue (options, transaction) { return fogUuids.filter((u) => affected.has(u)) } if (reason === 'account-rule-updated' && options.accountRuleId != null) { - const affected = _getAffectedFogUuidsForAccountRule(options.accountRuleId, natsInstanceByFog, microservicesByFog, applicationsWithNatsById) + const affected = _getAffectedFogUuidsForAccountRule( + options.accountRuleId, + natsInstanceByFog, + microservicesByFog, + applicationsWithNatsById, + defaultAccountRuleId + ) return fogUuids.filter((u) => affected.has(u)) } if (reason === 'user-rule-updated' && options.userRuleId != null) { diff --git a/test/src/services/nats-resolver-bundle.test.js b/test/src/services/nats-resolver-bundle.test.js new file mode 100644 index 00000000..0c7a1a53 --- /dev/null +++ b/test/src/services/nats-resolver-bundle.test.js @@ -0,0 +1,100 @@ +const { expect } = require('chai') +const sinon = require('sinon') +const { Op } = require('sequelize') + +const NatsAccountManager = require('../../../src/data/managers/nats-account-manager') +const NatsInstanceManager = require('../../../src/data/managers/nats-instance-manager') +const NatsAccountRuleManager = require('../../../src/data/managers/nats-account-rule-manager') +const 
NatsConnectionManager = require('../../../src/data/managers/nats-connection-manager') +const ConfigMapManager = require('../../../src/data/managers/config-map-manager') +const FogManager = require('../../../src/data/managers/iofog-manager') +const ApplicationManager = require('../../../src/data/managers/application-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const ConfigMapService = require('../../../src/services/config-map-service') +const NatsAuthService = require('../../../src/services/nats-auth-service') +const config = require('../../../src/config') +const transactionRunner = require('../../../src/helpers/transaction-runner') + +describe('NATS resolver bundle freshness', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => { + $sandbox.restore() + }) + + it('uses refreshed account JWT in leaf bundle after in-reconcile reissue', async () => { + const fog = { uuid: 'fog-leaf-1', name: 'leaf-fog' } + const app = { id: 10, natsAccess: true, isSystem: false, natsRuleId: null } + const microservice = { + uuid: 'ms-1', + applicationId: 10, + iofogUuid: fog.uuid, + natsAccess: true + } + const staleAccount = { + id: 100, + applicationId: 10, + publicKey: 'STALEPK', + jwt: 'stale.jwt.token' + } + const freshAccount = { + id: 100, + applicationId: 10, + publicKey: 'STALEPK', + jwt: 'fresh.jwt.token' + } + + $sandbox.stub(config, 'getBoolean').returns(false) + $sandbox.stub(config, 'get').callsFake((key, defaultValue) => defaultValue) + $sandbox.stub(transactionRunner, 'isSqliteProvider').returns(false) + $sandbox.stub(transactionRunner, 'runInTransaction').callsFake(async (fn) => fn(transaction)) + $sandbox.stub(FogManager, 'findAll').resolves([fog]) + $sandbox.stub(ApplicationManager, 'findAll').resolves([app]) + $sandbox.stub(NatsInstanceManager, 'findAll').resolves([{ iofogUuid: fog.uuid, isLeaf: true, isHub: false }]) + $sandbox.stub(NatsConnectionManager, 
'findAllWithNats').resolves([]) + $sandbox.stub(MicroserviceManager, 'findAll').callsFake((query) => { + if (query.name) { + return Promise.resolve([{ uuid: 'nats-ms', iofogUuid: fog.uuid, name: 'nats' }]) + } + return Promise.resolve([microservice]) + }) + $sandbox.stub(NatsAccountRuleManager, 'findOne').resolves({ id: 1, name: 'default-account' }) + $sandbox.stub(NatsAccountManager, 'findAll').callsFake((query) => { + if (query.isSystem) { + return Promise.resolve([]) + } + if (query.applicationId && query.applicationId[Op.in]) { + return Promise.resolve([freshAccount]) + } + return Promise.resolve([staleAccount]) + }) + $sandbox.stub(NatsAccountManager, 'findOne').resolves(null) + $sandbox.stub(NatsAuthService, 'ensureSystemAccount').resolves() + $sandbox.stub(NatsAuthService, 'ensureLeafSystemAccount').resolves({ + publicKey: 'LEAFSYS', + jwt: 'leaf-sys.jwt' + }) + $sandbox.stub(NatsAuthService, 'reissueUserForMicroservice').callsFake(async () => { + staleAccount.jwt = freshAccount.jwt + }) + $sandbox.stub(ConfigMapManager, 'getConfigMap').resolves(null) + + const capturedBundles = [] + $sandbox.stub(ConfigMapService, 'createConfigMapEndpoint').callsFake(async (payload) => { + capturedBundles.push(payload.data) + return payload + }) + + const NatsService = require('../../../src/services/nats-service') + await NatsService.reconcileResolverArtifacts({ + reason: 'account-created', + applicationId: 10, + fogUuids: [fog.uuid] + }) + + const leafBundle = capturedBundles.find((bundle) => bundle && bundle['STALEPK.jwt']) + expect(leafBundle).to.not.equal(undefined) + expect(leafBundle['STALEPK.jwt']).to.equal('fresh.jwt.token') + }) +}) diff --git a/test/src/services/nats-service.test.js b/test/src/services/nats-service.test.js index 9258f955..4dcb11ae 100644 --- a/test/src/services/nats-service.test.js +++ b/test/src/services/nats-service.test.js @@ -5,6 +5,7 @@ const NatsService = require('../../../src/services/nats-service') const NatsInstanceManager = 
require('../../../src/data/managers/nats-instance-manager') const NatsConnectionManager = require('../../../src/data/managers/nats-connection-manager') const NatsAccountManager = require('../../../src/data/managers/nats-account-manager') +const NatsAccountRuleManager = require('../../../src/data/managers/nats-account-rule-manager') const NatsUserManager = require('../../../src/data/managers/nats-user-manager') const MicroserviceManager = require('../../../src/data/managers/microservice-manager') const VolumeMappingManager = require('../../../src/data/managers/volume-mapping-manager') @@ -379,6 +380,7 @@ describe('NATS Service', () => { $sandbox.stub(require('../../../src/data/managers/iofog-manager'), 'findAll').resolves([]) $sandbox.stub(require('../../../src/data/managers/application-manager'), 'findAll').resolves([]) $sandbox.stub(NatsInstanceManager, 'findAll').resolves([]) + $sandbox.stub(NatsAccountRuleManager, 'findOne').resolves({ id: 1, name: 'default-account' }) $sandbox.stub(NatsAccountManager, 'findOne').resolves({ id: 1, isSystem: true }) $sandbox.stub(require('../../../src/services/nats-auth-service'), 'ensureSystemAccount').resolves() $sandbox.stub(ConfigMapManager, 'getConfigMap').resolves(null) From 91f36f8ae2b8ba7c6cfa3886153b2fc30b87faf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 09:34:37 +0300 Subject: [PATCH 19/32] Propagate fog upstream router and NATS endpoint changes to downstream fogs. Build router microservice config from live router DB state, detect upstream connector drift, fan out platform reconcile on host/port changes, and preserve upstreamNatsServers when omitted from PATCH. 
--- .../managers/router-connection-manager.js | 5 +- src/services/fog-platform-service.js | 110 +++++++++++++- src/services/iofog-service.js | 4 +- src/services/router-service.js | 46 +++++- src/services/service-bridge-config.js | 1 + .../src/services/fog-platform-service.test.js | 139 +++++++++++++++++- test/src/services/iofog-service.test.js | 40 +++++ test/src/services/router-service.test.js | 26 ++++ 8 files changed, 356 insertions(+), 15 deletions(-) diff --git a/src/data/managers/router-connection-manager.js b/src/data/managers/router-connection-manager.js index c6d9ae84..0729dae8 100644 --- a/src/data/managers/router-connection-manager.js +++ b/src/data/managers/router-connection-manager.js @@ -22,8 +22,9 @@ class RouterConnectionManager extends BaseManager { required: true } ], - where - }, { transaction }) + where, + transaction + }) } } diff --git a/src/services/fog-platform-service.js b/src/services/fog-platform-service.js index 60ba59bd..9b9c2049 100644 --- a/src/services/fog-platform-service.js +++ b/src/services/fog-platform-service.js @@ -124,6 +124,88 @@ function topologyChanged (before, after) { before.upstreamNatsServers !== after.upstreamNatsServers } +function serializeEndpointSnapshot (snapshot) { + return JSON.stringify(snapshot || {}) +} + +function endpointsChanged (before, after) { + return serializeEndpointSnapshot(before) !== serializeEndpointSnapshot(after) +} + +async function captureEndpointSnapshot (fogUuid, fog, spec, transaction) { + const router = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + const nats = await NatsInstanceManager.findByFog(fogUuid, transaction) + const host = spec.host != null ? spec.host : (fog ? fog.host : null) + + return { + host: host || '', + routerHost: router ? (router.host || '') : '', + natsHost: nats ? (nats.host || '') : '', + messagingPort: String(spec.messagingPort ?? (router ? router.messagingPort : '')), + interRouterPort: String(spec.interRouterPort ?? (router ? 
router.interRouterPort : '')), + edgeRouterPort: String(spec.edgeRouterPort ?? (router ? router.edgeRouterPort : '')), + natsServerPort: String(spec.natsServerPort ?? (nats ? nats.serverPort : '')), + natsLeafPort: String(spec.natsLeafPort ?? (nats ? nats.leafPort : '')), + natsClusterPort: String(spec.natsClusterPort ?? (nats ? nats.clusterPort : '')), + natsMqttPort: String(spec.natsMqttPort ?? (nats ? nats.mqttPort : '')), + natsHttpPort: String(spec.natsHttpPort ?? (nats ? nats.httpPort : '')) + } +} + +async function getDownstreamFogUuidsForUpstream (fogUuid, transaction) { + const downstreamUuids = new Set() + + const upstreamRouter = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + if (upstreamRouter) { + const downstreamConnections = await RouterConnectionManager.findAllWithRouters( + { destRouter: upstreamRouter.id }, + transaction + ) + for (const connection of downstreamConnections || []) { + if (connection.source && connection.source.iofogUuid) { + downstreamUuids.add(connection.source.iofogUuid) + } + } + } + + const upstreamNats = await NatsInstanceManager.findByFog(fogUuid, transaction) + if (upstreamNats) { + const downstreamConnections = await NatsConnectionManager.findAllWithNats( + { destNats: upstreamNats.id }, + transaction + ) + for (const connection of downstreamConnections || []) { + if (connection.source && connection.source.iofogUuid) { + downstreamUuids.add(connection.source.iofogUuid) + } + } + } + + return [...downstreamUuids] +} + +async function resolveNatsConfigFromSpec (fogUuid, spec, transaction) { + const natsConfig = buildNatsConfig(spec) + if (spec.upstreamNatsServers !== undefined) { + return natsConfig + } + + const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction) + const nats = await NatsInstanceManager.findByFog(fogUuid, transaction) + if (!nats) { + return natsConfig + } + + const connections = await NatsConnectionManager.findAllWithNats({ sourceNats: nats.id }, transaction) + 
if (connections && connections.length > 0) { + natsConfig.upstreamNatsServers = connections.map( + (connection) => _getNatsUuid(connection.dest, defaultHub) + ) + } + + return natsConfig +} + function truncateErrorMessage (errorMessage, maxLength = 200) { return errorMessage.length > maxLength ? errorMessage.slice(0, maxLength) : errorMessage } @@ -174,6 +256,7 @@ async function reconcileFogPrepare (fogUuid, transaction) { const spec = parsedSpec.spec const fogData = buildFogDataFromSpecAndFog(fog, spec) const topologyBefore = await captureTopologySnapshot(fogUuid, transaction) + const endpointsBefore = await captureEndpointSnapshot(fogUuid, fog, spec, transaction) await FogPlatformStatusManager.setPhase(fogUuid, 'Progressing', { lastError: null }, transaction) validateSystemFogInvariants(fog, spec) @@ -191,9 +274,10 @@ async function reconcileFogPrepare (fogUuid, transaction) { fogData, generation, topologyBefore, + endpointsBefore, shouldRecreateCerts, isHostChanged, - natsConfig: buildNatsConfig(spec), + natsConfig: await resolveNatsConfigFromSpec(fogUuid, spec, transaction), isFirstReconcile: !status || status.observedGeneration === 0, router } @@ -292,7 +376,12 @@ async function reconcileFogPlatform (fogUuid, prep, transaction) { }, upstreamRouters, spec.containerEngine || fog.containerEngine, transaction) } - const baseRouterConfig = await IofogService._getRouterMicroserviceConfig(fogUuid, transaction) + const activeRouterId = networkRouter.id ?? 
router.id + const baseRouterConfig = await RouterService.buildFreshRouterMicroserviceConfig( + activeRouterId, + spec.containerEngine || fog.containerEngine, + transaction + ) await ServiceBridgeConfig.recomputeServiceBridgeConfig(fogUuid, baseRouterConfig, transaction) } @@ -338,6 +427,17 @@ async function reconcileFogFinalize (fogUuid, prep, platformResult, transaction) }, transaction) } + const endpointsAfter = await captureEndpointSnapshot(fogUuid, prep.fog, prep.spec, transaction) + if (endpointsChanged(prep.endpointsBefore, endpointsAfter)) { + const downstreamUuids = await getDownstreamFogUuidsForUpstream(fogUuid, transaction) + for (const downstreamUuid of downstreamUuids) { + await ReconcileOutboxManager.enqueueFogPlatform({ + fogUuid: downstreamUuid, + reason: 'spec-changed' + }, transaction) + } + } + await FogPlatformStatusManager.setPhase(fogUuid, 'Ready', { observedGeneration: generation, lastError: null, @@ -347,7 +447,7 @@ async function reconcileFogFinalize (fogUuid, prep, platformResult, transaction) await FogManager.update({ uuid: fogUuid }, { warningMessage: 'HEALTHY' }, transaction) return { - networkRouterId: platformResult.networkRouter ? 
platformResult.networkRouter.id : null + networkRouterId: (platformResult.networkRouter && platformResult.networkRouter.id) || (routerAfter && routerAfter.id) || null } } @@ -449,6 +549,10 @@ module.exports = { buildFogDataFromSpecAndFog, validateSystemFogInvariants, captureTopologySnapshot, + captureEndpointSnapshot, + endpointsChanged, + getDownstreamFogUuidsForUpstream, + resolveNatsConfigFromSpec, topologyChanged, markReconcileFailed, reconcileFogPrepare, diff --git a/src/services/iofog-service.js b/src/services/iofog-service.js index 5c863d9f..affde2fa 100644 --- a/src/services/iofog-service.js +++ b/src/services/iofog-service.js @@ -567,7 +567,9 @@ async function updateFogEndPoint (fogData, isCLI, transaction) { await FogManager.update(queryFogData, updateFogData, transaction) await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.config, transaction) - const mergedSpec = mergePlatformSpecPatch(parsedSpec ? parsedSpec.spec : {}, fogData) + const existingSpec = parsedSpec ? 
parsedSpec.spec : {} + const mergedSpec = mergePlatformSpecPatch(existingSpec, fogData) + const { generation } = await FogPlatformSpecManager.upsertSpec(fogData.uuid, mergedSpec, transaction) await FogPlatformStatusManager.ensurePending(fogData.uuid, transaction) await ReconcileOutboxManager.enqueueFogPlatform({ diff --git a/src/services/router-service.js b/src/services/router-service.js index 8241dc1e..38051f1b 100644 --- a/src/services/router-service.js +++ b/src/services/router-service.js @@ -205,13 +205,22 @@ async function _updateRouterPorts (routerMicroserviceUuid, router, transaction) } } -async function updateConfig (routerID, containerEngine, transaction) { - const router = await RouterManager.findOne({ id: routerID }, transaction) +function _upstreamConnectorsFingerprint (connectors) { + if (!connectors || typeof connectors !== 'object') { + return '' + } + return Object.keys(connectors).sort().map((name) => { + const connector = connectors[name] + return `${name}:${connector.host}:${connector.port}:${connector.role}:${connector.sslProfile || ''}` + }).join('|') +} + +async function buildFreshRouterMicroserviceConfig (routerID, containerEngine, transaction, routerArg = null) { + const router = routerArg || await RouterManager.findOne({ id: routerID }, transaction) if (!router) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER, routerID)) } - // Get current configuration const routerCatalog = await CatalogService.getRouterCatalogItem(transaction) const routerMicroservice = await MicroserviceManager.findOne({ catalogItemId: routerCatalog.id, @@ -223,8 +232,6 @@ async function updateConfig (routerID, containerEngine, transaction) { } const currentConfig = JSON.parse(routerMicroservice.config || '{}') - - // Generate new configuration const newConfig = await _getRouterMicroserviceConfig( router.isEdge, router.iofogUuid, @@ -235,7 +242,6 @@ async function updateConfig (routerID, containerEngine, transaction) { transaction ) 
- // Add connectors for upstream routers const upstreamRoutersConnections = await RouterConnectionManager.findAllWithRouters( { sourceRouter: router.id }, transaction @@ -256,11 +262,36 @@ async function updateConfig (routerID, containerEngine, transaction) { newConfig.bridges = JSON.parse(JSON.stringify(currentConfig.bridges)) } + return newConfig +} + +async function updateConfig (routerID, containerEngine, transaction) { + const router = await RouterManager.findOne({ id: routerID }, transaction) + if (!router) { + throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER, routerID)) + } + + const routerCatalog = await CatalogService.getRouterCatalogItem(transaction) + const routerMicroservice = await MicroserviceManager.findOne({ + catalogItemId: routerCatalog.id, + iofogUuid: router.iofogUuid + }, transaction) + + if (!routerMicroservice) { + throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER, router.id)) + } + + const currentConfig = JSON.parse(routerMicroservice.config || '{}') + const newConfig = await buildFreshRouterMicroserviceConfig(routerID, containerEngine, transaction, router) + await _ensureRouterTlsVolumeMountsAndMappings(router.iofogUuid, routerMicroservice.uuid, transaction, true) await ChangeTrackingService.update(router.iofogUuid, ChangeTrackingService.events.microserviceConfig, transaction) + const upstreamFingerprintChanged = _upstreamConnectorsFingerprint(currentConfig.connectors) !== + _upstreamConnectorsFingerprint(newConfig.connectors) + // Check if configuration needs update - if (JSON.stringify(currentConfig) !== JSON.stringify(newConfig)) { + if (JSON.stringify(currentConfig) !== JSON.stringify(newConfig) || upstreamFingerprintChanged) { await MicroserviceManager.update( { uuid: routerMicroservice.uuid }, { config: JSON.stringify(newConfig) }, @@ -674,6 +705,7 @@ async function findOne (option, transaction) { } module.exports = { + buildFreshRouterMicroserviceConfig: 
TransactionDecorator.generateTransaction(buildFreshRouterMicroserviceConfig), createRouterForFog: TransactionDecorator.generateTransaction(createRouterForFog), updateConfig: TransactionDecorator.generateTransaction(updateConfig), updateRouter: TransactionDecorator.generateTransaction(updateRouter), diff --git a/src/services/service-bridge-config.js b/src/services/service-bridge-config.js index 07daf7e1..6624f05c 100644 --- a/src/services/service-bridge-config.js +++ b/src/services/service-bridge-config.js @@ -46,6 +46,7 @@ async function _resolveFogTagValues (fogUuid, transaction) { } async function recomputeServiceBridgeConfig (fogUuid, baseConfig, transaction) { + // baseConfig must be freshly built from router DB state (connectors/upstreams), not stale MS JSON. let config = stripServiceDerivedBridges(baseConfig) const tagValues = await _resolveFogTagValues(fogUuid, transaction) diff --git a/test/src/services/fog-platform-service.test.js b/test/src/services/fog-platform-service.test.js index 6c77c8a7..87385414 100644 --- a/test/src/services/fog-platform-service.test.js +++ b/test/src/services/fog-platform-service.test.js @@ -112,9 +112,16 @@ describe('Fog platform service', () => { $sandbox.stub(NatsService, 'ensureNatsForFogPhased').resolves({}) $sandbox.stub(NatsService, 'cleanupNatsForFogPhased').resolves() $sandbox.stub(ReconcileOutboxManager, 'enqueueNats').resolves() + $sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves() $sandbox.stub(RouterService, 'validateAndReturnUpstreamRouters').resolves([]) - $sandbox.stub(RouterService, 'updateRouter').resolves(router) - $sandbox.stub(IofogService, '_getRouterMicroserviceConfig').resolves({ bridges: { tcpListeners: {}, tcpConnectors: {} } }) + $sandbox.stub(RouterService, 'updateRouter').resolves({ + host: 'localhost', + messagingPort: router.messagingPort + }) + $sandbox.stub(RouterService, 'buildFreshRouterMicroserviceConfig').resolves({ + connectors: { 'default-router': { name: 'default-router', 
host: '10.0.0.1', port: '55671', role: 'edge' } }, + bridges: { tcpListeners: {}, tcpConnectors: {} } + }) $sandbox.stub(ServiceBridgeConfig, 'recomputeServiceBridgeConfig').resolves({ bridges: { tcpListeners: {}, tcpConnectors: {} } }) $sandbox.stub(ChangeTrackingService, 'create').resolves() $sandbox.stub(ChangeTrackingService, 'update').resolves() @@ -136,6 +143,11 @@ describe('Fog platform service', () => { expect(IofogService._handleRouterCertificates).to.have.been.calledOnce expect(NatsService.ensureNatsForFogPhased).to.have.been.calledOnce expect(RouterService.updateRouter).to.have.been.calledOnce + expect(RouterService.buildFreshRouterMicroserviceConfig).to.have.been.calledOnceWith( + router.id, + spec.containerEngine, + transaction + ) expect(ServiceBridgeConfig.recomputeServiceBridgeConfig).to.have.been.calledOnce expect(FogPlatformStatusManager.setPhase).to.have.been.calledWith( fogUuid, @@ -156,6 +168,7 @@ describe('Fog platform service', () => { await FogPlatformService.reconcileFog(fogUuid) expect(RouterService.updateRouter).to.have.been.calledTwice + expect(RouterService.buildFreshRouterMicroserviceConfig).to.have.been.calledTwice expect(ServiceBridgeConfig.recomputeServiceBridgeConfig).to.have.been.calledTwice }) @@ -213,6 +226,59 @@ describe('Fog platform service', () => { }, transaction) }) + context('when persisted router endpoints change during reconcile', () => { + const mutableRouter = { + id: 11, + iofogUuid: fogUuid, + isEdge: true, + host: '10.0.0.1', + messagingPort: 5671, + interRouterPort: null, + edgeRouterPort: null + } + + beforeEach(() => { + const hostChangedSpec = { + ...spec, + host: '10.0.0.2' + } + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid, + generation: 2, + spec: hostChangedSpec + }) + FogManager.findOneWithTags.resolves({ ...fog, host: '10.0.0.2' }) + RouterManager.findOne.callsFake((query) => { + if (query && query.isDefault) { + return Promise.resolve({ id: 1, iofogUuid: 'default', isDefault: true }) + } + 
return Promise.resolve(mutableRouter) + }) + RouterService.updateRouter.callsFake(async (_router, updates) => { + Object.assign(mutableRouter, updates) + return mutableRouter + }) + RouterConnectionManager.findAllWithRouters.callsFake((query) => { + if (query && query.destRouter === mutableRouter.id) { + return Promise.resolve([{ source: { iofogUuid: 'edge-downstream' } }]) + } + if (query && query.sourceRouter === mutableRouter.id) { + return Promise.resolve([]) + } + return Promise.resolve([]) + }) + }) + + it('enqueues downstream fog platform reconcile after upstream finalize', async () => { + await FogPlatformService.reconcileFog(fogUuid) + + expect(ReconcileOutboxManager.enqueueFogPlatform).to.have.been.calledWith({ + fogUuid: 'edge-downstream', + reason: 'spec-changed' + }, transaction) + }) + }) + context('when upstreamRouters is omitted from spec', () => { it('passes undefined to validateAndReturnUpstreamRouters on first create', async () => { RouterManager.findOne.callsFake((query) => { @@ -279,6 +345,75 @@ describe('Fog platform service', () => { ) }) }) + + context('when upstreamNatsServers is omitted from spec', () => { + it('preserves existing NATS upstream connections during reconcile prepare', async () => { + NatsInstanceManager.findOne.resolves({ id: 1, isHub: true, iofogUuid: 'hub' }) + NatsInstanceManager.findByFog.resolves({ id: 5, isLeaf: true, iofogUuid: fogUuid }) + NatsConnectionManager.findAllWithNats.resolves([ + { dest: { id: 1, isHub: true, iofogUuid: 'hub' } } + ]) + + await FogPlatformService.reconcileFog(fogUuid) + + expect(NatsService.ensureNatsForFogPhased).to.have.been.calledWith( + sinon.match.any, + sinon.match.has('upstreamNatsServers', ['default-nats-hub']) + ) + }) + }) + }) + + describe('.getDownstreamFogUuidsForUpstream()', () => { + it('returns downstream fogs connected via router and NATS upstream', async () => { + $sandbox.stub(RouterManager, 'findOne').resolves({ id: 10, iofogUuid: fogUuid }) + 
$sandbox.stub(RouterConnectionManager, 'findAllWithRouters').resolves([ + { source: { iofogUuid: 'edge-downstream' } } + ]) + $sandbox.stub(NatsInstanceManager, 'findByFog').resolves({ id: 20, iofogUuid: fogUuid }) + $sandbox.stub(NatsConnectionManager, 'findAllWithNats').resolves([ + { source: { iofogUuid: 'leaf-downstream' } } + ]) + + const result = await FogPlatformService.getDownstreamFogUuidsForUpstream(fogUuid, transaction) + + expect(result).to.have.members(['edge-downstream', 'leaf-downstream']) + }) + }) + + describe('.endpointsChanged()', () => { + it('detects host and port drift', () => { + expect(FogPlatformService.endpointsChanged( + { host: '10.0.0.1', routerHost: '10.0.0.1', messagingPort: '5671' }, + { host: '10.0.0.2', routerHost: '10.0.0.2', messagingPort: '5671' } + )).to.equal(true) + expect(FogPlatformService.endpointsChanged( + { host: '10.0.0.1', routerHost: '10.0.0.1', messagingPort: '5671' }, + { host: '10.0.0.2', routerHost: '10.0.0.1', messagingPort: '5671' } + )).to.equal(true) + expect(FogPlatformService.endpointsChanged( + { host: '10.0.0.1', routerHost: '10.0.0.1', messagingPort: '5671' }, + { host: '10.0.0.1', routerHost: '10.0.0.1', messagingPort: '5671' } + )).to.equal(false) + }) + }) + + describe('.resolveNatsConfigFromSpec()', () => { + it('preserves existing NATS upstream connections when spec omits upstreamNatsServers', async () => { + $sandbox.stub(NatsInstanceManager, 'findOne').resolves({ id: 1, isHub: true, iofogUuid: 'hub' }) + $sandbox.stub(NatsInstanceManager, 'findByFog').resolves({ id: 5, iofogUuid: fogUuid, isLeaf: true }) + $sandbox.stub(NatsConnectionManager, 'findAllWithNats').resolves([ + { dest: { id: 1, iofogUuid: 'hub', isHub: true } } + ]) + + const result = await FogPlatformService.resolveNatsConfigFromSpec( + fogUuid, + { natsMode: 'leaf' }, + transaction + ) + + expect(result.upstreamNatsServers).to.eql(['default-nats-hub']) + }) }) describe('.markReconcileFailed()', () => { diff --git 
a/test/src/services/iofog-service.test.js b/test/src/services/iofog-service.test.js index 41c0eb5d..ff4660a4 100644 --- a/test/src/services/iofog-service.test.js +++ b/test/src/services/iofog-service.test.js @@ -114,6 +114,8 @@ function stubUpdateFogDeps (sandbox, oldFog) { sandbox.stub(FogPlatformSpecManager, 'upsertSpec').resolves({ fogUuid: oldFog.uuid, generation: 2 }) sandbox.stub(FogPlatformStatusManager, 'ensurePending').resolves() sandbox.stub(ReconcileOutboxManager, 'enqueueFogPlatform').resolves() + sandbox.stub(NatsInstanceManager, 'findByFog').resolves(null) + sandbox.stub(NatsConnectionManager, 'findAllWithNats').resolves([]) } describe('ioFog Service', () => { @@ -315,6 +317,44 @@ describe('ioFog Service', () => { .to.be.rejectedWith('Agent Resource Name is immutable') }) + context('when upstream endpoint changes', () => { + beforeEach(() => { + RouterManager.findOne.callsFake((query) => { + if (query && query.iofogUuid === uuid) { + return Promise.resolve({ id: 10, iofogUuid: uuid, messagingPort: 5671, host: '1.2.3.4' }) + } + return Promise.resolve({ id: 1, isDefault: true }) + }) + RouterConnectionManager.findAllWithRouters.callsFake((query) => { + if (query && query.destRouter === 10) { + return Promise.resolve([{ source: { iofogUuid: 'edge-downstream' } }]) + } + return Promise.resolve([]) + }) + NatsInstanceManager.findByFog.resolves({ id: 20, iofogUuid: uuid, serverPort: 4222 }) + NatsConnectionManager.findAllWithNats.resolves([]) + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid: uuid, + generation: 1, + spec: { + routerMode: 'edge', + natsMode: 'leaf', + host: '1.2.3.4', + messagingPort: 5671 + } + }) + }) + + it('does not enqueue downstream platform reconcile on PATCH', async () => { + await $subject + + expect(ReconcileOutboxManager.enqueueFogPlatform).to.not.have.been.calledWith({ + fogUuid: 'edge-downstream', + reason: 'spec-changed' + }, transaction) + }) + }) + context('when fog is not found', () => { beforeEach(() => { 
ioFogManager.findOne.resolves(null) diff --git a/test/src/services/router-service.test.js b/test/src/services/router-service.test.js index 4a6f72e7..48a3483a 100644 --- a/test/src/services/router-service.test.js +++ b/test/src/services/router-service.test.js @@ -373,6 +373,32 @@ describe('Router Service', () => { const updatedConfig = JSON.parse(MicroserviceManager.update.firstCall.args[1].config) expect(updatedConfig.bridges).to.eql(preservedBridges) }) + + it('persists router config when upstream connector fingerprint changes', async () => { + MicroserviceManager.findOne.resolves({ + id: 1, + uuid: 'routerMsvcUuid', + iofogUuid: router.iofogUuid, + catalogItemId: routerCatalogItem.id, + config: JSON.stringify({ + connectors: { + 'old-upstream': { + name: 'old-upstream', + host: '10.0.0.9', + port: '55671', + role: 'edge', + sslProfile: 'router-site-server-test-fog' + } + } + }) + }) + + await RouterService.updateConfig(routerID, containerEngine, transaction) + + expect(MicroserviceManager.update).to.have.been.called + const updatedConfig = JSON.parse(MicroserviceManager.update.firstCall.args[1].config) + expect(updatedConfig.connectors).to.not.have.property('old-upstream') + }) }) describe('.updateRouter', () => { From 415bb2d91a33732c1a7dbbce274f9300126d2603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 09:34:42 +0300 Subject: [PATCH 20/32] Harden multi-replica WebSocket exec and log orphan session cleanup. Tear down sessions only when both sides are disconnected, reconcile stale DB rows with no live sockets, expose agent sessions with userConnected only, and raise per-resource concurrent session limits to 5. 
--- src/config/config.yaml | 20 +- src/jobs/ws-session-reconcile-job.js | 66 +++++- src/services/agent-service.js | 9 +- src/websocket/server.js | 213 ++++++++++++++---- test/src/websocket/ws-session-orphan.test.js | 216 +++++++++++++++++++ 5 files changed, 465 insertions(+), 59 deletions(-) create mode 100644 test/src/websocket/ws-session-orphan.test.js diff --git a/src/config/config.yaml b/src/config/config.yaml index c4b97789..53851641 100644 --- a/src/config/config.yaml +++ b/src/config/config.yaml @@ -23,15 +23,15 @@ server: timeout: 3600000 # Legacy idle fallback (ms); exec uses execMaxDurationMs maxConnections: 100 # Maximum connections per session cleanupInterval: 30000 # Session cleanup interval (30 seconds) - execPendingTimeoutMs: 60000 # Exec: user wait for agent (R81) - execMaxDurationMs: 28800000 # Exec: max active session 8h (R81) - execMaxConcurrentPerResource: 3 # Exec: max user WS per MS (R92) - logPendingTimeoutMs: 120000 # Log: user wait for agent (R82) - logIdleTimeoutMs: 7200000 # Log: idle session 2h (R82) - logMaxConcurrentPerResource: 3 # Log: max user WS per MS or fog (R82) - logTailMaxLines: 5000 # Log: tail query param max (R82) - replicaMaxConcurrentWs: 500 # Scale SLO per replica (R88) - drainTimeoutMs: 30000 # Graceful drain on SIGTERM/preStop (R85) + execPendingTimeoutMs: 60000 # Exec: user wait for agent + execMaxDurationMs: 28800000 # Exec: max active session 8h + execMaxConcurrentPerResource: 5 # Exec: max user WS per MS + logPendingTimeoutMs: 120000 # Log: user wait for agent + logIdleTimeoutMs: 7200000 # Log: idle session 2h + logMaxConcurrentPerResource: 5 # Log: max user WS per MS or fog + logTailMaxLines: 5000 # Log: tail query param max + replicaMaxConcurrentWs: 500 # Scale SLO per replica + drainTimeoutMs: 30000 # Graceful drain on SIGTERM/preStop relay: amqp: poolSize: 8 @@ -42,7 +42,7 @@ server: maxPendingMessages: 8192 publishTimeoutMs: 5000 ha: - crossReplicaRequiresAmqp: true # Cross-replica exec/log relay requires AMQP
router (R84) + crossReplicaRequiresAmqp: true # Cross-replica exec/log relay requires AMQP router failFastOnRouterUnavailable: true security: maxConnectionsPerIp: 10 diff --git a/src/jobs/ws-session-reconcile-job.js b/src/jobs/ws-session-reconcile-job.js index c5a0a07a..7b9a7b2e 100644 --- a/src/jobs/ws-session-reconcile-job.js +++ b/src/jobs/ws-session-reconcile-job.js @@ -57,6 +57,27 @@ async function reconcileStaleSessionsInTransaction (transaction) { if (execSessionManager.getExecSession(sessionId)) continue + if (!row.userConnected && !row.agentConnected) { + await MicroserviceExecSessionManager.deleteBySessionId(sessionId, transaction) + + const microservice = await MicroserviceManager.findOne({ uuid: microserviceUuid }, transaction) + if (microservice) { + await ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceExecSessions, + transaction + ) + } + + execCleaned++ + logger.info('Reconciled orphaned exec session row:' + JSON.stringify({ + sessionId, + microserviceUuid, + status: row.status + })) + continue + } + const age = now - new Date(row.updatedAt).getTime() const threshold = row.status === 'PENDING' ? 
execPendingTimeout : execMaxDuration if (age < threshold) continue @@ -88,6 +109,27 @@ async function reconcileStaleSessionsInTransaction (transaction) { for (const row of msLogRows) { if (logSessionManager.getLogSession(row.sessionId)) continue + if (!row.userConnected && !row.agentConnected) { + await MicroserviceLogStatusManager.delete({ sessionId: row.sessionId }, transaction) + logCleaned++ + + const microservice = await MicroserviceManager.findOne({ uuid: row.microserviceUuid }, transaction) + if (microservice) { + await ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceLogs, + transaction + ) + } + + logger.info('Reconciled orphaned microservice log row:' + JSON.stringify({ + sessionId: row.sessionId, + microserviceUuid: row.microserviceUuid, + status: row.status + })) + continue + } + const age = now - new Date(row.updatedAt).getTime() const threshold = row.status === 'PENDING' ? logPendingTimeout : logIdleTimeout if (age < threshold) continue @@ -119,6 +161,27 @@ async function reconcileStaleSessionsInTransaction (transaction) { for (const row of fogLogRows) { if (logSessionManager.getLogSession(row.sessionId)) continue + if (!row.userConnected && !row.agentConnected) { + await FogLogStatusManager.delete({ sessionId: row.sessionId }, transaction) + logCleaned++ + + const fog = await FogManager.findOne({ uuid: row.iofogUuid }, transaction) + if (fog) { + await ChangeTrackingService.update( + fog.uuid, + ChangeTrackingService.events.fogLogs, + transaction + ) + } + + logger.info('Reconciled orphaned fog log row:' + JSON.stringify({ + sessionId: row.sessionId, + iofogUuid: row.iofogUuid, + status: row.status + })) + continue + } + const age = now - new Date(row.updatedAt).getTime() const threshold = row.status === 'PENDING' ? 
logPendingTimeout : logIdleTimeout if (age < threshold) continue @@ -158,5 +221,6 @@ async function reconcileStaleSessions () { } module.exports = { - run + run, + reconcileStaleSessionsInTransaction } diff --git a/src/services/agent-service.js b/src/services/agent-service.js index e30a76ca..ffd4f65e 100644 --- a/src/services/agent-service.js +++ b/src/services/agent-service.js @@ -645,7 +645,8 @@ const getAgentLogSessions = async function (fog, transaction) { const msSessions = await MicroserviceLogStatusManager.findAll( { microserviceUuid: { [Op.in]: microserviceUuids }, - status: { [Op.in]: ['PENDING', 'ACTIVE'] } + status: { [Op.in]: ['PENDING', 'ACTIVE'] }, + userConnected: true }, transaction ) @@ -664,7 +665,8 @@ const getAgentLogSessions = async function (fog, transaction) { const fogSessions = await FogLogStatusManager.findAll( { iofogUuid: fog.uuid, - status: { [Op.in]: ['PENDING', 'ACTIVE'] } + status: { [Op.in]: ['PENDING', 'ACTIVE'] }, + userConnected: true }, transaction ) @@ -697,7 +699,8 @@ const getAgentExecSessions = async function (fog, transaction) { const msSessions = await MicroserviceExecSessionManager.findAll( { microserviceUuid: { [Op.in]: microserviceUuids }, - status: { [Op.in]: ['PENDING', 'ACTIVE'] } + status: { [Op.in]: ['PENDING', 'ACTIVE'] }, + userConnected: true }, transaction ) diff --git a/src/websocket/server.js b/src/websocket/server.js index cab2466d..4dc3b83f 100644 --- a/src/websocket/server.js +++ b/src/websocket/server.js @@ -28,6 +28,8 @@ const FogLogStatusManager = require('../data/managers/fog-log-status-manager') const ChangeTrackingService = require('../services/change-tracking-service') const FogManager = require('../data/managers/iofog-manager') const FogStates = require('../enums/fog-state') +const Sequelize = require('sequelize') +const Op = Sequelize.Op const MESSAGE_TYPES = { STDIN: 0, @@ -376,11 +378,11 @@ class WebSocketServer { } getLogConcurrencyLimit () { - return 
this.sessionConfig.logMaxConcurrentPerResource || 3 + return this.sessionConfig.logMaxConcurrentPerResource || 5 } getExecConcurrencyLimit () { - return this.sessionConfig.execMaxConcurrentPerResource || 3 + return this.sessionConfig.execMaxConcurrentPerResource || 5 } getLogTailMaxLines () { @@ -832,6 +834,52 @@ class WebSocketServer { return !!(row && row.agentConnected) } + async _checkLogUserConnectedInDb (sessionId, microserviceUuid, fogUuid, transaction) { + let row = null + if (microserviceUuid) { + row = await MicroserviceLogStatusManager.findOne({ sessionId }, transaction) + } else if (fogUuid) { + row = await FogLogStatusManager.findOne({ sessionId }, transaction) + } + return !!(row && row.userConnected) + } + + async _isLogUserStillConnected (sessionId, session, microserviceUuid, fogUuid) { + if (session && session.user) { + return true + } + try { + return await transactionRunner.runInTransaction( + (tx) => this._checkLogUserConnectedInDb(sessionId, microserviceUuid, fogUuid, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.log.user-connected-db-check' } + ) + } catch (error) { + logger.warn('Log user-connected DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) + return false + } + } + + async _isExecUserStillConnected (sessionId, session) { + if (session && session.user) { + return true + } + try { + return await transactionRunner.runInTransaction( + (tx) => this._checkExecUserConnectedInDb(sessionId, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.exec.user-connected-db-check' } + ) + } catch (error) { + logger.warn('Exec user-connected DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) + return false + } + } + async _notifyExecRemotePeerClose (sessionId, session, reason = 'Exec session expired') { if (!session || !this.relayTransport.shouldUseRelay(sessionId)) { return @@ -1167,6 +1215,14 @@ class WebSocketServer { } })() + if (currentSession.agent) { + if (currentSession.agent.readyState 
=== WebSocket.OPEN) { + currentSession.agent.close(1000, 'User closed connection') + } + await this._cleanupExecSessionInTransaction(sessionId) + return + } + const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) if (relayEnabled) { try { @@ -1186,15 +1242,88 @@ class WebSocketServer { error: error.message }) } - } else if (currentSession.agent && currentSession.agent.readyState === WebSocket.OPEN) { - currentSession.agent.close(1000, 'User closed connection') } - if (!currentSession.agent) { + let agentStillConnected = false + try { + agentStillConnected = await transactionRunner.runInTransaction( + (tx) => this._checkExecAgentPairedInDb(sessionId, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.exec.user-partial-agent-db-check' } + ) + } catch (error) { + logger.warn('Exec user partial disconnect agent DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + + if (!agentStillConnected) { + await this._cleanupExecSessionInTransaction(sessionId) + } else { await this._detachExecSessionLocal(sessionId) } } + async _handleUserLogPartialDisconnect (sessionId, session, microserviceUuid, fogUuid) { + session.user = null + session.remoteUserPaired = false + session.lastActivity = Date.now() + + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + if (microserviceUuid) { + await MicroserviceLogStatusManager.update( + { sessionId }, + { userConnected: false }, + closeTransaction + ) + } else if (fogUuid) { + await FogLogStatusManager.update( + { sessionId }, + { userConnected: false }, + closeTransaction + ) + } + + const fogForTracking = await FogManager.findOne({ + uuid: fogUuid || (await MicroserviceManager.findOne({ uuid: microserviceUuid }, closeTransaction)).iofogUuid + }, closeTransaction) + if (fogForTracking) { + await ChangeTrackingService.update( + fogForTracking.uuid, + fogUuid ? 
ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, + closeTransaction + ) + } + })() + + if (session.agent) { + if (session.agent.readyState === WebSocket.OPEN) { + session.agent.close(1000, 'User closed connection') + } + await this._cleanupLogSessionInTransaction(sessionId) + return + } + + let agentStillConnected = false + try { + agentStillConnected = await transactionRunner.runInTransaction( + (tx) => this._checkLogAgentPairedInDb(sessionId, microserviceUuid, fogUuid, tx), + { priority: PRIORITY_BACKGROUND, label: 'ws.log.user-partial-agent-db-check' } + ) + } catch (error) { + logger.warn('Log user partial disconnect agent DB check failed:' + JSON.stringify({ + sessionId, + error: error.message + })) + } + + if (!agentStillConnected) { + await this._cleanupLogSessionInTransaction(sessionId) + } else { + await this._detachLogSessionLocal(sessionId) + } + } + async _handleAgentLogPartialDisconnect (sessionId, session, { microserviceUuid, iofogUuid, logStatus }) { session.agent = null session.remoteAgentPaired = false @@ -1249,12 +1378,22 @@ class WebSocketServer { } async countLogSessionsInDb (microserviceUuid, fogUuid, transaction) { + const activeUserFilter = { + userConnected: true, + status: { [Op.in]: ['PENDING', 'ACTIVE'] } + } if (microserviceUuid) { - const rows = await MicroserviceLogStatusManager.findAll({ microserviceUuid }, transaction) + const rows = await MicroserviceLogStatusManager.findAll({ + microserviceUuid, + ...activeUserFilter + }, transaction) return rows.length } if (fogUuid) { - const rows = await FogLogStatusManager.findAll({ iofogUuid: fogUuid }, transaction) + const rows = await FogLogStatusManager.findAll({ + iofogUuid: fogUuid, + ...activeUserFilter + }, transaction) return rows.length } return 0 @@ -1264,7 +1403,11 @@ class WebSocketServer { if (!microserviceUuid) { return 0 } - const rows = await MicroserviceExecSessionManager.findAll({ microserviceUuid }, transaction) + const rows = await 
MicroserviceExecSessionManager.findAll({ + microserviceUuid, + userConnected: true, + status: { [Op.in]: ['PENDING', 'ACTIVE'] } + }, transaction) return rows.length } @@ -1926,9 +2069,9 @@ class WebSocketServer { ws.on('close', async (code, reason) => { const currentSession = this.execSessionManager.getExecSession(sessionId) if (currentSession) { - const relayEnabled = this.relayTransport.shouldUseRelay(sessionId) try { - if (currentSession.user != null || relayEnabled) { + const userStillConnected = await this._isExecUserStillConnected(sessionId, currentSession) + if (userStillConnected) { await this._handleAgentExecPartialDisconnect(sessionId, currentSession, fog) } else { await this._cleanupExecSessionInTransaction(sessionId) @@ -2759,41 +2902,19 @@ class WebSocketServer { } } - if (agentConnected) { - await TransactionDecorator.generateTransaction(async (closeTransaction) => { - if (microserviceUuid) { - await MicroserviceLogStatusManager.update( - { sessionId }, - { userConnected: false }, - closeTransaction - ) - } else if (fogUuid) { - await FogLogStatusManager.update( - { sessionId }, - { userConnected: false }, - closeTransaction - ) - } - - const fogForTracking = await FogManager.findOne({ - uuid: fogUuid || (await MicroserviceManager.findOne({ uuid: microserviceUuid }, closeTransaction)).iofogUuid - }, closeTransaction) - await ChangeTrackingService.update( - fogForTracking.uuid, - fogUuid ? 
ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, - closeTransaction - ) - })() + if (agentStillConnected) { + await this._handleUserLogPartialDisconnect( + sessionId, + session, + microserviceUuid, + fogUuid + ) logger.info('Log session user disconnected (agent still connected):' + JSON.stringify({ sessionId, microserviceUuid: microserviceUuid || null, fogUuid: fogUuid || null, closeCode: code })) - session.remoteAgentPaired = false - if (!session.agent) { - await this._detachLogSessionLocal(sessionId) - } } else { logger.info('Log session user disconnected (full cleanup):' + JSON.stringify({ sessionId, @@ -3030,17 +3151,19 @@ class WebSocketServer { ws.on('close', async (code, reason) => { const session = this.logSessionManager.getLogSession(sessionId) if (session) { - const relayEnabled = this.relayTransport.shouldUseRelayForLogs(sessionId) - const partialDisconnect = session.user != null || relayEnabled - try { - if (partialDisconnect) { + const userStillConnected = await this._isLogUserStillConnected( + sessionId, + session, + microserviceUuid, + iofogUuid + ) + if (userStillConnected) { logger.info('Log session agent disconnected (partial detach):' + JSON.stringify({ sessionId, microserviceUuid: microserviceUuid || null, fogUuid: iofogUuid || null, - userConnected: session.user != null, - relayEnabled, + userConnected: true, closeCode: code })) await this._handleAgentLogPartialDisconnect(sessionId, session, { diff --git a/test/src/websocket/ws-session-orphan.test.js b/test/src/websocket/ws-session-orphan.test.js new file mode 100644 index 00000000..5f61024f --- /dev/null +++ b/test/src/websocket/ws-session-orphan.test.js @@ -0,0 +1,216 @@ +const { expect } = require('chai') +const sinon = require('sinon') +const WebSocket = require('ws') + +const WebSocketServerClass = require('../../../src/websocket/server') +const Sequelize = require('sequelize') +const Op = Sequelize.Op +const MicroserviceLogStatusManager = 
require('../../../src/data/managers/microservice-log-status-manager') +const MicroserviceExecSessionManager = require('../../../src/data/managers/microservice-exec-session-manager') +const FogLogStatusManager = require('../../../src/data/managers/fog-log-status-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const FogManager = require('../../../src/data/managers/iofog-manager') +const ChangeTrackingService = require('../../../src/services/change-tracking-service') +const EventService = require('../../../src/services/event-service') +const { reconcileStaleSessionsInTransaction } = require('../../../src/jobs/ws-session-reconcile-job') +const agentService = require('../../../src/services/agent-service') +const { + createMockWebSocket, + createMockRequest, + createMockNatsRelayTransport, + resetWebSocketServerSingleton, + newTestIds, + delay +} = require('../../support/ws-session-harness') +const { resetTransportForTests } = require('../../../src/services/ws-relay-transport-factory') + +describe('WebSocket session orphan cleanup', () => { + def('sandbox', () => sinon.createSandbox()) + def('ids', () => newTestIds()) + + let wsServer + let transaction + + beforeEach(() => { + resetTransportForTests() + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + transaction = { fakeTransaction: true } + $sandbox.stub(EventService, 'createWsConnectEvent').resolves() + $sandbox.stub(EventService, 'createWsDisconnectEvent').resolves() + }) + + afterEach(() => { + $sandbox.restore() + resetTransportForTests() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + it('countLogSessionsInDb counts only userConnected PENDING/ACTIVE rows', async () => { + const findAll = $sandbox.stub(MicroserviceLogStatusManager, 'findAll').resolves([ + { sessionId: 'live-1' } + ]) + + const count = await wsServer.countLogSessionsInDb($ids.microserviceUuid, null, transaction) + + 
expect(count).to.equal(1) + expect(findAll).to.have.been.calledOnce + const query = findAll.firstCall.args[0] + expect(query.microserviceUuid).to.equal($ids.microserviceUuid) + expect(query.userConnected).to.equal(true) + expect(query.status[Op.in]).to.deep.equal(['PENDING', 'ACTIVE']) + }) + + it('countExecSessionsInDb counts only userConnected PENDING/ACTIVE rows', async () => { + const findAll = $sandbox.stub(MicroserviceExecSessionManager, 'findAll').resolves([]) + + await wsServer.countExecSessionsInDb($ids.microserviceUuid, transaction) + + expect(findAll).to.have.been.calledOnce + const query = findAll.firstCall.args[0] + expect(query.userConnected).to.equal(true) + expect(query.status[Op.in]).to.deep.equal(['PENDING', 'ACTIVE']) + }) + + it('full-cleans log session when agent disconnects after user already left', async () => { + const logRow = { + sessionId: $ids.sessionId, + microserviceUuid: $ids.microserviceUuid, + iofogUuid: null, + tailConfig: JSON.stringify({ lines: 100, follow: true, since: null, until: null }), + agentConnected: true, + userConnected: false + } + + wsServer.relayTransport = createMockNatsRelayTransport() + + $sandbox.stub(MicroserviceLogStatusManager, 'findOne').callsFake(async () => ({ ...logRow })) + $sandbox.stub(MicroserviceLogStatusManager, 'update').callsFake(async (_where, patch) => { + Object.assign(logRow, patch) + }) + $sandbox.stub(MicroserviceLogStatusManager, 'delete').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid }) + $sandbox.stub(FogManager, 'findOne').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + $sandbox.stub(wsServer, 'validateAgentLogsConnection').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(wsServer, 'cleanupLogSession').resolves() + + const agentWs = createMockWebSocket() + const agentReq = createMockRequest( + `/api/v3/agent/logs/microservice/${$ids.microserviceUuid}/${$ids.sessionId}` + ) + 
agentReq.headers.authorization = 'Bearer fog-token' + + await wsServer.handleAgentLogsConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + null, + $ids.sessionId, + transaction + ) + await delay(20) + + MicroserviceLogStatusManager.update.resetHistory() + agentWs.close(1006) + await delay(50) + + expect(wsServer.cleanupLogSession).to.have.been.calledOnceWith($ids.sessionId, sinon.match.object) + expect(MicroserviceLogStatusManager.update).to.not.have.been.called + }) + + it('full-cleans exec session when agent disconnects after user already left', async () => { + const execRow = { + sessionId: $ids.sessionId, + microserviceUuid: $ids.microserviceUuid, + status: 'ACTIVE', + userConnected: false, + agentConnected: true + } + + wsServer.relayTransport = createMockNatsRelayTransport() + + $sandbox.stub(MicroserviceExecSessionManager, 'findBySessionId').callsFake(async () => ({ ...execRow })) + $sandbox.stub(MicroserviceExecSessionManager, 'update').callsFake(async (_where, patch) => { + Object.assign(execRow, patch) + }) + $sandbox.stub(MicroserviceExecSessionManager, 'deleteBySessionId').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid }) + $sandbox.stub(FogManager, 'findOne').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + $sandbox.stub(wsServer, 'validateAgentExecConnection').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(wsServer, 'cleanupExecSession').resolves() + + const agentWs = createMockWebSocket() + const agentReq = createMockRequest( + `/api/v3/agent/exec/microservice/${$ids.microserviceUuid}/${$ids.sessionId}` + ) + agentReq.headers.authorization = 'Bearer fog-token' + + await wsServer.handleAgentExecConnection( + agentWs, + agentReq, + 'Bearer fog-token', + $ids.microserviceUuid, + $ids.sessionId, + transaction + ) + await delay(20) + + MicroserviceExecSessionManager.update.resetHistory() + agentWs.close(1006) + await delay(50) + + 
expect(wsServer.cleanupExecSession).to.have.been.calledOnceWith($ids.sessionId, sinon.match.object) + expect(MicroserviceExecSessionManager.update).to.not.have.been.called + }) + + it('getAgentLogSessions omits rows without a connected user', async () => { + const fog = { uuid: $ids.fogUuid } + $sandbox.stub(MicroserviceManager, 'findAll').resolves([{ uuid: $ids.microserviceUuid }]) + $sandbox.stub(MicroserviceLogStatusManager, 'findAll').resolves([ + { + microserviceUuid: $ids.microserviceUuid, + sessionId: 'live-session', + tailConfig: JSON.stringify({ lines: 100, follow: true }), + status: 'ACTIVE', + agentConnected: false + } + ]) + $sandbox.stub(FogLogStatusManager, 'findAll').resolves([]) + + await agentService.getAgentLogSessions(fog, transaction) + + const query = MicroserviceLogStatusManager.findAll.firstCall.args[0] + expect(query.userConnected).to.equal(true) + }) + + it('reconcile deletes orphaned log rows with both sides disconnected', async () => { + const wsInstance = wsServer + $sandbox.stub(WebSocketServerClass, 'getInstance').returns(wsInstance) + + $sandbox.stub(MicroserviceExecSessionManager, 'findAll').resolves([]) + $sandbox.stub(MicroserviceLogStatusManager, 'findAll').resolves([ + { + sessionId: 'orphan-log', + microserviceUuid: $ids.microserviceUuid, + status: 'ACTIVE', + userConnected: false, + agentConnected: false, + updatedAt: new Date(Date.now() - 1000) + } + ]) + $sandbox.stub(FogLogStatusManager, 'findAll').resolves([]) + $sandbox.stub(MicroserviceLogStatusManager, 'delete').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid }) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + + await reconcileStaleSessionsInTransaction(transaction) + + expect(MicroserviceLogStatusManager.delete).to.have.been.calledOnceWith( + { sessionId: 'orphan-log' }, + transaction + ) + }) +}) From 421d8d3bfb26829f3a8e1527e93f62e59a14158c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: 
Fri, 3 Jul 2026 09:34:47 +0300 Subject: [PATCH 21/32] Recreate Kubernetes services when update races a missing resource. Extract shared service spec builders, fall back to create on 404 during patch, and broaden client-node 404/409 detection for string error bodies. --- src/services/services-service.js | 123 ++++++++++++--------- src/utils/k8s-client.js | 22 +++- test/src/services/services-service.test.js | 12 ++ test/src/utils/k8s-client.test.js | 48 ++++++++ 4 files changed, 149 insertions(+), 56 deletions(-) create mode 100644 test/src/utils/k8s-client.test.js diff --git a/src/services/services-service.js b/src/services/services-service.js index bd5cf1c8..a278a9ac 100644 --- a/src/services/services-service.js +++ b/src/services/services-service.js @@ -484,69 +484,86 @@ function _getK8sServiceLabels () { } } +// Helper function to build Kubernetes Service spec for create +function _buildK8sServiceSpec (serviceConfig) { + const normalizedTags = (serviceConfig.tags || []).map(tag => tag.includes(':') ? tag : `${tag}:`) + const componentLabelKey = getComponentLabelKey() + return { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: serviceConfig.name, + labels: _getK8sServiceLabels(), + annotations: normalizedTags.reduce((acc, tag) => { + const [key, value] = tag.split(':') + acc[key] = (value || '').trim() + return acc + }, {}) + }, + spec: { + type: serviceConfig.k8sType, + selector: { + [componentLabelKey]: 'router' + }, + ports: [{ + name: 'iofog-service', + targetPort: parseInt(serviceConfig.bridgePort), + port: parseInt(serviceConfig.servicePort), + protocol: 'TCP' + }] + } + } +} + +function _buildK8sServicePatchData (serviceConfig) { + const normalizedTags = (serviceConfig.tags || []).map(tag => tag.includes(':') ? 
tag : `${tag}:`) + const componentLabelKey = getComponentLabelKey() + return { + metadata: { + labels: _getK8sServiceLabels(), + annotations: normalizedTags.reduce((acc, tag) => { + const [key, value] = tag.split(':') + acc[key] = (value || '').trim() + return acc + }, {}) + }, + spec: { + type: serviceConfig.k8sType, + selector: { + [componentLabelKey]: 'router' + }, + ports: [{ + name: 'iofog-service', + port: parseInt(serviceConfig.servicePort), + targetPort: parseInt(serviceConfig.bridgePort), + protocol: 'TCP' + }] + } + } +} + // Helper function to create or update a Kubernetes service resource (I/O only; no DB). // Returns LoadBalancer IP when assigned, otherwise null. async function _syncK8sServiceResource (serviceConfig) { const existingService = await K8sClient.getService(serviceConfig.name, { ignoreNotFound: true }) + const serviceSpec = _buildK8sServiceSpec(serviceConfig) + if (!existingService) { logger.debug(`Service not found: ${serviceConfig.name}, creating new service`) - const normalizedTags = serviceConfig.tags.map(tag => tag.includes(':') ? tag : `${tag}:`) - const componentLabelKey = getComponentLabelKey() - const serviceSpec = { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: serviceConfig.name, - labels: _getK8sServiceLabels(), - annotations: normalizedTags.reduce((acc, tag) => { - const [key, value] = tag.split(':') - acc[key] = (value || '').trim() - return acc - }, {}) - }, - spec: { - type: serviceConfig.k8sType, - selector: { - [componentLabelKey]: 'router' - }, - ports: [{ - name: 'iofog-service', - targetPort: parseInt(serviceConfig.bridgePort), - port: parseInt(serviceConfig.servicePort), - protocol: 'TCP' - }] - } - } - await K8sClient.createService(serviceSpec) } else { - const normalizedTags = serviceConfig.tags.map(tag => tag.includes(':') ? 
tag : `${tag}:`) - const componentLabelKey = getComponentLabelKey() - const patchData = { - metadata: { - labels: _getK8sServiceLabels(), - annotations: normalizedTags.reduce((acc, tag) => { - const [key, value] = tag.split(':') - acc[key] = (value || '').trim() - return acc - }, {}) - }, - spec: { - type: serviceConfig.k8sType, - selector: { - [componentLabelKey]: 'router' - }, - ports: [{ - name: 'iofog-service', - port: parseInt(serviceConfig.servicePort), - targetPort: parseInt(serviceConfig.bridgePort), - protocol: 'TCP' - }] + const patchData = _buildK8sServicePatchData(serviceConfig) + logger.debug(`Updating service: ${serviceConfig.name}`) + try { + await K8sClient.updateService(serviceConfig.name, patchData) + } catch (error) { + if (K8sClient.isK8sNotFound(error)) { + logger.warn(`Service ${serviceConfig.name} missing during update, creating new service`) + await K8sClient.createService(serviceSpec) + } else { + throw error } } - - logger.debug(`Updating service: ${serviceConfig.name}`) - await K8sClient.updateService(serviceConfig.name, patchData) } if (serviceConfig.k8sType === 'LoadBalancer') { diff --git a/src/utils/k8s-client.js b/src/utils/k8s-client.js index afbd7597..3060aef8 100644 --- a/src/utils/k8s-client.js +++ b/src/utils/k8s-client.js @@ -55,14 +55,29 @@ async function getK8sAppsApi () { return k8sAppsApi } +function _parseK8sErrorBody (body) { + if (body == null) { + return body + } + if (typeof body === 'string') { + try { + return JSON.parse(body) + } catch (_) { + return body + } + } + return body +} + /** * Returns true if the error indicates a Kubernetes 404 Not Found. - * Handles both axios-style (error.response.status) and body.code/body.reason. + * Handles ApiException (client-node v1), axios-style, and Status body payloads. 
*/ function isK8sNotFound (error) { if (!error) return false + if (error.code === 404) return true if (error.response && error.response.status === 404) return true - const body = error.body || (error.response && error.response.body) + const body = _parseK8sErrorBody(error.body || (error.response && error.response.body)) if (body && (body.code === 404 || body.reason === 'NotFound')) return true return false } @@ -72,8 +87,9 @@ function isK8sNotFound (error) { */ function isK8sConflict (error) { if (!error) return false + if (error.code === 409) return true if (error.response && error.response.status === 409) return true - const body = error.body || (error.response && error.response.body) + const body = _parseK8sErrorBody(error.body || (error.response && error.response.body)) if (body && (body.code === 409 || body.reason === 'Conflict')) return true return false } diff --git a/test/src/services/services-service.test.js b/test/src/services/services-service.test.js index 89f9b7d8..f7823228 100644 --- a/test/src/services/services-service.test.js +++ b/test/src/services/services-service.test.js @@ -459,6 +459,7 @@ spec: $sandbox.stub(K8sClient, 'getService').resolves(null) $sandbox.stub(K8sClient, 'createService').resolves({ metadata: { name: 'snapshot-service' } }) $sandbox.stub(K8sClient, 'updateService').resolves({ metadata: { name: 'snapshot-service' } }) + $sandbox.stub(K8sClient, 'isK8sNotFound').returns(false) }) it('creates the K8s service when it does not exist', async () => { @@ -477,5 +478,16 @@ spec: expect(K8sClient.createService).to.not.have.been.called expect(K8sClient.updateService).to.have.been.calledOnceWith('snapshot-service', sinon.match.object) }) + + it('creates the K8s service when update returns not found', async () => { + K8sClient.getService.resolves({ metadata: { name: 'snapshot-service' } }) + K8sClient.updateService.rejects(new Error('not found')) + K8sClient.isK8sNotFound.returns(true) + + await 
ServicesService._syncK8sServiceResource(serviceConfig) + + expect(K8sClient.updateService).to.have.been.calledOnce + expect(K8sClient.createService).to.have.been.calledOnce + }) }) }) diff --git a/test/src/utils/k8s-client.test.js b/test/src/utils/k8s-client.test.js new file mode 100644 index 00000000..8e20f9e3 --- /dev/null +++ b/test/src/utils/k8s-client.test.js @@ -0,0 +1,48 @@ +'use strict' + +const { expect } = require('chai') + +const { isK8sNotFound, isK8sConflict } = require('../../../src/utils/k8s-client') + +describe('k8s-client error helpers', () => { + describe('.isK8sNotFound()', () => { + it('detects ApiException-style 404 from client-node v1', () => { + const error = { + code: 404, + message: 'Unknown API Status Code!', + body: JSON.stringify({ + kind: 'Status', + status: 'Failure', + reason: 'NotFound', + code: 404 + }) + } + expect(isK8sNotFound(error)).to.equal(true) + }) + + it('detects axios-style 404', () => { + expect(isK8sNotFound({ response: { status: 404 } })).to.equal(true) + }) + + it('detects parsed Status body', () => { + expect(isK8sNotFound({ + body: { reason: 'NotFound', code: 404 } + })).to.equal(true) + }) + + it('returns false for other errors', () => { + expect(isK8sNotFound({ code: 500 })).to.equal(false) + expect(isK8sNotFound(null)).to.equal(false) + }) + }) + + describe('.isK8sConflict()', () => { + it('detects ApiException-style 409 from client-node v1', () => { + const error = { + code: 409, + body: JSON.stringify({ reason: 'Conflict', code: 409 }) + } + expect(isK8sConflict(error)).to.equal(true) + }) + }) +}) From 3ab642027de22bffc530e79529e2814cff10ac1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 09:34:51 +0300 Subject: [PATCH 22/32] Bump embedded EdgeOps Console default version to v1.0.6. 
--- .env.example | 2 +- .github/actions/set-build-env/action.yml | 2 +- Dockerfile | 4 ++-- Makefile | 4 ++-- scripts/build-console-dev.js | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.env.example b/.env.example index e99bc01e..7639b55f 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ NODE_ENV=development # EdgeOps Console static embed (npm run build:console → dev/console/build) EDGEOPS_CONSOLE_PATH=dev/console/build # must be absolute path -EDGEOPS_CONSOLE_VERSION=v1.0.5 +EDGEOPS_CONSOLE_VERSION=v1.0.6 # EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console # EDGEOPS_CONSOLE_FLAVOR=datasance diff --git a/.github/actions/set-build-env/action.yml b/.github/actions/set-build-env/action.yml index 13d66dbd..41efa7ef 100644 --- a/.github/actions/set-build-env/action.yml +++ b/.github/actions/set-build-env/action.yml @@ -8,7 +8,7 @@ runs: shell: bash run: | VERSION="${{ env.EDGEOPS_CONSOLE_VERSION }}" - if [ -z "$VERSION" ]; then VERSION="1.0.5"; fi + if [ -z "$VERSION" ]; then VERSION="1.0.6"; fi echo "EDGEOPS_CONSOLE_VERSION=$VERSION" >> "${GITHUB_ENV}" REPO="${{ env.EDGEOPS_CONSOLE_REPO }}" diff --git a/Dockerfile b/Dockerfile index 224e4fe6..a06cd222 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM node:24-bookworm@sha256:fdddfb3e688158251943d52eba361de991548f6814007acba4917ae6b512d6be AS console-builder ARG EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console -ARG EDGEOPS_CONSOLE_VERSION=v1.0.5 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.6 ARG EDGEOPS_CONSOLE_FLAVOR=datasance RUN apt-get update \ @@ -50,7 +50,7 @@ RUN npm pack # ubi9/nodejs-24-minimal:latest — pin manifest list digest for reproducible multi-arch builds FROM registry.access.redhat.com/ubi9/nodejs-24-minimal@sha256:cc7648f8e1c7d628e4334328a712f30ea0820787bb92836cc93e349674c689bf -ARG EDGEOPS_CONSOLE_VERSION=v1.0.5 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.6 ARG IMAGE_REGISTRY ARG OCI_SOURCE_REPO ARG CONTROLLER_DISTRIBUTION=iofog diff --git 
a/Makefile b/Makefile index 96124132..8c275fc2 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Local Docker build — mirrors CI/release build-args (see .github/actions/set-build-env). -# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.5 +# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.6 FLAVOR ?= datasance IMAGE_NAME ?= controller @@ -25,7 +25,7 @@ else $(error FLAVOR must be "datasance" or "iofog", got "$(FLAVOR)") endif -EDGEOPS_CONSOLE_VERSION ?= v1.0.5 +EDGEOPS_CONSOLE_VERSION ?= v1.0.6 IMAGE_REF = $(IMAGE_REGISTRY)/$(IMAGE_NAME):$(DOCKER_TAG) diff --git a/scripts/build-console-dev.js b/scripts/build-console-dev.js index 49f567b4..09d8cf61 100644 --- a/scripts/build-console-dev.js +++ b/scripts/build-console-dev.js @@ -9,7 +9,7 @@ const CONSOLE_DIR = path.join(DEV_DIR, 'console') const BUILD_OUT = path.join(CONSOLE_DIR, 'build') const REPO = process.env.EDGEOPS_CONSOLE_REPO || 'https://github.com/Datasance/edgeops-console' -const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.5' +const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.6' const FLAVOR = process.env.EDGEOPS_CONSOLE_FLAVOR || 'datasance' function normalizeTag (version) { From 92065eafc0c317b329eff0ec954c58156cbe1379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 09:34:55 +0300 Subject: [PATCH 23/32] Document reconcile correctness and WebSocket orphan fixes in CHANGELOG. --- CHANGELOG.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cfc6032..aadd4aa0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog -## [v3.8.0] - 2026-06-17 + +## [v3.8.0] - 2026-07-03 Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is **no upgrade path** from v3.7: use a fresh database and redeploy Controller + Edgelet together. 
@@ -127,6 +128,7 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - **WebSocket exec/log session cleanup race (postgres / NATS relay)** — concurrent teardown paths (pending timeout + disconnect, NATS CLOSE + CLOSE ack, relay callback + `ws.on('close')`) reused one Sequelize transaction via AsyncLocalStorage, causing `commit has been called on this transaction` on session row delete. Exec and log cleanup are deduplicated per `sessionId`, use fresh background transactions, pending timeouts only close sockets, and relay CLOSE acks no longer trigger DB teardown. - **WebSocket exec/log cross-replica pairing** — pending timeouts no longer require a local `session.agent`; user pods mark `remoteAgentPaired` via relay delivery hooks and DB fallback (`agentConnected`). Agent pods publish **ACTIVATION** (exec) and **LOG_LINE** user notifications via NATS/AMQP relay when the user is on another replica. Same-replica log “agent connected” notify uses **LOG_LINE** (not `LOG_START` + embedded message). **`ws_pending_pairings`** and **`ws_pairing_duration_ms`** metrics are recorded from user connect through pairing completion or timeout. Cross-replica paired sessions use **max/idle duration** (not pending timeout) in periodic cleanup; agent disconnect on an agent-only pod relays **CLOSE** (exec) or **LOG_LINE** (log) to the user pod and detaches local state without deleting the DB row. - **WebSocket cross-replica exec activation** — `setupExecMessageForwarding` read `shouldUseRelay` before `enableForSession`, so agent-only pods skipped relay **ACTIVATION** and user notify on first connect (log setup was already correct). Info logs added for log session user/agent disconnect, full cleanup, and local detach. +- **WebSocket exec/log orphan session cleanup (multi-replica HA)** — agent partial disconnect no longer triggers solely because relay is enabled; teardown uses DB `userConnected` (not stale `remoteUserPaired`). 
Full DB delete when both sides are gone. Concurrency limits and `GET /agent/logs/sessions` / `GET /agent/exec/sessions` count or list only `userConnected: true` rows. Reconcile job immediately removes rows with both flags false. Same-replica user disconnect still full-cleans when the agent socket is local. - **Volume mount manager transaction propagation** — `VolumeMountingManager.findOne` / `findAll` passed `transaction` as a second Sequelize argument instead of inside the options object, so NATS fog reconcile could create a volume mount in an open transaction then fail to link it (`nats-server-conf-* not found`). Reads now honor the parent transaction like `BaseManager`. - **Volume mount service transaction propagation** — `VolumeMountService.linkVolumeMountEndpoint` / `unlinkVolumeMountEndpoint` passed `transaction` as a second Sequelize argument to `getFogs` / `addVolumeMount` / `removeVolumeMount` instead of inside the options object, causing NATS fog reconcile to hang when linking volume mounts after auth bootstrap. - **Fog platform reconcile stale errors** — `reconcileFogPrepare` clears `lastError` when entering `Progressing` so prior `SQLITE_BUSY` does not mask current reconcile state. @@ -166,6 +168,14 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - Dual writers to router microservice bridge config from fog create/update and service create/update/delete — single full-recompute path on fog reconcile. - SQLite startup lock contention on single-controller deployments — WAL + `busy_timeout` pragmas on connect, `withDbBusyRetry` on fog/service/NATS task claims, staggered reconcile-heavy job startup. - **`reconcileFog` transaction parameter** — removed unused `options` argument so worker-decorated calls receive the transaction correctly. 
+- **NATS auth post-commit orchestration** — account/user rule reissue and application NATS orchestration run in background `PRIORITY_BACKGROUND` transactions after API commit; no longer inherit committed ALS transactions (`commit has been called on this transaction`). +- **NATS resolver bundle ordering** — hub + leaf JWT bundles rebuild only after reissue/revocation commits; outbox enqueue removed from eager `scheduleReissueFor*` paths. +- **Application NATS rule / disable** — `_scheduleApplicationNatsOrchestration` post-commit with guaranteed outbox enqueue on success (R139). +- **Microservice NATS PATCH** — normalized `natsConfig` gates enable/disable/rule change; resolver bundle uses fresh account JWT reads; idempotency keys include `authGeneration` / `microserviceUuid` (R137, R140, R142). +- **User rule fan-out** — `reissueForUserRule` covers all `NatsUserManager` rows by rule id including Bearer users; revocations propagate (R143). +- **Fog router MS upstream** — router microservice config built from live router DB + connections, not stale persisted JSON; upstream topology change forces persist (R144, R145). +- **Downstream fog fan-out** — upstream interior-router or server-NATS host/port change enqueues downstream platform reconcile (R146). +- **`upstreamNatsServers` preserve-on-omit** — PATCH omitting `upstreamNatsServers` preserves existing NATS upstream connections (R147). ### Changed From 49a089dcaa9913cc1c2721f2c038c30f3c6f9b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 10:16:03 +0300 Subject: [PATCH 24/32] Raise WebSocket exec and log concurrency quota to five per resource. Align tests, operator docs, swagger, and CHANGELOG with execMaxConcurrentPerResource and logMaxConcurrentPerResource defaults. 
--- CHANGELOG.md | 8 ++++---- docs/architecture.md | 6 +++--- docs/operations/ws-sessions.md | 10 +++++----- docs/swagger.yaml | 8 ++++---- test/load/ws-pairing-load.js | 4 ++-- .../src/websocket/ws-exec-same-replica.test.js | 6 +++--- test/src/websocket/ws-lifecycle.test.js | 18 +++++++++--------- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aadd4aa0..e90bae7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,7 +43,7 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * #### WebSocket exec — multi-session - **Microservice exec REST removed** — `POST/DELETE /api/v3/microservices/:uuid/exec` and `…/system/:uuid/exec` no longer exist. Open exec with **direct WebSocket** only: `WS /api/v3/microservices/exec/:uuid` (or `…/system/exec/:uuid`). -- **3 concurrent exec sessions** per microservice (was 1 user exec WS per MS). +- **5 concurrent exec sessions** per microservice (was 1 user exec WS per MS). - **Per-session lifecycle** — closing one exec session deletes only that session row only (no microservice-level exec flag). - **`execEnabled` removed** — dropped `microservices.exec_enabled` column and agent MS list field; exec attach is poll-driven only (`GET /agent/exec/sessions`). - **Agent exec discovery** — new `GET /api/v3/agent/exec/sessions` when change tracking reports `execSessions: true`. @@ -109,8 +109,8 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - **K8s control plane:** hub **`iofog-router`** ConfigMap patches serialized via DB lock; K8s Service create/update/delete with LoadBalancer watch timeout. - **`service-bridge-config.js`** — full recompute of service-derived TCP bridge config per fog on reconcile (preserves router base config). 
- **SQLite single-node production hardening** — WAL + `busy_timeout` pragmas, reconcile task claim retry on `SQLITE_BUSY`, staggered startup for reconcile-heavy background jobs (`settings.jobStartupDelaySeconds`). -- **WebSocket exec & log session hardening** — quotas (**3 exec** / 3 log WS per resource), per-session exec lifecycle, 60s/120s pending timeouts, 8h exec max, 30s graceful drain, OTEL metrics, HA AMQP fail-fast, integration tests, swagger WS protocol docs, operator guide (`docs/operations/ws-sessions.md`). -- **Multi exec sessions** — `GET /api/v3/agent/exec/sessions`; agent exec WS `…/agent/exec/microservice/:uuid/:sessionId`; user ACTIVATION with `sessionId`; `MicroserviceExecSessions` table; `execMaxConcurrentPerResource` config (default **3**). +- **WebSocket exec & log session hardening** — quotas (**5 exec** / 5 log WS per resource), per-session exec lifecycle, 60s/120s pending timeouts, 8h exec max, 30s graceful drain, OTEL metrics, HA AMQP fail-fast, integration tests, swagger WS protocol docs, operator guide (`docs/operations/ws-sessions.md`). +- **Multi exec sessions** — `GET /api/v3/agent/exec/sessions`; agent exec WS `…/agent/exec/microservice/:uuid/:sessionId`; user ACTIVATION with `sessionId`; `MicroserviceExecSessions` table; `execMaxConcurrentPerResource` config (default **5**). - **WebSocket relay production** — unified **`WsRelayTransport`** abstraction; cross-replica exec/log relay backend selected at startup by **`nats.enabled`** (`NATS_ENABLED`): **AMQP** router pool (8 connections per replica, overflow recovery, sendable gating) when `false`, **NATS Core** pub/sub on platform hub (`controller-relay` account) when `true`. Fail-fast activation on both transports; log backpressure drops `LOG_LINE` under pressure. Config: `server.webSocket.relay.amqp.*`, `server.webSocket.relay.nats.*`. No new relay env var; HA swagger/docs updated per R112. 
- **ransaction safety** — unified **`runInTransaction()`** write path for API, jobs, and WebSocket cleanup; **`fakeTransaction`** and **`bypassQueue`** removed; **`ReconcileOutbox`** transactional outbox with background drainer; SQLite priority write queue (`interactive` > `background`); mysql/postgres reconcile task claims use **`FOR UPDATE SKIP LOCKED`**; OTEL DB metrics and ops runbook (`docs/operations/database-transactions.md`). **Breaking: internal only** — no agent wire or public REST shape changes. - **— pre-close transaction audit** — fixes SQLite hangs from nested `generateTransaction` (`certificate-service` → `SecretService` tx propagation), JTI cleanup job queue bypass, OAuth interaction OIDC reads outside tx, external-mode user IdP HTTP outside tx, service platform LoadBalancer watch outside long tx; extended grep gates and unit tests. threads optional `transaction` through `cert.js` `loadCA` / `getCAFromK8sSecret` / `getCAFromInput` so fog platform reconcile no longer deadlocks on SQLite when signing site-server certs after router-site-ca. NATS hub ConfigMap cluster routes, StatefulSet rollout, and JWT bundle K8s patches moved outside DB transaction bodies in `nats-service.js` (phased reconcile + `afterCommit` deferral when called from `reconcileFog`). HashiCorp Vault HTTP for secret/configmap/registry create/update/delete deferred via `transaction.afterCommit` (`vault-transaction-helper.js`); DB rows use internal encryption during tx, vault store/delete after commit. splits `FogPlatformService.reconcileFog` into phased background transactions (`prepare` → `certPrep` → NATS self-tx → `platform` → `finalize`) mirroring service-platform reconcile — no single tx spans cert generation, NATS, and router reconcile end-to-end. 
AMQP router cert provisioning in one transaction; agent CA endpoint without pointless DB tx; removed unused services-service TCP bridge K8s-in-tx helpers (operator CRUD uses enqueue + service-platform reconcile only); OIDC provider adapter routed through write queue. @@ -128,7 +128,7 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - **WebSocket exec/log session cleanup race (postgres / NATS relay)** — concurrent teardown paths (pending timeout + disconnect, NATS CLOSE + CLOSE ack, relay callback + `ws.on('close')`) reused one Sequelize transaction via AsyncLocalStorage, causing `commit has been called on this transaction` on session row delete. Exec and log cleanup are deduplicated per `sessionId`, use fresh background transactions, pending timeouts only close sockets, and relay CLOSE acks no longer trigger DB teardown. - **WebSocket exec/log cross-replica pairing** — pending timeouts no longer require a local `session.agent`; user pods mark `remoteAgentPaired` via relay delivery hooks and DB fallback (`agentConnected`). Agent pods publish **ACTIVATION** (exec) and **LOG_LINE** user notifications via NATS/AMQP relay when the user is on another replica. Same-replica log “agent connected” notify uses **LOG_LINE** (not `LOG_START` + embedded message). **`ws_pending_pairings`** and **`ws_pairing_duration_ms`** metrics are recorded from user connect through pairing completion or timeout. Cross-replica paired sessions use **max/idle duration** (not pending timeout) in periodic cleanup; agent disconnect on an agent-only pod relays **CLOSE** (exec) or **LOG_LINE** (log) to the user pod and detaches local state without deleting the DB row. - **WebSocket cross-replica exec activation** — `setupExecMessageForwarding` read `shouldUseRelay` before `enableForSession`, so agent-only pods skipped relay **ACTIVATION** and user notify on first connect (log setup was already correct). 
Info logs added for log session user/agent disconnect, full cleanup, and local detach. -- **WebSocket exec/log orphan session cleanup (multi-replica HA)** — agent partial disconnect no longer triggers solely because relay is enabled; teardown uses DB `userConnected` (not stale `remoteUserPaired`). Full DB delete when both sides are gone. Concurrency limits and `GET /agent/logs/sessions` / `GET /agent/exec/sessions` count or list only `userConnected: true` rows. Reconcile job immediately removes rows with both flags false. Same-replica user disconnect still full-cleans when the agent socket is local. +- **WebSocket exec/log orphan session cleanup (multi-replica HA)** — agent partial disconnect no longer triggers solely because relay is enabled; teardown uses DB `userConnected` (not stale `remoteUserPaired`). Full DB delete when both sides are gone. Concurrency limits raised to **5** per resource; `GET /agent/logs/sessions` / `GET /agent/exec/sessions` count or list only `userConnected: true` rows. Reconcile job immediately removes rows with both flags false. Same-replica user disconnect still full-cleans when the agent socket is local. - **Volume mount manager transaction propagation** — `VolumeMountingManager.findOne` / `findAll` passed `transaction` as a second Sequelize argument instead of inside the options object, so NATS fog reconcile could create a volume mount in an open transaction then fail to link it (`nats-server-conf-* not found`). Reads now honor the parent transaction like `BaseManager`. - **Volume mount service transaction propagation** — `VolumeMountService.linkVolumeMountEndpoint` / `unlinkVolumeMountEndpoint` passed `transaction` as a second Sequelize argument to `getFogs` / `addVolumeMount` / `removeVolumeMount` instead of inside the options object, causing NATS fog reconcile to hang when linking volume mounts after auth bootstrap. 
- **Fog platform reconcile stale errors** — `reconcileFogPrepare` clears `lastError` when entering `Progressing` so prior `SQLITE_BUSY` does not mask current reconcile state. diff --git a/docs/architecture.md b/docs/architecture.md index 7e9ce240..ebcb78ed 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -206,7 +206,7 @@ Full spec: [`.cursor/controllerv3.8/docs/15-fog-platform-reconcile.md`](../.curs ## WebSocket exec & log sessions -Interactive **exec** and **log streaming** use paired WebSocket sessions between operators (Bearer JWT), Controller, and Edgelet agents (fog token). Plan 16 hardens log sessions and shared WS infra (HA, drain, OTEL). **Plan 17** redesigns **microservice exec** to log-style multi-session flow (3 concurrent per MS, agent poll + session-scoped WS). **Plan 18** production-hardens cross-replica relay via **`WsRelayTransport`** — AMQP pool + recovery when `nats.enabled=false`, NATS Core when `nats.enabled=true` (R102–R113). **Edgelet agent wire change required** for exec only (see [edgelet-invariants.md §10.1](../.cursor/controllerv3.8/docs/edgelet-invariants.md)). +Interactive **exec** and **log streaming** use paired WebSocket sessions between operators (Bearer JWT), Controller, and Edgelet agents (fog token). Plan 16 hardens log sessions and shared WS infra (HA, drain, OTEL). **Plan 17** redesigns **microservice exec** to log-style multi-session flow (5 concurrent per MS, agent poll + session-scoped WS). **Plan 18** production-hardens cross-replica relay via **`WsRelayTransport`** — AMQP pool + recovery when `nats.enabled=false`, NATS Core when `nats.enabled=true` (R102–R113). **Edgelet agent wire change required** for exec only (see [edgelet-invariants.md §10.1](../.cursor/controllerv3.8/docs/edgelet-invariants.md)). 
```mermaid flowchart TB @@ -283,14 +283,14 @@ sequenceDiagram | Topic | Normative value | |-------|-----------------| | MS exec entry | **Direct user WS** — no `POST …/microservices/…/exec` (R92, R94) | -| MS exec concurrency | **3** user exec WS per microservice (R93) | +| MS exec concurrency | **5** user exec WS per microservice | | MS exec lifecycle | **Per-session** — close deletes session row only; **no** `execEnabled=false` (R98) | | MS exec pending / max | **60s** pending for agent; **8h** max active session (Plan 16 carry-over) | | Agent exec discovery | `GET /agent/exec/sessions` on `execSessions` change flag (R95, R100) | | Agent exec WS | `/agent/exec/microservice/:uuid/:sessionId` only — legacy `/agent/exec/:uuid` removed (R96) | | User session notify | **ACTIVATION** (type 5) with `{ sessionId, microserviceUuid }` (R97) | | Fog debug provision | `POST/DELETE /iofog/:uuid/exec` unchanged; shell via `WS /microservices/system/exec/:debugMsUuid` (R99) | -| Log concurrency | **3** user log WS per microservice (or per fog for node logs) | +| Log concurrency | **5** user log WS per microservice (or per fog for node logs) | | Log limits | Tail max **5,000** lines; **120s** pending; **2h** idle | | Log content | Live relay only — no log line persistence; audit connect/disconnect | | HA relay | Cross-replica sessions **require** a **relay backend** (R112): **AMQP** router queues when `nats.enabled=false`; **NATS Core** subjects on hub when `nats.enabled=true`. Same-replica may use direct WS; **fail fast** close **1013** when active backend unavailable | diff --git a/docs/operations/ws-sessions.md b/docs/operations/ws-sessions.md index 34e268a1..d6f5fa3e 100644 --- a/docs/operations/ws-sessions.md +++ b/docs/operations/ws-sessions.md @@ -18,7 +18,7 @@ Controller exposes **interactive exec** and **log streaming** over WebSocket on | **User auth** | Bearer JWT via `Authorization` header or `?token=` query param (browser Console). 
RBAC: `execSessions`, `logs`, `systemExecSessions`, `systemLogs`. | | **Agent auth** | Fog token on `/api/v3/agent/exec/*` and `/api/v3/agent/logs/*` — OIDC does **not** apply to agent routes. | -> **Plan 17 (MS exec):** Open exec with **direct WebSocket** — `wss://…/api/v3/microservices/exec/:uuid` (app MS) or `…/system/exec/:uuid` (system MS). **No** `POST …/exec` before connect. Up to **3** concurrent exec sessions per microservice. Agent discovers sessions via `GET /api/v3/agent/exec/sessions` and connects `WS /api/v3/agent/exec/microservice/:uuid/:sessionId`. Fog node debug: `POST/DELETE /api/v3/iofog/:uuid/exec` provisions the debug system MS, then **`WS …/microservices/system/exec/:debugMsUuid`** (not the app exec path). Full spec: [17-multi-exec-sessions.md](../.cursor/controllerv3.8/docs/17-multi-exec-sessions.md). +> **Plan 17 (MS exec):** Open exec with **direct WebSocket** — `wss://…/api/v3/microservices/exec/:uuid` (app MS) or `…/system/exec/:uuid` (system MS). **No** `POST …/exec` before connect. Up to **5** concurrent exec sessions per microservice. Agent discovers sessions via `GET /api/v3/agent/exec/sessions` and connects `WS /api/v3/agent/exec/microservice/:uuid/:sessionId`. Fog node debug: `POST/DELETE /api/v3/iofog/:uuid/exec` provisions the debug system MS, then **`WS …/microservices/system/exec/:debugMsUuid`** (not the app exec path). Full spec: [17-multi-exec-sessions.md](../.cursor/controllerv3.8/docs/17-multi-exec-sessions.md). 
### Ingress log redaction (required) @@ -124,7 +124,7 @@ spec: |--------|--------| | Concurrent WS per replica | **500** (`WS_REPLICA_MAX_CONCURRENT_WS`) | | p99 exec pairing latency | **< 5s** | -| Exec sessions per microservice | **3** concurrent user WS (Plan 17) | +| Exec sessions per microservice | **5** concurrent user WS | Run the load probe locally: @@ -134,7 +134,7 @@ node test/load/ws-pairing-load.js --pairs 500 node test/load/ws-pairing-load.js --multi-ms 100 ``` -The `--multi-ms` mode creates **3 exec sessions per microservice** (100 MS × 3 = 300 pairs) to validate multi-session pairing latency under the same p99 SLO. +The `--multi-ms` mode creates **5 exec sessions per microservice** (100 MS × 5 = 500 pairs) to validate multi-session pairing latency under the same p99 SLO. **AMQP profile** (`nats.enabled=false`): run the probe above on a dev machine — it exercises in-process `ExecSessionManager` pairing only (no router required). Record p99 from stdout; target **< 5000 ms**. @@ -170,10 +170,10 @@ Enable `ENABLE_TELEMETRY=true`. Key metrics (`src/websocket/ws-metrics.js`): | Session | Limit | |---------|-------| -| Exec user WS per microservice | **3** (Plan 17 — direct WS; no POST/DELETE MS exec REST) | +| Exec user WS per microservice | **5** (direct WS; no POST/DELETE MS exec REST) | | Exec pending (user waits for agent) | **60s** | | Exec max duration | **8h** | -| Log user WS per microservice/fog | **3** | +| Log user WS per microservice/fog | **5** | | Log pending (user waits for agent) | **120s** | | Log idle | **2h** | | Log tail max lines | **5000** | diff --git a/docs/swagger.yaml b/docs/swagger.yaml index ea133080..213e90d6 100755 --- a/docs/swagger.yaml +++ b/docs/swagger.yaml @@ -2767,7 +2767,7 @@ paths: Pairing: user connects directly via WebSocket; Controller creates a per-session row in `MicroserviceExecSessions` and relays STDIN/STDOUT/STDERR by `sessionId`. - No prior REST enable step is required (Plan 17). 
Max **3** concurrent exec sessions + No prior REST enable step is required (Plan 17). Max **5** concurrent exec sessions per microservice (close code **1008** when quota exceeded). On connect, Controller sends **ACTIVATION** (type 5) with JSON @@ -2830,7 +2830,7 @@ paths: description: | Upgrades to binary MessagePack log stream (agent → user after pairing). - **RBAC:** `logs`. Max **3** concurrent user log WS per microservice. + **RBAC:** `logs`. Max **5** concurrent user log WS per microservice. Query params control tail behaviour (see parameters). Live relay only — no log persistence. operationId: userMicroserviceLogsWebSocket @@ -2922,7 +2922,7 @@ paths: tags: - WebSocketSessions summary: User fog (node) log streaming WebSocket - description: Node-level logs; **RBAC** `systemLogs`. Max **3** concurrent sessions per fog. + description: Node-level logs; **RBAC** `systemLogs`. Max **5** concurrent sessions per fog. operationId: userFogLogsWebSocket parameters: - in: path @@ -7160,7 +7160,7 @@ components: codes: - { code: 1000, reason: Normal closure, when: Session ended cleanly } - { code: 1001, reason: Server draining, when: SIGTERM / k8s preStop (R85) } - - { code: 1008, reason: Policy violation, when: RBAC deny, exec/log quota (3 per MS), invalid tail params, pending timeout, sessionId mismatch } + - { code: 1008, reason: Policy violation, when: RBAC deny, exec/log quota (5 per MS), invalid tail params, pending timeout, sessionId mismatch } - { code: 1013, reason: Router unavailable for cross-replica session, when: AMQP router down (R84) } EventRecord: type: object diff --git a/test/load/ws-pairing-load.js b/test/load/ws-pairing-load.js index c883fc00..d98e47e9 100644 --- a/test/load/ws-pairing-load.js +++ b/test/load/ws-pairing-load.js @@ -10,7 +10,7 @@ * node test/load/ws-pairing-load.js --pairs 500 * node test/load/ws-pairing-load.js --multi-ms 100 * - * --multi-ms N: create 3 exec sessions per microservice (Plan 17 quota) for N microservices. 
+ * --multi-ms N: create 5 exec sessions per microservice (concurrency quota) for N microservices. * * Exit 0 when p99 < 5000ms; exit 1 otherwise. */ @@ -29,7 +29,7 @@ function parseArg (name, fallback) { const PAIR_COUNT = parseInt(parseArg('pairs', '500'), 10) const MULTI_MS_COUNT = parseInt(parseArg('multi-ms', '0'), 10) -const SESSIONS_PER_MS = 3 +const SESSIONS_PER_MS = 5 const FAST_CONFIG = { session: { diff --git a/test/src/websocket/ws-exec-same-replica.test.js b/test/src/websocket/ws-exec-same-replica.test.js index aea148b5..fd8f2053 100644 --- a/test/src/websocket/ws-exec-same-replica.test.js +++ b/test/src/websocket/ws-exec-same-replica.test.js @@ -196,7 +196,7 @@ describe('WebSocket exec — same-replica integration (Plan 17)', () => { ) }) - it('allows three concurrent exec sessions on same microservice', async () => { + it('allows five concurrent exec sessions on same microservice', async () => { let dbSessionCount = 0 $sandbox.stub(wsServer, 'countExecSessionsInDb').callsFake(async () => dbSessionCount) MicroserviceExecSessionManager.create.restore() @@ -213,7 +213,7 @@ describe('WebSocket exec — same-replica integration (Plan 17)', () => { }) const userSockets = [] - for (let i = 0; i < 3; i++) { + for (let i = 0; i < 5; i++) { const ws = createMockWebSocket() userSockets.push(ws) const req = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) @@ -229,7 +229,7 @@ describe('WebSocket exec — same-replica integration (Plan 17)', () => { expect(ws.readyState).to.equal(WebSocket.OPEN) } - expect(wsServer.execSessionManager.countSessionsForResource($ids.microserviceUuid)).to.equal(3) + expect(wsServer.execSessionManager.countSessionsForResource($ids.microserviceUuid)).to.equal(5) const rejectedWs = createMockWebSocket() const rejectedReq = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) diff --git a/test/src/websocket/ws-lifecycle.test.js b/test/src/websocket/ws-lifecycle.test.js index 41cec233..59299310 100644 
--- a/test/src/websocket/ws-lifecycle.test.js +++ b/test/src/websocket/ws-lifecycle.test.js @@ -25,7 +25,7 @@ const FAST_CONFIG = { execMaxDurationMs: 200, logPendingTimeoutMs: 100, logIdleTimeoutMs: 500, - logMaxConcurrentPerResource: 3, + logMaxConcurrentPerResource: 5, logTailMaxLines: 5000, cleanupInterval: 50 } @@ -40,22 +40,22 @@ describe('WebSocket session lifecycle', () => { $sandbox.restore() }) - describe('exec 3-session quota', () => { + describe('exec 5-session quota', () => { let wsServer beforeEach(() => { resetWebSocketServerSingleton(WebSocketServerClass) wsServer = new WebSocketServerClass() - wsServer.sessionConfig = { ...wsServer.sessionConfig, execMaxConcurrentPerResource: 3 } + wsServer.sessionConfig = { ...wsServer.sessionConfig, execMaxConcurrentPerResource: 5 } }) afterEach(() => { resetWebSocketServerSingleton(WebSocketServerClass) }) - it('rejects fourth concurrent exec session for same microservice', async () => { + it('rejects sixth concurrent exec session for same microservice', async () => { $sandbox.stub(wsServer, 'validateUserConnection').resolves({ uuid: $ids.microserviceUuid }) - $sandbox.stub(wsServer, 'countExecSessionsInDb').resolves(3) + $sandbox.stub(wsServer, 'countExecSessionsInDb').resolves(5) const ws = createMockWebSocket() const req = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) @@ -73,22 +73,22 @@ describe('WebSocket session lifecycle', () => { }) }) - describe('log 3-viewer quota', () => { + describe('log 5-viewer quota', () => { let wsServer beforeEach(() => { resetWebSocketServerSingleton(WebSocketServerClass) wsServer = new WebSocketServerClass() - wsServer.sessionConfig = { ...wsServer.sessionConfig, logMaxConcurrentPerResource: 3 } + wsServer.sessionConfig = { ...wsServer.sessionConfig, logMaxConcurrentPerResource: 5 } }) afterEach(() => { resetWebSocketServerSingleton(WebSocketServerClass) }) - it('rejects fourth concurrent log session for same microservice', async () => { + 
it('rejects sixth concurrent log session for same microservice', async () => { $sandbox.stub(wsServer, 'validateUserLogsConnection').resolves({ success: true }) - $sandbox.stub(wsServer, 'countLogSessionsInDb').resolves(3) + $sandbox.stub(wsServer, 'countLogSessionsInDb').resolves(5) $sandbox.stub(wsServer, 'isValidISO8601').returns(true) const ws = createMockWebSocket() From b993c4ab75092f6304c9ddb8831d7489071fef2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 15:56:13 +0300 Subject: [PATCH 25/32] Send RFC 6455 Ping frames on exec and log WebSocket sessions. Keeps idle agent and browser legs alive through ingress and Edgelet read deadlines; documents WS_PING_INTERVAL and operator keepalive layers. --- CHANGELOG.md | 1 + docs/operations/ws-sessions.md | 15 +++ src/websocket/server.js | 49 ++++++++ test/src/websocket/ws-heartbeat.test.js | 146 ++++++++++++++++++++++++ 4 files changed, 211 insertions(+) create mode 100644 test/src/websocket/ws-heartbeat.test.js diff --git a/CHANGELOG.md b/CHANGELOG.md index e90bae7e..480ec2ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -129,6 +129,7 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - **WebSocket exec/log cross-replica pairing** — pending timeouts no longer require a local `session.agent`; user pods mark `remoteAgentPaired` via relay delivery hooks and DB fallback (`agentConnected`). Agent pods publish **ACTIVATION** (exec) and **LOG_LINE** user notifications via NATS/AMQP relay when the user is on another replica. Same-replica log “agent connected” notify uses **LOG_LINE** (not `LOG_START` + embedded message). **`ws_pending_pairings`** and **`ws_pairing_duration_ms`** metrics are recorded from user connect through pairing completion or timeout. 
Cross-replica paired sessions use **max/idle duration** (not pending timeout) in periodic cleanup; agent disconnect on an agent-only pod relays **CLOSE** (exec) or **LOG_LINE** (log) to the user pod and detaches local state without deleting the DB row. - **WebSocket cross-replica exec activation** — `setupExecMessageForwarding` read `shouldUseRelay` before `enableForSession`, so agent-only pods skipped relay **ACTIVATION** and user notify on first connect (log setup was already correct). Info logs added for log session user/agent disconnect, full cleanup, and local detach. - **WebSocket exec/log orphan session cleanup (multi-replica HA)** — agent partial disconnect no longer triggers solely because relay is enabled; teardown uses DB `userConnected` (not stale `remoteUserPaired`). Full DB delete when both sides are gone. Concurrency limits raised to **5** per resource; `GET /agent/logs/sessions` / `GET /agent/exec/sessions` count or list only `userConnected: true` rows. Reconcile job immediately removes rows with both flags false. Same-replica user disconnect still full-cleans when the agent socket is local. +- **WebSocket protocol heartbeat (exec + log)** — Controller sends RFC 6455 **Ping** frames on all four session sockets (user/agent, exec/log) every **`WS_PING_INTERVAL`** (default 30s). Keeps idle agent log/exec streams alive through Edgelet read deadlines and ingress; browsers auto-respond with Pong. Application **`CONTROL/keepalive`** on user exec unchanged (EdgeOps Console contract). No server-side pong-timeout terminate in v1. - **Volume mount manager transaction propagation** — `VolumeMountingManager.findOne` / `findAll` passed `transaction` as a second Sequelize argument instead of inside the options object, so NATS fog reconcile could create a volume mount in an open transaction then fail to link it (`nats-server-conf-* not found`). Reads now honor the parent transaction like `BaseManager`. 
- **Volume mount service transaction propagation** — `VolumeMountService.linkVolumeMountEndpoint` / `unlinkVolumeMountEndpoint` passed `transaction` as a second Sequelize argument to `getFogs` / `addVolumeMount` / `removeVolumeMount` instead of inside the options object, causing NATS fog reconcile to hang when linking volume mounts after auth bootstrap. - **Fog platform reconcile stale errors** — `reconcileFogPrepare` clears `lastError` when entering `Progressing` so prior `SQLITE_BUSY` does not mask current reconcile state. diff --git a/docs/operations/ws-sessions.md b/docs/operations/ws-sessions.md index d6f5fa3e..56ec6402 100644 --- a/docs/operations/ws-sessions.md +++ b/docs/operations/ws-sessions.md @@ -118,6 +118,21 @@ spec: --- +## Connection keepalive + +Controller uses **two layers** for long-lived exec/log WebSockets: + +| Layer | Mechanism | Peers | +|-------|-----------|-------| +| **WebSocket protocol** | Server sends RFC 6455 **Ping** frames every **`server.webSocket.pingInterval`** (default **30s**, env **`WS_PING_INTERVAL`**) | All four sockets: user + agent on exec and log paths (Edgelet, potctl, EdgeOps Console browser) | +| **Application (exec user only)** | MessagePack **`CONTROL`** with payload **`keepalive`** | EdgeOps Console exec terminal — Controller echoes **`keepalive`** to the user socket; browsers cannot send native WS ping frames | + +Log streaming does **not** use application-level keepalive (Console `LogViewer` is receive-only). Quiet `follow=true` log sessions rely on WS protocol ping to keep the agent and browser legs alive through ingress and Edgelet read deadlines. + +`server.webSocket.pongTimeout` is reserved for future server-side watchdog use; v1 does **not** terminate sessions on missed pongs (Console exec owns the 10s app-level watchdog). 
+ +--- + ## Scale SLO (R88) | Metric | Target | diff --git a/src/websocket/server.js b/src/websocket/server.js index 4dc3b83f..eab4ca3f 100644 --- a/src/websocket/server.js +++ b/src/websocket/server.js @@ -230,6 +230,47 @@ class WebSocketServer { this._handleExpiredLogSession(sessionId, session, transaction)) } + _startWebSocketHeartbeat (ws, { label, sessionId } = {}) { + if (!ws) { + return + } + this._stopWebSocketHeartbeat(ws) + + const intervalMs = Number(this.config.pingInterval) + if (!Number.isFinite(intervalMs) || intervalMs <= 0) { + return + } + + ws._heartbeatTimer = setInterval(() => { + if (ws.readyState === WebSocket.OPEN) { + try { + ws.ping() + } catch (error) { + logger.debug('[WS-HEARTBEAT] Failed to send ping frame', { + label: label || null, + sessionId: sessionId || null, + error: error.message + }) + } + } + }, intervalMs) + + if (!ws._heartbeatCloseRegistered) { + ws._heartbeatCloseRegistered = true + ws.on('close', () => { + this._stopWebSocketHeartbeat(ws) + }) + } + } + + _stopWebSocketHeartbeat (ws) { + if (!ws || ws._heartbeatTimer == null) { + return + } + clearInterval(ws._heartbeatTimer) + ws._heartbeatTimer = null + } + // MessagePack encoding/decoding helpers with improved error handling encodeMessage (message) { try { @@ -1851,6 +1892,7 @@ class WebSocketServer { execSession.metricsActive = true recordExecSessionActive(1) this._startExecPendingPairingMetrics(execSession) + this._startWebSocketHeartbeat(ws, { label: 'user-exec', sessionId }) const activationMsg = { type: MESSAGE_TYPES.ACTIVATION, @@ -2027,6 +2069,7 @@ class WebSocketServer { session.lastActivity = Date.now() session.activationSent = false } + this._startWebSocketHeartbeat(ws, { label: 'agent-exec', sessionId }) this._scheduleRelaySetupAfterCommit( 'setup exec message forwarding', @@ -2809,6 +2852,7 @@ class WebSocketServer { logSession.metricsActive = true recordLogSessionActive(1) this._startLogPendingPairingMetrics(logSession) + this._startWebSocketHeartbeat(ws, 
{ label: 'user-log', sessionId }) // 7. Send sessionId to user (MessagePack encoded) const sessionInfoMsg = { @@ -3041,6 +3085,7 @@ class WebSocketServer { session.agent = ws session.lastActivity = Date.now() } + this._startWebSocketHeartbeat(ws, { label: 'agent-log', sessionId }) // 5.5. Set up message handler IMMEDIATELY on the agent WebSocket // This ensures messages are captured even if they arrive before setupLogMessageForwarding completes @@ -3417,6 +3462,8 @@ class WebSocketServer { const session = this.logSessionManager.getLogSession(sessionId) if (session) { this._clearPendingPairingTimer(session) + this._stopWebSocketHeartbeat(session.user) + this._stopWebSocketHeartbeat(session.agent) } if (session && session.metricsActive) { recordLogSessionActive(-1) @@ -3699,6 +3746,8 @@ class WebSocketServer { const session = this.execSessionManager.getExecSession(sessionId) if (session) { this._clearPendingPairingTimer(session) + this._stopWebSocketHeartbeat(session.user) + this._stopWebSocketHeartbeat(session.agent) } if (session && session.metricsActive) { recordExecSessionActive(-1) diff --git a/test/src/websocket/ws-heartbeat.test.js b/test/src/websocket/ws-heartbeat.test.js new file mode 100644 index 00000000..d05a58eb --- /dev/null +++ b/test/src/websocket/ws-heartbeat.test.js @@ -0,0 +1,146 @@ +const { expect } = require('chai') +const sinon = require('sinon') +const WebSocket = require('ws') + +const WebSocketServerClass = require('../../../src/websocket/server') +const { createMockWebSocket, resetWebSocketServerSingleton } = require('../../support/ws-session-harness') + +describe('WebSocket protocol heartbeat', () => { + let sandbox + let wsServer + + beforeEach(() => { + sandbox = sinon.createSandbox() + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + wsServer.config.pingInterval = 1000 + }) + + afterEach(() => { + sandbox.restore() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + function 
createHeartbeatSocket () { + const ws = createMockWebSocket() + ws.ping = sandbox.spy() + return ws + } + + it('sends ws.ping on interval while socket is open', () => { + const clock = sandbox.useFakeTimers() + const ws = createHeartbeatSocket() + + wsServer._startWebSocketHeartbeat(ws, { label: 'user-exec', sessionId: 'sess-1' }) + + expect(ws.ping.called).to.equal(false) + clock.tick(1000) + expect(ws.ping.calledOnce).to.equal(true) + clock.tick(1000) + expect(ws.ping.calledTwice).to.equal(true) + + wsServer._stopWebSocketHeartbeat(ws) + clock.tick(5000) + expect(ws.ping.calledTwice).to.equal(true) + }) + + it('does not ping after socket closes', () => { + const clock = sandbox.useFakeTimers() + const ws = createHeartbeatSocket() + + wsServer._startWebSocketHeartbeat(ws, { label: 'agent-log', sessionId: 'sess-2' }) + clock.tick(1000) + expect(ws.ping.calledOnce).to.equal(true) + + ws.readyState = WebSocket.CLOSED + clock.tick(3000) + expect(ws.ping.calledOnce).to.equal(true) + }) + + it('stops heartbeat and clears timer on close event', () => { + const clock = sandbox.useFakeTimers() + const ws = createHeartbeatSocket() + + wsServer._startWebSocketHeartbeat(ws, { label: 'user-log', sessionId: 'sess-3' }) + clock.tick(1000) + expect(ws.ping.calledOnce).to.equal(true) + + ws.emit('close', 1000, 'normal') + expect(ws._heartbeatTimer).to.equal(null) + + clock.tick(5000) + expect(ws.ping.calledOnce).to.equal(true) + }) + + it('replaces existing timer when heartbeat is restarted', () => { + const clock = sandbox.useFakeTimers() + const ws = createHeartbeatSocket() + + wsServer._startWebSocketHeartbeat(ws, { label: 'agent-exec', sessionId: 'sess-4' }) + const firstTimer = ws._heartbeatTimer + wsServer._startWebSocketHeartbeat(ws, { label: 'agent-exec', sessionId: 'sess-4' }) + + expect(ws._heartbeatTimer).to.not.equal(firstTimer) + clock.tick(1000) + expect(ws.ping.calledOnce).to.equal(true) + }) + + it('skips heartbeat when pingInterval is disabled', () => { + const 
clock = sandbox.useFakeTimers() + const ws = createHeartbeatSocket() + wsServer.config.pingInterval = 0 + + wsServer._startWebSocketHeartbeat(ws, { label: 'user-exec', sessionId: 'sess-5' }) + + expect(ws._heartbeatTimer).to.equal(undefined) + clock.tick(5000) + expect(ws.ping.called).to.equal(false) + }) + + it('stops heartbeat for both peers during exec session cleanup', async () => { + const user = createHeartbeatSocket() + const agent = createHeartbeatSocket() + const sessionId = 'exec-cleanup' + const transaction = { fakeTransaction: true } + + wsServer.execSessionManager.createExecSession(sessionId, 'ms-uuid', agent, user, transaction) + wsServer._startWebSocketHeartbeat(user, { label: 'user-exec', sessionId }) + wsServer._startWebSocketHeartbeat(agent, { label: 'agent-exec', sessionId }) + + sandbox.stub(wsServer.execSessionManager, 'removeExecSession').resolves() + sandbox.stub(wsServer.relayTransport, 'cleanup').resolves() + sandbox.stub(wsServer, '_notifyExecRemotePeerClose').resolves() + + await wsServer.cleanupExecSession(sessionId, transaction) + + expect(user._heartbeatTimer).to.equal(null) + expect(agent._heartbeatTimer).to.equal(null) + }) + + it('stops heartbeat for both peers during log session cleanup', async () => { + const user = createHeartbeatSocket() + const agent = createHeartbeatSocket() + const sessionId = 'log-cleanup' + const transaction = { fakeTransaction: true } + + wsServer.logSessionManager.createLogSession( + sessionId, + 'ms-uuid', + null, + agent, + user, + { tail: 100, follow: true }, + transaction + ) + wsServer._startWebSocketHeartbeat(user, { label: 'user-log', sessionId }) + wsServer._startWebSocketHeartbeat(agent, { label: 'agent-log', sessionId }) + + sandbox.stub(wsServer.logSessionManager, 'removeLogSession').resolves() + sandbox.stub(wsServer.relayTransport, 'cleanupLogSession').resolves() + + await wsServer.cleanupLogSession(sessionId, transaction) + + expect(user._heartbeatTimer).to.equal(null) + 
expect(agent._heartbeatTimer).to.equal(null) + }) +}) From 5194ebbc7ceadc6d2b8ef545c2b6bf5e602425b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 15:56:16 +0300 Subject: [PATCH 26/32] Wait for exec session teardown in cross-replica split test. Replace fixed delay with waitUntil so user disconnect assertions run after in-memory session cleanup completes. --- test/src/websocket/ws-cross-replica-split.test.js | 5 ++++- test/support/ws-session-harness.js | 14 +++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/test/src/websocket/ws-cross-replica-split.test.js b/test/src/websocket/ws-cross-replica-split.test.js index 8409b6fc..542259ae 100644 --- a/test/src/websocket/ws-cross-replica-split.test.js +++ b/test/src/websocket/ws-cross-replica-split.test.js @@ -20,6 +20,7 @@ const { resetWebSocketServerSingleton, newTestIds, waitForSent, + waitUntil, delay } = require('../../support/ws-session-harness') const { resetTransportForTests } = require('../../../src/services/ws-relay-transport-factory') @@ -431,7 +432,9 @@ describe('WebSocket exec/log — split replica pairing', () => { MicroserviceExecSessionManager.deleteBySessionId.resetHistory() userWs.close() - await delay(50) + await waitUntil( + () => serverA.execSessionManager.getExecSession($ids.sessionId) === null + ) expect(MicroserviceExecSessionManager.deleteBySessionId).to.not.have.been.called expect(execRow.userConnected).to.equal(false) diff --git a/test/support/ws-session-harness.js b/test/support/ws-session-harness.js index adf209bc..f8f34b13 100644 --- a/test/support/ws-session-harness.js +++ b/test/support/ws-session-harness.js @@ -301,6 +301,17 @@ function waitForSent (ws, minCount = 1, timeoutMs = 2000) { }) } +async function waitUntil (predicate, timeoutMs = 2000, intervalMs = 10) { + const deadline = Date.now() + timeoutMs + while (Date.now() < deadline) { + if (predicate()) { + return + } + await delay(intervalMs) + } + throw new Error(`Timed 
out after ${timeoutMs}ms waiting for condition`) +} + module.exports = { MESSAGE_TYPES, WS_CLOSE_CODES, @@ -318,5 +329,6 @@ module.exports = { buildFakeJwt, newTestIds, lastSentBinary, - waitForSent + waitForSent, + waitUntil } From 78126f2847aa1dc842fbb54fe293f58dad3f1fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 15:56:20 +0300 Subject: [PATCH 27/32] Add read-only network topology API for router and NATS graphs. Expose summary, overview, node, connection, and subgraph endpoints with RBAC and OpenAPI coverage for EdgeOps Console topology views. --- docs/swagger.yaml | 745 +++++++++++++++++- src/config/rbac-resources.yaml | 47 ++ src/config/rbac-system-roles.js | 6 +- .../network-topology-controller.js | 69 ++ src/routes/network-topology.js | 84 ++ src/schemas/network-topology.js | 61 ++ src/services/network-topology-service.js | 682 ++++++++++++++++ .../network-topology-controller.test.js | 27 + .../services/network-topology-service.test.js | 293 +++++++ 9 files changed, 2010 insertions(+), 4 deletions(-) create mode 100644 src/controllers/network-topology-controller.js create mode 100644 src/routes/network-topology.js create mode 100644 src/schemas/network-topology.js create mode 100644 src/services/network-topology-service.js create mode 100644 test/src/controllers/network-topology-controller.test.js create mode 100644 test/src/services/network-topology-service.test.js diff --git a/docs/swagger.yaml b/docs/swagger.yaml index 213e90d6..ef1e24ea 100755 --- a/docs/swagger.yaml +++ b/docs/swagger.yaml @@ -6056,6 +6056,406 @@ paths: description: RoleBinding Not Found "500": description: Internal Server Error + /network-topology/summary: + get: + tags: + - NetworkTopology + summary: Network topology summary counts + operationId: getNetworkTopologySummary + security: + - authToken: [] + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: 
"#/components/schemas/NetworkTopologySummaryResponse" + "401": + description: Not Authorized + "500": + description: Internal Server Error + /network-topology/router/overview: + get: + tags: + - NetworkTopology + summary: Router topology overview + operationId: getRouterTopologyOverview + security: + - authToken: [] + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/RouterTopologyOverviewResponse" + "401": + description: Not Authorized + "500": + description: Internal Server Error + /network-topology/nats/overview: + get: + tags: + - NetworkTopology + summary: NATS topology overview + operationId: getNatsTopologyOverview + security: + - authToken: [] + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/NatsTopologyOverviewResponse" + "401": + description: Not Authorized + "500": + description: Internal Server Error + /network-topology/router/nodes: + get: + tags: + - NetworkTopology + summary: List router topology nodes + operationId: listRouterTopologyNodes + security: + - authToken: [] + parameters: + - $ref: "#/components/parameters/NetworkTopologyLimitParam" + - $ref: "#/components/parameters/NetworkTopologyOffsetParam" + - $ref: "#/components/parameters/NetworkTopologyRouterRoleParam" + - $ref: "#/components/parameters/NetworkTopologyDeploymentTargetParam" + - $ref: "#/components/parameters/NetworkTopologySearchParam" + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/RouterTopologyNodeListResponse" + "400": + description: Bad Request + "401": + description: Not Authorized + "500": + description: Internal Server Error + /network-topology/nats/nodes: + get: + tags: + - NetworkTopology + summary: List NATS topology nodes + operationId: listNatsTopologyNodes + security: + - authToken: [] + parameters: + - $ref: "#/components/parameters/NetworkTopologyLimitParam" + - $ref: 
"#/components/parameters/NetworkTopologyOffsetParam" + - $ref: "#/components/parameters/NetworkTopologyNatsRoleParam" + - $ref: "#/components/parameters/NetworkTopologyDeploymentTargetParam" + - $ref: "#/components/parameters/NetworkTopologySearchParam" + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/NatsTopologyNodeListResponse" + "400": + description: Bad Request + "401": + description: Not Authorized + "500": + description: Internal Server Error + /network-topology/router/nodes/{id}: + get: + tags: + - NetworkTopology + summary: Get a router topology node + operationId: getRouterTopologyNode + security: + - authToken: [] + parameters: + - in: path + name: id + required: true + schema: + type: string + description: Router node id (`default-router` or agent iofog UUID) + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/RouterTopologyNodeDetail" + "401": + description: Not Authorized + "404": + description: Not Found + "500": + description: Internal Server Error + /network-topology/nats/nodes/{id}: + get: + tags: + - NetworkTopology + summary: Get a NATS topology node + operationId: getNatsTopologyNode + security: + - authToken: [] + parameters: + - in: path + name: id + required: true + schema: + type: string + description: NATS node id (`default-nats-hub` or agent iofog UUID) + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/NatsTopologyNodeDetail" + "401": + description: Not Authorized + "404": + description: Not Found + "500": + description: Internal Server Error + /network-topology/router/nodes/{id}/connections: + get: + tags: + - NetworkTopology + summary: Get router node connections + operationId: getRouterTopologyNodeConnections + security: + - authToken: [] + parameters: + - in: path + name: id + required: true + schema: + type: string + responses: + "200": + 
description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/TopologyNodeConnectionsResponse" + "401": + description: Not Authorized + "404": + description: Not Found + "500": + description: Internal Server Error + /network-topology/nats/nodes/{id}/connections: + get: + tags: + - NetworkTopology + summary: Get NATS node connections + operationId: getNatsTopologyNodeConnections + security: + - authToken: [] + parameters: + - in: path + name: id + required: true + schema: + type: string + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/TopologyNodeConnectionsResponse" + "401": + description: Not Authorized + "404": + description: Not Found + "500": + description: Internal Server Error + /network-topology/router/connections: + get: + tags: + - NetworkTopology + summary: List router topology connections + operationId: listRouterTopologyConnections + security: + - authToken: [] + parameters: + - $ref: "#/components/parameters/NetworkTopologyLimitParam" + - $ref: "#/components/parameters/NetworkTopologyOffsetParam" + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/RouterTopologyConnectionListResponse" + "400": + description: Bad Request + "401": + description: Not Authorized + "500": + description: Internal Server Error + /network-topology/nats/connections: + get: + tags: + - NetworkTopology + summary: List NATS topology connections + operationId: listNatsTopologyConnections + security: + - authToken: [] + parameters: + - $ref: "#/components/parameters/NetworkTopologyLimitParam" + - $ref: "#/components/parameters/NetworkTopologyOffsetParam" + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/NatsTopologyConnectionListResponse" + "400": + description: Bad Request + "401": + description: Not Authorized + "500": + description: Internal Server Error + 
/network-topology/router/subgraph: + get: + tags: + - NetworkTopology + summary: Get router topology subgraph + operationId: getRouterTopologySubgraph + security: + - authToken: [] + parameters: + - in: query + name: center + required: true + schema: + type: string + - in: query + name: depth + schema: + type: integer + minimum: 1 + maximum: 2 + default: 1 + - $ref: "#/components/parameters/NetworkTopologyLimitParam" + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/TopologySubgraphResponse" + "400": + description: Bad Request + "401": + description: Not Authorized + "404": + description: Not Found + "500": + description: Internal Server Error + /network-topology/nats/subgraph: + get: + tags: + - NetworkTopology + summary: Get NATS topology subgraph + operationId: getNatsTopologySubgraph + security: + - authToken: [] + parameters: + - in: query + name: center + required: true + schema: + type: string + - in: query + name: depth + schema: + type: integer + minimum: 1 + maximum: 2 + default: 1 + - $ref: "#/components/parameters/NetworkTopologyLimitParam" + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/TopologySubgraphResponse" + "400": + description: Bad Request + "401": + description: Not Authorized + "404": + description: Not Found + "500": + description: Internal Server Error + /router: + get: + tags: + - Router + summary: Gets the default network router configuration + operationId: getDefaultRouter + security: + - authToken: [] + responses: + "200": + description: Success + headers: + X-Timestamp: + description: FogController server timestamp + schema: + type: number + content: + application/json: + schema: + $ref: "#/components/schemas/DefaultRouterResponse" + "401": + description: Not Authorized + "404": + description: Default router not found + "500": + description: Internal Server Error + put: + tags: + - Router + summary: 
Creates or updates the default network router + operationId: upsertDefaultRouter + security: + - authToken: [] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/DefaultRouterUpsertRequest" + responses: + "200": + description: Success + headers: + X-Timestamp: + description: FogController server timestamp + schema: + type: number + content: + application/json: + schema: + $ref: "#/components/schemas/DefaultRouterRecordResponse" + "400": + description: Bad Request + "401": + description: Not Authorized + "500": + description: Internal Server Error /serviceaccounts: get: tags: @@ -7056,6 +7456,10 @@ tags: description: Manage RBAC roles - name: RoleBindings description: Manage RBAC role bindings + - name: Router + description: Manage the default router configuration + - name: NetworkTopology + description: Read-only router and NATS network topology for visualization - name: ServiceAccounts description: Manage RBAC service accounts servers: @@ -7070,6 +7474,51 @@ components: type: http scheme: bearer description: Edgelet fog provisioning token (agent routes) + parameters: + NetworkTopologyLimitParam: + in: query + name: limit + required: false + schema: + type: integer + minimum: 1 + maximum: 500 + default: 100 + NetworkTopologyOffsetParam: + in: query + name: offset + required: false + schema: + type: integer + minimum: 0 + default: 0 + NetworkTopologyDeploymentTargetParam: + in: query + name: deploymentTarget + required: false + schema: + type: string + enum: [kubernetes, remote, edgelet] + NetworkTopologySearchParam: + in: query + name: search + required: false + schema: + type: string + NetworkTopologyRouterRoleParam: + in: query + name: role + required: false + schema: + type: string + enum: [default, edge, interior] + NetworkTopologyNatsRoleParam: + in: query + name: role + required: false + schema: + type: string + enum: [hub, leaf, server] responses: AuthRateLimitExceeded: description: Too many authentication 
requests from this IP address @@ -10419,4 +10868,298 @@ components: properties: host: type: string - additionalProperties: false \ No newline at end of file + additionalProperties: false + DefaultRouterResponse: + type: object + properties: + host: + type: string + messagingPort: + type: integer + edgeRouterPort: + type: integer + nullable: true + interRouterPort: + type: integer + nullable: true + required: + - host + - messagingPort + DefaultRouterUpsertRequest: + type: object + required: + - host + properties: + host: + type: string + messagingPort: + type: integer + minimum: 1 + maximum: 65535 + description: Defaults to 5671 when omitted + edgeRouterPort: + type: integer + minimum: 1 + maximum: 65535 + description: Defaults to 45671 when omitted + interRouterPort: + type: integer + minimum: 1 + maximum: 65535 + description: Defaults to 55671 when omitted + additionalProperties: true + DefaultRouterRecordResponse: + type: object + properties: + id: + type: integer + isEdge: + type: boolean + messagingPort: + type: integer + edgeRouterPort: + type: integer + nullable: true + interRouterPort: + type: integer + nullable: true + host: + type: string + isDefault: + type: boolean + iofogUuid: + type: string + nullable: true + createdAt: + type: string + format: date-time + updatedAt: + type: string + format: date-time + required: + - id + - isEdge + - messagingPort + - host + - isDefault + NetworkTopologyDeploymentTarget: + type: string + enum: [kubernetes, remote, edgelet] + NetworkTopologyConnection: + type: object + properties: + id: + type: integer + source: + type: string + dest: + type: string + required: [id, source, dest] + NetworkTopologyNodeBase: + type: object + properties: + id: + type: string + iofogUuid: + type: string + nullable: true + fogName: + type: string + nullable: true + host: + type: string + nullable: true + deploymentTarget: + $ref: "#/components/schemas/NetworkTopologyDeploymentTarget" + displayName: + type: string + role: + type: string + 
mode: + type: string + required: [id, deploymentTarget, displayName, role, mode] + RouterTopologyNode: + allOf: + - $ref: "#/components/schemas/NetworkTopologyNodeBase" + NatsTopologyNode: + allOf: + - $ref: "#/components/schemas/NetworkTopologyNodeBase" + RouterTopologyNodeDetail: + allOf: + - $ref: "#/components/schemas/RouterTopologyNode" + - type: object + properties: + messagingPort: + type: integer + edgeRouterPort: + type: integer + nullable: true + interRouterPort: + type: integer + nullable: true + isDefault: + type: boolean + NatsTopologyNodeDetail: + allOf: + - $ref: "#/components/schemas/NatsTopologyNode" + - type: object + properties: + serverPort: + type: integer + nullable: true + leafPort: + type: integer + nullable: true + clusterPort: + type: integer + nullable: true + mqttPort: + type: integer + nullable: true + httpPort: + type: integer + nullable: true + jsStorageSize: + type: string + nullable: true + jsMemoryStoreSize: + type: string + nullable: true + isHub: + type: boolean + NetworkTopologySummaryResponse: + type: object + properties: + controlPlane: + type: string + enum: [kubernetes, remote] + router: + type: object + properties: + totalNodes: + type: integer + totalConnections: + type: integer + byRole: + type: object + additionalProperties: + type: integer + nats: + type: object + properties: + totalNodes: + type: integer + totalConnections: + type: integer + byRole: + type: object + additionalProperties: + type: integer + TopologySpokeGroup: + type: object + properties: + upstreamOf: + type: string + role: + type: string + count: + type: integer + required: [upstreamOf, role, count] + RouterTopologyOverviewResponse: + type: object + properties: + defaultNode: + $ref: "#/components/schemas/RouterTopologyNode" + nullable: true + interiorNodes: + type: array + items: + $ref: "#/components/schemas/RouterTopologyNode" + spokeGroups: + type: array + items: + $ref: "#/components/schemas/TopologySpokeGroup" + NatsTopologyOverviewResponse: + 
type: object + properties: + defaultNode: + $ref: "#/components/schemas/NatsTopologyNode" + nullable: true + serverNodes: + type: array + items: + $ref: "#/components/schemas/NatsTopologyNode" + spokeGroups: + type: array + items: + $ref: "#/components/schemas/TopologySpokeGroup" + PaginatedListMeta: + type: object + properties: + total: + type: integer + limit: + type: integer + offset: + type: integer + required: [total, limit, offset] + RouterTopologyNodeListResponse: + allOf: + - $ref: "#/components/schemas/PaginatedListMeta" + - type: object + properties: + nodes: + type: array + items: + $ref: "#/components/schemas/RouterTopologyNode" + NatsTopologyNodeListResponse: + allOf: + - $ref: "#/components/schemas/PaginatedListMeta" + - type: object + properties: + nodes: + type: array + items: + $ref: "#/components/schemas/NatsTopologyNode" + RouterTopologyConnectionListResponse: + allOf: + - $ref: "#/components/schemas/PaginatedListMeta" + - type: object + properties: + connections: + type: array + items: + $ref: "#/components/schemas/NetworkTopologyConnection" + NatsTopologyConnectionListResponse: + allOf: + - $ref: "#/components/schemas/PaginatedListMeta" + - type: object + properties: + connections: + type: array + items: + $ref: "#/components/schemas/NetworkTopologyConnection" + TopologyNodeConnectionsResponse: + type: object + properties: + upstream: + type: array + items: + $ref: "#/components/schemas/NetworkTopologyConnection" + downstream: + type: array + items: + $ref: "#/components/schemas/NetworkTopologyConnection" + TopologySubgraphResponse: + type: object + properties: + nodes: + type: array + items: + oneOf: + - $ref: "#/components/schemas/RouterTopologyNode" + - $ref: "#/components/schemas/NatsTopologyNode" + connections: + type: array + items: + $ref: "#/components/schemas/NetworkTopologyConnection" \ No newline at end of file diff --git a/src/config/rbac-resources.yaml b/src/config/rbac-resources.yaml index 6c98ab84..515b1cad 100644 --- 
a/src/config/rbac-resources.yaml +++ b/src/config/rbac-resources.yaml @@ -248,6 +248,53 @@ resources: GET: [get] PUT: [update] + networkTopology: + basePath: /api/v3/network-topology + routes: + - path: /api/v3/network-topology/summary + methods: + GET: [list] + - path: /api/v3/network-topology/router/overview + methods: + GET: [list] + - path: /api/v3/network-topology/nats/overview + methods: + GET: [list] + - path: /api/v3/network-topology/router/nodes + methods: + GET: [list] + - path: /api/v3/network-topology/nats/nodes + methods: + GET: [list] + - path: /api/v3/network-topology/router/nodes/:id + methods: + GET: [get] + resourceNameParam: id + - path: /api/v3/network-topology/nats/nodes/:id + methods: + GET: [get] + resourceNameParam: id + - path: /api/v3/network-topology/router/nodes/:id/connections + methods: + GET: [get] + resourceNameParam: id + - path: /api/v3/network-topology/nats/nodes/:id/connections + methods: + GET: [get] + resourceNameParam: id + - path: /api/v3/network-topology/router/connections + methods: + GET: [list] + - path: /api/v3/network-topology/nats/connections + methods: + GET: [list] + - path: /api/v3/network-topology/router/subgraph + methods: + GET: [list] + - path: /api/v3/network-topology/nats/subgraph + methods: + GET: [list] + # NATS (granular resources) natsOperator: basePath: /api/v3/nats diff --git a/src/config/rbac-system-roles.js b/src/config/rbac-system-roles.js index e9051fea..3dbe4ee1 100644 --- a/src/config/rbac-system-roles.js +++ b/src/config/rbac-system-roles.js @@ -40,7 +40,7 @@ module.exports = { rules: [ { apiGroups: [''], - resources: ['microservices', 'systemMicroservices', 'fogs', 'applications', 'systemApplications', 'applicationTemplates', 'services', 'router', 'natsAccounts', 'natsUsers', 'natsAccountRules', 'natsUserRules', 'catalog', 'registries', 'secrets', 'configMaps', 'volumeMounts', 'tunnels', 'certificates', 'capabilities', 'cluster', 'serviceAccounts', 'events', 'users', 'authUsers', 'authGroups', 
'config', 'controller', 'execSessions', 'systemExecSessions', 'logs', 'systemLogs'], + resources: ['microservices', 'systemMicroservices', 'fogs', 'applications', 'systemApplications', 'applicationTemplates', 'services', 'router', 'networkTopology', 'natsAccounts', 'natsUsers', 'natsAccountRules', 'natsUserRules', 'catalog', 'registries', 'secrets', 'configMaps', 'volumeMounts', 'tunnels', 'certificates', 'capabilities', 'cluster', 'serviceAccounts', 'events', 'users', 'authUsers', 'authGroups', 'config', 'controller', 'execSessions', 'systemExecSessions', 'logs', 'systemLogs'], verbs: ['*'] }, { @@ -67,7 +67,7 @@ module.exports = { }, { apiGroups: [''], - resources: ['fogs', 'router', 'tunnels', 'users', 'authUsers', 'authGroups', 'config', 'roles', 'roleBindings', 'systemMicroservices', 'systemApplications', 'systemExecSessions', 'systemLogs', 'cluster', 'natsOperator', 'natsBootstrap', 'natsHub'], + resources: ['fogs', 'router', 'networkTopology', 'tunnels', 'users', 'authUsers', 'authGroups', 'config', 'roles', 'roleBindings', 'systemMicroservices', 'systemApplications', 'systemExecSessions', 'systemLogs', 'cluster', 'natsOperator', 'natsBootstrap', 'natsHub'], verbs: ['get', 'list'] } ] @@ -84,7 +84,7 @@ module.exports = { rules: [ { apiGroups: [''], - resources: ['microservices', 'fogs', 'applications', 'systemMicroservices', 'systemApplications', 'applicationTemplates', 'services', 'router', 'natsOperator', 'natsBootstrap', 'natsHub', 'natsAccounts', 'natsUsers', 'natsAccountRules', 'natsUserRules', 'catalog', 'registries', 'secrets', 'configMaps', 'volumeMounts', 'certificates', 'capabilities', 'cluster', 'serviceAccounts', 'users', 'authUsers', 'authGroups', 'config', 'controller', 'roles', 'roleBindings'], + resources: ['microservices', 'fogs', 'applications', 'systemMicroservices', 'systemApplications', 'applicationTemplates', 'services', 'router', 'networkTopology', 'natsOperator', 'natsBootstrap', 'natsHub', 'natsAccounts', 'natsUsers', 
'natsAccountRules', 'natsUserRules', 'catalog', 'registries', 'secrets', 'configMaps', 'volumeMounts', 'certificates', 'capabilities', 'cluster', 'serviceAccounts', 'users', 'authUsers', 'authGroups', 'config', 'controller', 'roles', 'roleBindings'], verbs: ['get', 'list'] } ] diff --git a/src/controllers/network-topology-controller.js b/src/controllers/network-topology-controller.js new file mode 100644 index 00000000..7feb5444 --- /dev/null +++ b/src/controllers/network-topology-controller.js @@ -0,0 +1,69 @@ +const NetworkTopologyService = require('../services/network-topology-service') + +const getSummaryEndPoint = async function (req) { + return NetworkTopologyService.getSummary(req) +} + +const getRouterOverviewEndPoint = async function (req) { + return NetworkTopologyService.getRouterOverview(req) +} + +const getNatsOverviewEndPoint = async function (req) { + return NetworkTopologyService.getNatsOverview(req) +} + +const listRouterNodesEndPoint = async function (req) { + return NetworkTopologyService.listRouterNodes(req) +} + +const listNatsNodesEndPoint = async function (req) { + return NetworkTopologyService.listNatsNodes(req) +} + +const getRouterNodeEndPoint = async function (req) { + return NetworkTopologyService.getRouterNode(req) +} + +const getNatsNodeEndPoint = async function (req) { + return NetworkTopologyService.getNatsNode(req) +} + +const getRouterNodeConnectionsEndPoint = async function (req) { + return NetworkTopologyService.getRouterNodeConnections(req) +} + +const getNatsNodeConnectionsEndPoint = async function (req) { + return NetworkTopologyService.getNatsNodeConnections(req) +} + +const listRouterConnectionsEndPoint = async function (req) { + return NetworkTopologyService.listRouterConnections(req) +} + +const listNatsConnectionsEndPoint = async function (req) { + return NetworkTopologyService.listNatsConnections(req) +} + +const getRouterSubgraphEndPoint = async function (req) { + return NetworkTopologyService.getRouterSubgraph(req) +} 
+ +const getNatsSubgraphEndPoint = async function (req) { + return NetworkTopologyService.getNatsSubgraph(req) +} + +module.exports = { + getSummaryEndPoint, + getRouterOverviewEndPoint, + getNatsOverviewEndPoint, + listRouterNodesEndPoint, + listNatsNodesEndPoint, + getRouterNodeEndPoint, + getNatsNodeEndPoint, + getRouterNodeConnectionsEndPoint, + getNatsNodeConnectionsEndPoint, + listRouterConnectionsEndPoint, + listNatsConnectionsEndPoint, + getRouterSubgraphEndPoint, + getNatsSubgraphEndPoint +} diff --git a/src/routes/network-topology.js b/src/routes/network-topology.js new file mode 100644 index 00000000..e595fe5d --- /dev/null +++ b/src/routes/network-topology.js @@ -0,0 +1,84 @@ +const constants = require('../helpers/constants') +const NetworkTopologyController = require('../controllers/network-topology-controller') +const ResponseDecorator = require('../decorators/response-decorator') +const logger = require('../logger') +const Errors = require('../helpers/errors') +const rbacMiddleware = require('../lib/rbac/middleware') + +const defaultErrorCodes = [ + { + code: constants.HTTP_CODE_UNAUTHORIZED, + errors: [Errors.AuthenticationError] + }, + { + code: constants.HTTP_CODE_BAD_REQUEST, + errors: [Errors.ValidationError] + } +] + +const readWithNotFoundErrorCodes = [ + ...defaultErrorCodes, + { + code: constants.HTTP_CODE_NOT_FOUND, + errors: [Errors.NotFoundError] + } +] + +function createGetRoute (path, handler, errorCodes = defaultErrorCodes) { + return { + method: 'get', + path, + middleware: async (req, res) => { + logger.apiReq(req) + + await rbacMiddleware.protect()(req, res, async () => { + const endpoint = ResponseDecorator.handleErrors( + handler, + constants.HTTP_CODE_SUCCESS, + errorCodes + ) + const responseObject = await endpoint(req) + const user = req.kauth && req.kauth.grant && req.kauth.grant.access_token + ? 
req.kauth.grant.access_token.content.preferred_username + : 'system' + res + .status(responseObject.code) + .send(responseObject.body) + + logger.apiRes({ req, user, res, responseObject }) + }) + } + } +} + +module.exports = [ + createGetRoute('/api/v3/network-topology/summary', NetworkTopologyController.getSummaryEndPoint), + createGetRoute('/api/v3/network-topology/router/overview', NetworkTopologyController.getRouterOverviewEndPoint), + createGetRoute('/api/v3/network-topology/nats/overview', NetworkTopologyController.getNatsOverviewEndPoint), + createGetRoute('/api/v3/network-topology/router/nodes', NetworkTopologyController.listRouterNodesEndPoint), + createGetRoute('/api/v3/network-topology/nats/nodes', NetworkTopologyController.listNatsNodesEndPoint), + createGetRoute( + '/api/v3/network-topology/router/nodes/:id/connections', + NetworkTopologyController.getRouterNodeConnectionsEndPoint, + readWithNotFoundErrorCodes + ), + createGetRoute( + '/api/v3/network-topology/nats/nodes/:id/connections', + NetworkTopologyController.getNatsNodeConnectionsEndPoint, + readWithNotFoundErrorCodes + ), + createGetRoute( + '/api/v3/network-topology/router/nodes/:id', + NetworkTopologyController.getRouterNodeEndPoint, + readWithNotFoundErrorCodes + ), + createGetRoute( + '/api/v3/network-topology/nats/nodes/:id', + NetworkTopologyController.getNatsNodeEndPoint, + readWithNotFoundErrorCodes + ), + createGetRoute('/api/v3/network-topology/router/connections', NetworkTopologyController.listRouterConnectionsEndPoint), + createGetRoute('/api/v3/network-topology/nats/connections', NetworkTopologyController.listNatsConnectionsEndPoint), + createGetRoute('/api/v3/network-topology/router/subgraph', NetworkTopologyController.getRouterSubgraphEndPoint), + createGetRoute('/api/v3/network-topology/nats/subgraph', NetworkTopologyController.getNatsSubgraphEndPoint) +] diff --git a/src/schemas/network-topology.js b/src/schemas/network-topology.js new file mode 100644 index 00000000..346383ac 
--- /dev/null +++ b/src/schemas/network-topology.js @@ -0,0 +1,61 @@ +const networkTopologyListQuery = { + id: '/networkTopologyListQuery', + type: 'object', + properties: { + limit: { + anyOf: [ + { type: 'number', minimum: 1 }, + { type: 'string', pattern: '^\\d+$' } + ] + }, + offset: { + anyOf: [ + { type: 'number', minimum: 0 }, + { type: 'string', pattern: '^\\d+$' } + ] + }, + role: { + type: 'string', + enum: ['default', 'edge', 'interior', 'hub', 'leaf', 'server'] + }, + deploymentTarget: { + type: 'string', + enum: ['kubernetes', 'remote', 'edgelet'] + }, + search: { type: 'string', minLength: 1 } + }, + additionalProperties: false +} + +const networkTopologySubgraphQuery = { + id: '/networkTopologySubgraphQuery', + type: 'object', + properties: { + center: { type: 'string', minLength: 1 }, + depth: { + anyOf: [ + { type: 'number', minimum: 1, maximum: 2 }, + { type: 'string', pattern: '^[12]$' } + ] + }, + limit: { + anyOf: [ + { type: 'number', minimum: 1 }, + { type: 'string', pattern: '^\\d+$' } + ] + }, + offset: { + anyOf: [ + { type: 'number', minimum: 0 }, + { type: 'string', pattern: '^\\d+$' } + ] + } + }, + required: ['center'], + additionalProperties: false +} + +module.exports = { + mainSchemas: [networkTopologyListQuery, networkTopologySubgraphQuery], + innerSchemas: [] +} diff --git a/src/services/network-topology-service.js b/src/services/network-topology-service.js new file mode 100644 index 00000000..7a4d8968 --- /dev/null +++ b/src/services/network-topology-service.js @@ -0,0 +1,682 @@ +const config = require('../config') +const Constants = require('../helpers/constants') +const Errors = require('../helpers/errors') +const RouterManager = require('../data/managers/router-manager') +const RouterConnectionManager = require('../data/managers/router-connection-manager') +const NatsInstanceManager = require('../data/managers/nats-instance-manager') +const NatsConnectionManager = require('../data/managers/nats-connection-manager') +const 
FogManager = require('../data/managers/iofog-manager') +const TransactionDecorator = require('../decorators/transaction-decorator') +const Validator = require('../schemas') +const { Op } = require('sequelize') + +function _routerModel () { + return RouterManager.getEntity() +} + +function _routerConnectionModel () { + return RouterConnectionManager.getEntity() +} + +function _natsInstanceModel () { + return NatsInstanceManager.getEntity() +} + +function _natsConnectionModel () { + return NatsConnectionManager.getEntity() +} + +const DEFAULT_LIMIT = 100 +const MAX_LIMIT = 500 +const DEFAULT_SUBGRAPH_DEPTH = 1 +const MAX_SUBGRAPH_DEPTH = 2 +const MAX_SUBGRAPH_LIMIT = 200 + +function _isKubernetesControlPlane () { + const controlPlane = process.env.CONTROL_PLANE || config.get('app.ControlPlane') + return controlPlane && String(controlPlane).toLowerCase() === 'kubernetes' +} + +function _getControlPlane () { + return _isKubernetesControlPlane() ? 'kubernetes' : 'remote' +} + +function _parseLimitOffset (query) { + let limit = DEFAULT_LIMIT + if (query.limit !== undefined && query.limit !== null && query.limit !== '') { + const parsedLimit = parseInt(query.limit, 10) + if (!isNaN(parsedLimit) && parsedLimit > 0) { + limit = Math.min(parsedLimit, MAX_LIMIT) + } + } + + let offset = 0 + if (query.offset !== undefined && query.offset !== null && query.offset !== '') { + const parsedOffset = parseInt(query.offset, 10) + if (!isNaN(parsedOffset) && parsedOffset >= 0) { + offset = parsedOffset + } + } + + return { limit, offset } +} + +async function _validateListQuery (query) { + await Validator.validate(query || {}, Validator.schemas.networkTopologyListQuery) + return _parseLimitOffset(query || {}) +} + +async function _validateSubgraphQuery (query) { + await Validator.validate(query || {}, Validator.schemas.networkTopologySubgraphQuery) + const { limit } = _parseLimitOffset(query || {}) + let depth = DEFAULT_SUBGRAPH_DEPTH + if (query.depth !== undefined && query.depth !== 
null && query.depth !== '') { + const parsedDepth = parseInt(query.depth, 10) + if (!isNaN(parsedDepth) && parsedDepth > 0) { + depth = Math.min(parsedDepth, MAX_SUBGRAPH_DEPTH) + } + } + const nodeLimit = Math.min(limit, MAX_SUBGRAPH_LIMIT) + return { + center: query.center, + depth, + limit: nodeLimit + } +} + +function _getRouterNodeId (router, defaultRouter) { + return (defaultRouter && router.id === defaultRouter.id) + ? Constants.DEFAULT_ROUTER_NAME + : router.iofogUuid +} + +function _getNatsNodeId (nats, defaultHub) { + return (defaultHub && nats.id === defaultHub.id) + ? Constants.DEFAULT_NATS_HUB_NAME + : nats.iofogUuid +} + +function _routerDeploymentTarget (router, defaultRouter) { + if (defaultRouter && router.id === defaultRouter.id) { + return _isKubernetesControlPlane() ? 'kubernetes' : 'remote' + } + return 'edgelet' +} + +function _natsDeploymentTarget (nats, defaultHub) { + if (defaultHub && nats.id === defaultHub.id) { + return _isKubernetesControlPlane() ? 'kubernetes' : 'remote' + } + return 'edgelet' +} + +function _routerRole (router, defaultRouter) { + if (defaultRouter && router.id === defaultRouter.id) { + return 'default' + } + return router.isEdge ? 'edge' : 'interior' +} + +function _natsRole (nats, defaultHub) { + if (defaultHub && nats.id === defaultHub.id) { + return 'hub' + } + return nats.isLeaf ? 'leaf' : 'server' +} + +function _routerMode (router) { + return router.isEdge ? 'edge' : 'interior' +} + +function _natsMode (nats) { + return nats.isLeaf ? 'leaf' : 'server' +} + +function _routerDisplayName (router, defaultRouter, fog) { + const deploymentTarget = _routerDeploymentTarget(router, defaultRouter) + if (deploymentTarget === 'kubernetes') { + return 'Kubernetes Router' + } + if (deploymentTarget === 'remote') { + return 'Default Router' + } + return fog ? 
fog.name : router.iofogUuid +} + +function _natsDisplayName (nats, defaultHub, fog) { + const deploymentTarget = _natsDeploymentTarget(nats, defaultHub) + if (deploymentTarget === 'kubernetes') { + return 'Kubernetes NATS Hub' + } + if (deploymentTarget === 'remote') { + return 'Default NATS Hub' + } + return fog ? fog.name : nats.iofogUuid +} + +async function _loadFogMap (iofogUuids, transaction) { + const uuids = [...new Set((iofogUuids || []).filter(Boolean))] + if (!uuids.length) { + return new Map() + } + + const fogs = await FogManager.findAll({ uuid: { [Op.in]: uuids } }, transaction) + + return new Map(fogs.map((fog) => [fog.uuid, fog])) +} + +function _formatRouterListNode (router, defaultRouter, fogMap) { + const fog = router.iofogUuid ? fogMap.get(router.iofogUuid) : null + return { + id: _getRouterNodeId(router, defaultRouter), + iofogUuid: router.iofogUuid, + fogName: fog ? fog.name : null, + host: router.host || (fog ? fog.host : null) || null, + deploymentTarget: _routerDeploymentTarget(router, defaultRouter), + displayName: _routerDisplayName(router, defaultRouter, fog), + role: _routerRole(router, defaultRouter), + mode: _routerMode(router) + } +} + +function _formatRouterDetailNode (router, defaultRouter, fogMap) { + return { + ..._formatRouterListNode(router, defaultRouter, fogMap), + messagingPort: router.messagingPort, + edgeRouterPort: router.edgeRouterPort, + interRouterPort: router.interRouterPort, + isDefault: !!(defaultRouter && router.id === defaultRouter.id) + } +} + +function _formatNatsListNode (nats, defaultHub, fogMap) { + const fog = nats.iofogUuid ? fogMap.get(nats.iofogUuid) : null + return { + id: _getNatsNodeId(nats, defaultHub), + iofogUuid: nats.iofogUuid, + fogName: fog ? fog.name : null, + host: nats.host || (fog ? 
fog.host : null) || null, + deploymentTarget: _natsDeploymentTarget(nats, defaultHub), + displayName: _natsDisplayName(nats, defaultHub, fog), + role: _natsRole(nats, defaultHub), + mode: _natsMode(nats) + } +} + +function _formatNatsDetailNode (nats, defaultHub, fogMap) { + return { + ..._formatNatsListNode(nats, defaultHub, fogMap), + serverPort: nats.serverPort, + leafPort: nats.leafPort, + clusterPort: nats.clusterPort, + mqttPort: nats.mqttPort, + httpPort: nats.httpPort, + jsStorageSize: nats.jsStorageSize, + jsMemoryStoreSize: nats.jsMemoryStoreSize, + isHub: !!(defaultHub && nats.id === defaultHub.id) + } +} + +function _formatRouterConnection (connection, defaultRouter) { + return { + id: connection.id, + source: _getRouterNodeId(connection.source, defaultRouter), + dest: _getRouterNodeId(connection.dest, defaultRouter) + } +} + +function _formatNatsConnection (connection, defaultHub) { + return { + id: connection.id, + source: _getNatsNodeId(connection.source, defaultHub), + dest: _getNatsNodeId(connection.dest, defaultHub) + } +} + +async function _buildRouterWhere (query, transaction) { + const where = {} + + if (query.role === 'default') { + where.isDefault = true + } else if (query.role === 'edge') { + where.isEdge = true + where.isDefault = false + } else if (query.role === 'interior') { + where.isEdge = false + where.isDefault = false + } + + if (query.deploymentTarget === 'edgelet') { + where.iofogUuid = { [Op.ne]: null } + } else if (query.deploymentTarget === 'kubernetes') { + if (!_isKubernetesControlPlane()) { + return { where: { id: -1 }, empty: true } + } + where.isDefault = true + } else if (query.deploymentTarget === 'remote') { + if (_isKubernetesControlPlane()) { + return { where: { id: -1 }, empty: true } + } + where.isDefault = true + } + + if (query.search) { + const matchingFogs = await FogManager.findAll({ name: { [Op.like]: `${query.search}%` } }, transaction) + const uuids = matchingFogs.map((fog) => fog.uuid) + if 
(!_applyFogUuidSearchFilter(where, uuids)) { + return { where: { id: -1 }, empty: true } + } + } + + return { where, empty: false } +} + +function _applyFogUuidSearchFilter (where, uuids) { + if (!uuids.length) { + return null + } + where.iofogUuid = where.iofogUuid + ? { [Op.and]: [where.iofogUuid, { [Op.in]: uuids }] } + : { [Op.in]: uuids } + return where +} + +async function _buildNatsWhere (query, transaction) { + const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction) + const where = {} + + if (query.role === 'hub') { + where.isHub = true + } else if (query.role === 'leaf') { + where.isLeaf = true + where.isHub = false + } else if (query.role === 'server') { + where.isLeaf = false + where.isHub = false + } + + if (query.deploymentTarget === 'edgelet') { + where.iofogUuid = { [Op.ne]: null } + } else if (query.deploymentTarget === 'kubernetes') { + if (!_isKubernetesControlPlane() || !defaultHub) { + return { where: { id: -1 }, empty: true } + } + where.id = defaultHub.id + } else if (query.deploymentTarget === 'remote') { + if (_isKubernetesControlPlane() || !defaultHub) { + return { where: { id: -1 }, empty: true } + } + where.id = defaultHub.id + } + + if (query.search) { + const matchingFogs = await FogManager.findAll({ name: { [Op.like]: `${query.search}%` } }, transaction) + const uuids = matchingFogs.map((fog) => fog.uuid) + if (!_applyFogUuidSearchFilter(where, uuids)) { + return { where: { id: -1 }, empty: true } + } + } + + return { where, empty: false } +} + +async function _findRouterByNodeId (nodeId, transaction) { + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + if (nodeId === Constants.DEFAULT_ROUTER_NAME) { + if (!defaultRouter) { + throw new Errors.NotFoundError(`Router node '${nodeId}' not found`) + } + return { router: defaultRouter, defaultRouter } + } + + const router = await RouterManager.findOne({ iofogUuid: nodeId }, transaction) + if (!router) { + throw new 
Errors.NotFoundError(`Router node '${nodeId}' not found`) + } + return { router, defaultRouter } +} + +async function _findNatsByNodeId (nodeId, transaction) { + const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction) + if (nodeId === Constants.DEFAULT_NATS_HUB_NAME) { + if (!defaultHub) { + throw new Errors.NotFoundError(`NATS node '${nodeId}' not found`) + } + return { nats: defaultHub, defaultHub } + } + + const nats = await NatsInstanceManager.findOne({ iofogUuid: nodeId }, transaction) + if (!nats) { + throw new Errors.NotFoundError(`NATS node '${nodeId}' not found`) + } + return { nats, defaultHub } +} + +async function _countRouterRoles (transaction) { + const [defaultCount, edgeCount, interiorCount] = await Promise.all([ + _routerModel().count({ where: { isDefault: true }, transaction }), + _routerModel().count({ where: { isEdge: true, isDefault: false }, transaction }), + _routerModel().count({ where: { isEdge: false, isDefault: false }, transaction }) + ]) + return { default: defaultCount, edge: edgeCount, interior: interiorCount } +} + +async function _countNatsRoles (transaction) { + const [hubCount, leafCount, serverCount] = await Promise.all([ + _natsInstanceModel().count({ where: { isHub: true }, transaction }), + _natsInstanceModel().count({ where: { isLeaf: true, isHub: false }, transaction }), + _natsInstanceModel().count({ where: { isLeaf: false, isHub: false }, transaction }) + ]) + return { hub: hubCount, leaf: leafCount, server: serverCount } +} + +function _buildSpokeGroups (connections, getNodeId, getRole, defaultNodeId) { + const groups = new Map() + for (const connection of connections || []) { + const upstreamOf = getNodeId(connection.dest) + if (upstreamOf !== defaultNodeId) { + continue + } + const role = getRole(connection.source) + const key = `${upstreamOf}:${role}` + groups.set(key, (groups.get(key) || 0) + 1) + } + + return [...groups.entries()].map(([key, count]) => { + const [upstreamOf, role] = 
key.split(':') + return { upstreamOf, role, count } + }) +} + +async function getSummary (_req, transaction) { + const [routerTotalNodes, routerTotalConnections, natsTotalNodes, natsTotalConnections, routerByRole, natsByRole] = await Promise.all([ + _routerModel().count({ transaction }), + _routerConnectionModel().count({ transaction }), + _natsInstanceModel().count({ transaction }), + _natsConnectionModel().count({ transaction }), + _countRouterRoles(transaction), + _countNatsRoles(transaction) + ]) + + return { + controlPlane: _getControlPlane(), + router: { + totalNodes: routerTotalNodes, + totalConnections: routerTotalConnections, + byRole: routerByRole + }, + nats: { + totalNodes: natsTotalNodes, + totalConnections: natsTotalConnections, + byRole: natsByRole + } + } +} + +async function getRouterOverview (_req, transaction) { + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + const interiorRouters = await RouterManager.findAll({ isDefault: false, isEdge: false }, transaction) + const fogMap = await _loadFogMap( + interiorRouters.map((router) => router.iofogUuid).concat(defaultRouter ? [defaultRouter.iofogUuid] : []), + transaction + ) + + let spokeGroups = [] + if (defaultRouter) { + const connections = await RouterConnectionManager.findAllWithRouters({ destRouter: defaultRouter.id }, transaction) + spokeGroups = _buildSpokeGroups( + connections, + (router) => _getRouterNodeId(router, defaultRouter), + (router) => _routerRole(router, defaultRouter), + Constants.DEFAULT_ROUTER_NAME + ) + } + + return { + defaultNode: defaultRouter ? 
_formatRouterListNode(defaultRouter, defaultRouter, fogMap) : null, + interiorNodes: interiorRouters.map((router) => _formatRouterListNode(router, defaultRouter, fogMap)), + spokeGroups + } +} + +async function getNatsOverview (_req, transaction) { + const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction) + const serverNodes = await NatsInstanceManager.findAll({ isLeaf: false, isHub: false }, transaction) + const fogMap = await _loadFogMap( + serverNodes.map((nats) => nats.iofogUuid).concat(defaultHub ? [defaultHub.iofogUuid] : []), + transaction + ) + + let spokeGroups = [] + if (defaultHub) { + const connections = await NatsConnectionManager.findAllWithNats({ destNats: defaultHub.id }, transaction) + spokeGroups = _buildSpokeGroups( + connections, + (nats) => _getNatsNodeId(nats, defaultHub), + (nats) => _natsRole(nats, defaultHub), + Constants.DEFAULT_NATS_HUB_NAME + ) + } + + return { + defaultNode: defaultHub ? _formatNatsListNode(defaultHub, defaultHub, fogMap) : null, + serverNodes: serverNodes.map((nats) => _formatNatsListNode(nats, defaultHub, fogMap)), + spokeGroups + } +} + +async function listRouterNodes (req, transaction) { + const { limit, offset } = await _validateListQuery(req.query) + const { where, empty } = await _buildRouterWhere(req.query || {}, transaction) + if (empty) { + return { nodes: [], total: 0, limit, offset } + } + + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + const { count, rows } = await _routerModel().findAndCountAll({ + where, + limit, + offset, + order: [['id', 'ASC']], + transaction + }) + + const fogMap = await _loadFogMap(rows.map((router) => router.iofogUuid), transaction) + return { + nodes: rows.map((router) => _formatRouterListNode(router, defaultRouter, fogMap)), + total: count, + limit, + offset + } +} + +async function listNatsNodes (req, transaction) { + const { limit, offset } = await _validateListQuery(req.query) + const { where, empty } = await 
_buildNatsWhere(req.query || {}, transaction) + if (empty) { + return { nodes: [], total: 0, limit, offset } + } + + const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction) + const { count, rows } = await _natsInstanceModel().findAndCountAll({ + where, + limit, + offset, + order: [['id', 'ASC']], + transaction + }) + + const fogMap = await _loadFogMap(rows.map((nats) => nats.iofogUuid), transaction) + return { + nodes: rows.map((nats) => _formatNatsListNode(nats, defaultHub, fogMap)), + total: count, + limit, + offset + } +} + +async function getRouterNode (req, transaction) { + const { router, defaultRouter } = await _findRouterByNodeId(req.params.id, transaction) + const fogMap = await _loadFogMap([router.iofogUuid], transaction) + return _formatRouterDetailNode(router, defaultRouter, fogMap) +} + +async function getNatsNode (req, transaction) { + const { nats, defaultHub } = await _findNatsByNodeId(req.params.id, transaction) + const fogMap = await _loadFogMap([nats.iofogUuid], transaction) + return _formatNatsDetailNode(nats, defaultHub, fogMap) +} + +async function getRouterNodeConnections (req, transaction) { + const { router, defaultRouter } = await _findRouterByNodeId(req.params.id, transaction) + const [upstreamConnections, downstreamConnections] = await Promise.all([ + RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction), + RouterConnectionManager.findAllWithRouters({ destRouter: router.id }, transaction) + ]) + + return { + upstream: (upstreamConnections || []).map((connection) => _formatRouterConnection(connection, defaultRouter)), + downstream: (downstreamConnections || []).map((connection) => _formatRouterConnection(connection, defaultRouter)) + } +} + +async function getNatsNodeConnections (req, transaction) { + const { nats, defaultHub } = await _findNatsByNodeId(req.params.id, transaction) + const [upstreamConnections, downstreamConnections] = await Promise.all([ + 
NatsConnectionManager.findAllWithNats({ sourceNats: nats.id }, transaction), + NatsConnectionManager.findAllWithNats({ destNats: nats.id }, transaction) + ]) + + return { + upstream: (upstreamConnections || []).map((connection) => _formatNatsConnection(connection, defaultHub)), + downstream: (downstreamConnections || []).map((connection) => _formatNatsConnection(connection, defaultHub)) + } +} + +async function listRouterConnections (req, transaction) { + const { limit, offset } = await _validateListQuery(req.query) + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + const { count, rows } = await _routerConnectionModel().findAndCountAll({ + include: [ + { model: _routerModel(), as: 'source', required: true }, + { model: _routerModel(), as: 'dest', required: true } + ], + limit, + offset, + order: [['id', 'ASC']], + transaction + }) + + return { + connections: rows.map((connection) => _formatRouterConnection(connection, defaultRouter)), + total: count, + limit, + offset + } +} + +async function listNatsConnections (req, transaction) { + const { limit, offset } = await _validateListQuery(req.query) + const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction) + const { count, rows } = await _natsConnectionModel().findAndCountAll({ + include: [ + { model: _natsInstanceModel(), as: 'source', required: true }, + { model: _natsInstanceModel(), as: 'dest', required: true } + ], + limit, + offset, + order: [['id', 'ASC']], + transaction + }) + + return { + connections: rows.map((connection) => _formatNatsConnection(connection, defaultHub)), + total: count, + limit, + offset + } +} + +async function _buildSubgraph (layer, centerId, depth, nodeLimit, transaction) { + const isRouter = layer === 'router' + const findByNodeId = isRouter ? _findRouterByNodeId : _findNatsByNodeId + const connectionManager = isRouter ? RouterConnectionManager : NatsConnectionManager + const sourceField = isRouter ? 
'sourceRouter' : 'sourceNats' + const destField = isRouter ? 'destRouter' : 'destNats' + const findAllWith = isRouter ? 'findAllWithRouters' : 'findAllWithNats' + const formatListNode = isRouter ? _formatRouterListNode : _formatNatsListNode + const formatConnection = isRouter ? _formatRouterConnection : _formatNatsConnection + + const centerLookup = await findByNodeId(centerId, transaction) + const anchor = isRouter ? centerLookup.router : centerLookup.nats + const defaultAnchor = isRouter ? centerLookup.defaultRouter : centerLookup.defaultHub + + const nodeRecords = new Map([[anchor.id, anchor]]) + const connectionRecords = new Map() + let frontierIds = new Set([anchor.id]) + + for (let hop = 0; hop < depth; hop++) { + const nextFrontier = new Set() + for (const nodeId of frontierIds) { + const [upstream, downstream] = await Promise.all([ + connectionManager[findAllWith]({ [sourceField]: nodeId }, transaction), + connectionManager[findAllWith]({ [destField]: nodeId }, transaction) + ]) + + for (const connection of [...(upstream || []), ...(downstream || [])]) { + connectionRecords.set(connection.id, connection) + nodeRecords.set(connection.source.id, connection.source) + nodeRecords.set(connection.dest.id, connection.dest) + if (connection.source.id !== nodeId) { + nextFrontier.add(connection.source.id) + } + if (connection.dest.id !== nodeId) { + nextFrontier.add(connection.dest.id) + } + } + } + frontierIds = nextFrontier + if (nodeRecords.size >= nodeLimit) { + break + } + } + + const limitedNodes = [...nodeRecords.values()].slice(0, nodeLimit) + const limitedNodeIds = new Set(limitedNodes.map((node) => node.id)) + const fogMap = await _loadFogMap(limitedNodes.map((node) => node.iofogUuid), transaction) + + const connections = [...connectionRecords.values()] + .filter((connection) => limitedNodeIds.has(connection.source.id) && limitedNodeIds.has(connection.dest.id)) + .map((connection) => formatConnection(connection, defaultAnchor)) + + return { + nodes: 
limitedNodes.map((node) => formatListNode(node, defaultAnchor, fogMap)), + connections + } +} + +async function getRouterSubgraph (req, transaction) { + const { center, depth, limit } = await _validateSubgraphQuery(req.query) + return _buildSubgraph('router', center, depth, limit, transaction) +} + +async function getNatsSubgraph (req, transaction) { + const { center, depth, limit } = await _validateSubgraphQuery(req.query) + return _buildSubgraph('nats', center, depth, limit, transaction) +} + +module.exports = { + getSummary: TransactionDecorator.generateTransaction(getSummary), + getRouterOverview: TransactionDecorator.generateTransaction(getRouterOverview), + getNatsOverview: TransactionDecorator.generateTransaction(getNatsOverview), + listRouterNodes: TransactionDecorator.generateTransaction(listRouterNodes), + listNatsNodes: TransactionDecorator.generateTransaction(listNatsNodes), + getRouterNode: TransactionDecorator.generateTransaction(getRouterNode), + getNatsNode: TransactionDecorator.generateTransaction(getNatsNode), + getRouterNodeConnections: TransactionDecorator.generateTransaction(getRouterNodeConnections), + getNatsNodeConnections: TransactionDecorator.generateTransaction(getNatsNodeConnections), + listRouterConnections: TransactionDecorator.generateTransaction(listRouterConnections), + listNatsConnections: TransactionDecorator.generateTransaction(listNatsConnections), + getRouterSubgraph: TransactionDecorator.generateTransaction(getRouterSubgraph), + getNatsSubgraph: TransactionDecorator.generateTransaction(getNatsSubgraph) +} diff --git a/test/src/controllers/network-topology-controller.test.js b/test/src/controllers/network-topology-controller.test.js new file mode 100644 index 00000000..f13701b1 --- /dev/null +++ b/test/src/controllers/network-topology-controller.test.js @@ -0,0 +1,27 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const NetworkTopologyController = 
require('../../../src/controllers/network-topology-controller') +const NetworkTopologyService = require('../../../src/services/network-topology-service') + +describe('Network Topology Controller', () => { + def('subject', () => NetworkTopologyController) + def('sandbox', () => sinon.createSandbox()) + + afterEach(() => $sandbox.restore()) + + it('getSummaryEndPoint delegates to service', async () => { + const summary = { controlPlane: 'remote' } + $sandbox.stub(NetworkTopologyService, 'getSummary').resolves(summary) + await expect($subject.getSummaryEndPoint({})).to.eventually.eql(summary) + expect(NetworkTopologyService.getSummary).to.have.been.calledOnce + }) + + it('listRouterNodesEndPoint delegates to service', async () => { + const req = { query: { limit: '10' } } + const payload = { nodes: [], total: 0, limit: 10, offset: 0 } + $sandbox.stub(NetworkTopologyService, 'listRouterNodes').resolves(payload) + await expect($subject.listRouterNodesEndPoint(req)).to.eventually.eql(payload) + expect(NetworkTopologyService.listRouterNodes).to.have.been.calledOnceWith(req) + }) +}) diff --git a/test/src/services/network-topology-service.test.js b/test/src/services/network-topology-service.test.js new file mode 100644 index 00000000..e23d3b18 --- /dev/null +++ b/test/src/services/network-topology-service.test.js @@ -0,0 +1,293 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const constants = require('../../../src/helpers/constants') +const Errors = require('../../../src/helpers/errors') +const RouterManager = require('../../../src/data/managers/router-manager') +const RouterConnectionManager = require('../../../src/data/managers/router-connection-manager') +const NatsInstanceManager = require('../../../src/data/managers/nats-instance-manager') +const NatsConnectionManager = require('../../../src/data/managers/nats-connection-manager') +const FogManager = require('../../../src/data/managers/iofog-manager') +const NetworkTopologyService = 
require('../../../src/services/network-topology-service') + +describe('Network Topology Service', () => { + const transaction = {} + const originalControlPlane = process.env.CONTROL_PLANE + + def('sandbox', () => sinon.createSandbox()) + + afterEach(() => { + $sandbox.restore() + if (originalControlPlane === undefined) { + delete process.env.CONTROL_PLANE + } else { + process.env.CONTROL_PLANE = originalControlPlane + } + }) + + function stubFogFindAll (rows = []) { + $sandbox.stub(FogManager, 'findAll').resolves(rows) + } + + function makeRouter (overrides = {}) { + return { + id: 1, + isEdge: true, + isDefault: false, + iofogUuid: 'edge-uuid', + host: '10.0.0.1', + messagingPort: 5671, + edgeRouterPort: null, + interRouterPort: null, + ...overrides + } + } + + function makeDefaultRouter () { + return makeRouter({ + id: 99, + isEdge: false, + isDefault: true, + iofogUuid: null, + host: 'router.local', + edgeRouterPort: 45671, + interRouterPort: 55671 + }) + } + + function makeNats (overrides = {}) { + return { + id: 2, + isLeaf: true, + isHub: false, + iofogUuid: 'edge-uuid', + host: '10.0.0.1', + serverPort: 4222, + leafPort: 7422, + clusterPort: 6222, + mqttPort: 8883, + httpPort: 8222, + jsStorageSize: null, + jsMemoryStoreSize: null, + ...overrides + } + } + + describe('getSummary()', () => { + beforeEach(() => { + process.env.CONTROL_PLANE = 'remote' + $sandbox.stub(RouterManager, 'getEntity').returns({ + count: $sandbox.stub().resolves(3) + }) + $sandbox.stub(RouterConnectionManager, 'getEntity').returns({ + count: $sandbox.stub().resolves(2) + }) + $sandbox.stub(NatsInstanceManager, 'getEntity').returns({ + count: $sandbox.stub().resolves(3) + }) + $sandbox.stub(NatsConnectionManager, 'getEntity').returns({ + count: $sandbox.stub().resolves(2) + }) + }) + + it('returns control plane and counts', async () => { + const result = await NetworkTopologyService.getSummary({}, transaction) + expect(result.controlPlane).to.equal('remote') + 
expect(result.router.totalNodes).to.equal(3) + expect(result.router.totalConnections).to.equal(2) + expect(result.nats.totalNodes).to.equal(3) + expect(result.nats.totalConnections).to.equal(2) + }) + }) + + describe('listRouterNodes()', () => { + beforeEach(() => { + process.env.CONTROL_PLANE = 'remote' + $sandbox.stub(RouterManager, 'findOne').resolves(makeDefaultRouter()) + stubFogFindAll([{ uuid: 'edge-uuid', name: 'edge-1', host: '10.0.0.1' }]) + $sandbox.stub(RouterManager, 'getEntity').returns({ + findAndCountAll: $sandbox.stub().resolves({ + count: 1, + rows: [makeRouter()] + }) + }) + }) + + it('returns paginated router nodes with fog metadata', async () => { + const result = await NetworkTopologyService.listRouterNodes({ query: { limit: '10', offset: '0' } }, transaction) + expect(result.total).to.equal(1) + expect(result.nodes).to.have.length(1) + expect(result.nodes[0]).to.include({ + id: 'edge-uuid', + iofogUuid: 'edge-uuid', + fogName: 'edge-1', + host: '10.0.0.1', + deploymentTarget: 'edgelet', + displayName: 'edge-1', + role: 'edge', + mode: 'edge' + }) + }) + }) + + describe('getRouterNode()', () => { + beforeEach(() => { + process.env.CONTROL_PLANE = 'kubernetes' + $sandbox.stub(RouterManager, 'findOne') + .onFirstCall().resolves(makeDefaultRouter()) + stubFogFindAll([]) + }) + + it('returns default router detail with kubernetes deployment target', async () => { + const result = await NetworkTopologyService.getRouterNode({ + params: { id: constants.DEFAULT_ROUTER_NAME } + }, transaction) + + expect(result.id).to.equal(constants.DEFAULT_ROUTER_NAME) + expect(result.deploymentTarget).to.equal('kubernetes') + expect(result.displayName).to.equal('Kubernetes Router') + expect(result.isDefault).to.equal(true) + }) + + it('throws when router node is missing', async () => { + RouterManager.findOne.reset() + RouterManager.findOne.resolves(null) + await expect(NetworkTopologyService.getRouterNode({ + params: { id: constants.DEFAULT_ROUTER_NAME } + }, 
transaction)).to.be.rejectedWith(Errors.NotFoundError) + }) + }) + + describe('getRouterNodeConnections()', () => { + beforeEach(() => { + const defaultRouter = makeDefaultRouter() + const edgeRouter = makeRouter() + $sandbox.stub(RouterManager, 'findOne') + .onFirstCall().resolves(defaultRouter) + .onSecondCall().resolves(edgeRouter) + $sandbox.stub(RouterConnectionManager, 'findAllWithRouters') + .onFirstCall().resolves([{ + id: 7, + source: edgeRouter, + dest: defaultRouter + }]) + .onSecondCall().resolves([]) + }) + + it('returns upstream and downstream connections', async () => { + const result = await NetworkTopologyService.getRouterNodeConnections({ + params: { id: 'edge-uuid' } + }, transaction) + + expect(result.upstream).to.eql([{ + id: 7, + source: 'edge-uuid', + dest: constants.DEFAULT_ROUTER_NAME + }]) + expect(result.downstream).to.eql([]) + }) + }) + + describe('listRouterConnections()', () => { + beforeEach(() => { + const defaultRouter = makeDefaultRouter() + const edgeRouter = makeRouter() + $sandbox.stub(RouterManager, 'findOne').resolves(defaultRouter) + $sandbox.stub(RouterManager, 'getEntity').returns({}) + $sandbox.stub(RouterConnectionManager, 'getEntity').returns({ + findAndCountAll: $sandbox.stub().resolves({ + count: 1, + rows: [{ + id: 7, + source: edgeRouter, + dest: defaultRouter + }] + }) + }) + }) + + it('returns paginated formatted connections', async () => { + const result = await NetworkTopologyService.listRouterConnections({ query: {} }, transaction) + expect(result.connections).to.eql([{ + id: 7, + source: 'edge-uuid', + dest: constants.DEFAULT_ROUTER_NAME + }]) + }) + }) + + describe('getRouterOverview()', () => { + beforeEach(() => { + process.env.CONTROL_PLANE = 'remote' + const defaultRouter = makeDefaultRouter() + const edgeRouter = makeRouter() + $sandbox.stub(RouterManager, 'findOne').resolves(defaultRouter) + $sandbox.stub(RouterManager, 'findAll').resolves([]) + $sandbox.stub(RouterConnectionManager, 
'findAllWithRouters').resolves([{ + id: 7, + source: edgeRouter, + dest: defaultRouter + }]) + stubFogFindAll([]) + }) + + it('returns default node and spoke groups', async () => { + const result = await NetworkTopologyService.getRouterOverview({}, transaction) + expect(result.defaultNode.id).to.equal(constants.DEFAULT_ROUTER_NAME) + expect(result.spokeGroups).to.eql([{ + upstreamOf: constants.DEFAULT_ROUTER_NAME, + role: 'edge', + count: 1 + }]) + }) + }) + + describe('getRouterSubgraph()', () => { + beforeEach(() => { + const defaultRouter = makeDefaultRouter() + const edgeRouter = makeRouter() + $sandbox.stub(RouterManager, 'findOne') + .onFirstCall().resolves(defaultRouter) + .onSecondCall().resolves(edgeRouter) + $sandbox.stub(RouterConnectionManager, 'findAllWithRouters') + .onFirstCall().resolves([{ id: 7, source: edgeRouter, dest: defaultRouter }]) + .onSecondCall().resolves([]) + stubFogFindAll([{ uuid: 'edge-uuid', name: 'edge-1', host: '10.0.0.1' }]) + }) + + it('requires center query parameter', async () => { + await expect(NetworkTopologyService.getRouterSubgraph({ query: {} }, transaction)) + .to.be.rejectedWith(Errors.ValidationError) + }) + + it('returns nodes and connections around center', async () => { + const result = await NetworkTopologyService.getRouterSubgraph({ + query: { center: constants.DEFAULT_ROUTER_NAME, depth: '1' } + }, transaction) + + expect(result.nodes.map((node) => node.id)).to.include(constants.DEFAULT_ROUTER_NAME) + expect(result.connections).to.have.length(1) + }) + }) + + describe('listNatsNodes()', () => { + beforeEach(() => { + process.env.CONTROL_PLANE = 'remote' + const defaultHub = makeNats({ id: 50, isLeaf: false, isHub: true, iofogUuid: null, host: 'nats.local' }) + $sandbox.stub(NatsInstanceManager, 'findOne').resolves(defaultHub) + stubFogFindAll([]) + $sandbox.stub(NatsInstanceManager, 'getEntity').returns({ + findAndCountAll: $sandbox.stub().resolves({ + count: 1, + rows: [defaultHub] + }) + }) + }) + + 
it('returns default hub node id', async () => { + const result = await NetworkTopologyService.listNatsNodes({ query: {} }, transaction) + expect(result.nodes[0].id).to.equal(constants.DEFAULT_NATS_HUB_NAME) + expect(result.nodes[0].deploymentTarget).to.equal('remote') + }) + }) +}) From 85d0fab06c0d7d7320404c10e87947408bf72d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 15:56:31 +0300 Subject: [PATCH 28/32] Stop passing false as the cluster controller service transaction argument. Let TransactionDecorator manage transactions instead of bypassing them from HTTP handlers. --- src/controllers/cluster-controller.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/controllers/cluster-controller.js b/src/controllers/cluster-controller.js index 75277634..847eb825 100644 --- a/src/controllers/cluster-controller.js +++ b/src/controllers/cluster-controller.js @@ -1,23 +1,23 @@ const ClusterControllerService = require('../services/cluster-controller-service') const listClusterControllersEndPoint = async function (req) { - return ClusterControllerService.listClusterControllers(false) + return ClusterControllerService.listClusterControllers() } const getClusterControllerEndPoint = async function (req) { const uuid = req.params.uuid - return ClusterControllerService.getClusterController(uuid, false) + return ClusterControllerService.getClusterController(uuid) } const updateClusterControllerEndPoint = async function (req) { const uuid = req.params.uuid const data = req.body - return ClusterControllerService.updateClusterController(uuid, data, false) + return ClusterControllerService.updateClusterController(uuid, data) } const deleteClusterControllerEndPoint = async function (req) { const uuid = req.params.uuid - return ClusterControllerService.deleteClusterController(uuid, false) + return ClusterControllerService.deleteClusterController(uuid) } module.exports = { From cf702af66eb2e730c9c25d15b74d0941982adead 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Fri, 3 Jul 2026 15:56:44 +0300 Subject: [PATCH 29/32] Bump embedded EdgeOps Console default version to v1.0.7. Align local dev, Makefile, Dockerfile, and CI build defaults with the new console release. --- .env.example | 2 +- .github/actions/set-build-env/action.yml | 2 +- Dockerfile | 4 ++-- Makefile | 4 ++-- scripts/build-console-dev.js | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.env.example b/.env.example index 7639b55f..a9970e67 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ NODE_ENV=development # EdgeOps Console static embed (npm run build:console → dev/console/build) EDGEOPS_CONSOLE_PATH=dev/console/build # must be absolute path -EDGEOPS_CONSOLE_VERSION=v1.0.6 +EDGEOPS_CONSOLE_VERSION=v1.0.7 # EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console # EDGEOPS_CONSOLE_FLAVOR=datasance diff --git a/.github/actions/set-build-env/action.yml b/.github/actions/set-build-env/action.yml index 41efa7ef..3413948a 100644 --- a/.github/actions/set-build-env/action.yml +++ b/.github/actions/set-build-env/action.yml @@ -8,7 +8,7 @@ runs: shell: bash run: | VERSION="${{ env.EDGEOPS_CONSOLE_VERSION }}" - if [ -z "$VERSION" ]; then VERSION="1.0.6"; fi + if [ -z "$VERSION" ]; then VERSION="1.0.7"; fi echo "EDGEOPS_CONSOLE_VERSION=$VERSION" >> "${GITHUB_ENV}" REPO="${{ env.EDGEOPS_CONSOLE_REPO }}" diff --git a/Dockerfile b/Dockerfile index a06cd222..0d7581eb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM node:24-bookworm@sha256:fdddfb3e688158251943d52eba361de991548f6814007acba4917ae6b512d6be AS console-builder ARG EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console -ARG EDGEOPS_CONSOLE_VERSION=v1.0.6 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.7 ARG EDGEOPS_CONSOLE_FLAVOR=datasance RUN apt-get update \ @@ -50,7 +50,7 @@ RUN npm pack # ubi9/nodejs-24-minimal:latest — pin manifest list digest for reproducible multi-arch builds FROM 
registry.access.redhat.com/ubi9/nodejs-24-minimal@sha256:cc7648f8e1c7d628e4334328a712f30ea0820787bb92836cc93e349674c689bf -ARG EDGEOPS_CONSOLE_VERSION=v1.0.6 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.7 ARG IMAGE_REGISTRY ARG OCI_SOURCE_REPO ARG CONTROLLER_DISTRIBUTION=iofog diff --git a/Makefile b/Makefile index 8c275fc2..47ffe50c 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Local Docker build — mirrors CI/release build-args (see .github/actions/set-build-env). -# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.6 +# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.7 FLAVOR ?= datasance IMAGE_NAME ?= controller @@ -25,7 +25,7 @@ else $(error FLAVOR must be "datasance" or "iofog", got "$(FLAVOR)") endif -EDGEOPS_CONSOLE_VERSION ?= v1.0.6 +EDGEOPS_CONSOLE_VERSION ?= v1.0.7 IMAGE_REF = $(IMAGE_REGISTRY)/$(IMAGE_NAME):$(DOCKER_TAG) diff --git a/scripts/build-console-dev.js b/scripts/build-console-dev.js index 09d8cf61..2b6bde1a 100644 --- a/scripts/build-console-dev.js +++ b/scripts/build-console-dev.js @@ -9,7 +9,7 @@ const CONSOLE_DIR = path.join(DEV_DIR, 'console') const BUILD_OUT = path.join(CONSOLE_DIR, 'build') const REPO = process.env.EDGEOPS_CONSOLE_REPO || 'https://github.com/Datasance/edgeops-console' -const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.6' +const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.7' const FLAVOR = process.env.EDGEOPS_CONSOLE_FLAVOR || 'datasance' function normalizeTag (version) { From b8d9955e9ac13c37c88fe8115441245fa789b62a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Sat, 4 Jul 2026 00:42:33 +0300 Subject: [PATCH 30/32] Filter inactive cluster controllers from list API by default. Add includeInactive query parameter to return historical replica rows when needed. 
--- docs/swagger.yaml | 12 +++- src/controllers/cluster-controller.js | 4 +- src/services/cluster-controller-service.js | 5 +- .../cluster-controller-service.test.js | 59 +++++++++++++++++++ 4 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 test/src/services/cluster-controller-service.test.js diff --git a/docs/swagger.yaml b/docs/swagger.yaml index ef1e24ea..0fd14abc 100755 --- a/docs/swagger.yaml +++ b/docs/swagger.yaml @@ -726,10 +726,20 @@ paths: get: tags: - Cluster - summary: Lists all cluster controllers + summary: Lists cluster controllers (active replicas by default) operationId: listClusterControllers security: - authToken: [] + parameters: + - in: query + name: includeInactive + description: >- + When true, include inactive controller rows (historical replicas from + past rollouts). Default false returns only active replicas. + required: false + schema: + type: boolean + default: false responses: "200": description: Success diff --git a/src/controllers/cluster-controller.js b/src/controllers/cluster-controller.js index 847eb825..1700e3da 100644 --- a/src/controllers/cluster-controller.js +++ b/src/controllers/cluster-controller.js @@ -1,7 +1,9 @@ const ClusterControllerService = require('../services/cluster-controller-service') +const { parseBoolean } = require('../config/parse-boolean') const listClusterControllersEndPoint = async function (req) { - return ClusterControllerService.listClusterControllers() + const includeInactive = parseBoolean(req.query && req.query.includeInactive, false) + return ClusterControllerService.listClusterControllers(includeInactive) } const getClusterControllerEndPoint = async function (req) { diff --git a/src/services/cluster-controller-service.js b/src/services/cluster-controller-service.js index 39b4b7e6..0ee792f3 100644 --- a/src/services/cluster-controller-service.js +++ b/src/services/cluster-controller-service.js @@ -85,8 +85,9 @@ async function updateHeartbeat (uuid, transaction) { ) } -async 
function listClusterControllers (transaction) { - const controllers = await ClusterControllerManager.findAll({}, transaction) +async function listClusterControllers (includeInactive, transaction) { + const where = includeInactive ? {} : { isActive: true } + const controllers = await ClusterControllerManager.findAll(where, transaction) return controllers.map(controller => ({ uuid: controller.uuid, host: controller.host, diff --git a/test/src/services/cluster-controller-service.test.js b/test/src/services/cluster-controller-service.test.js new file mode 100644 index 00000000..3eb7e8ef --- /dev/null +++ b/test/src/services/cluster-controller-service.test.js @@ -0,0 +1,59 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ClusterControllerService = require('../../../src/services/cluster-controller-service') +const ClusterControllerManager = require('../../../src/data/managers/cluster-controller-manager') + +describe('Cluster Controller Service', () => { + def('sandbox', () => sinon.createSandbox()) + afterEach(() => $sandbox.restore()) + + describe('.listClusterControllers()', () => { + const transaction = {} + const activeRow = { + uuid: 'active-uuid', + host: 'controller-pod-a', + processId: 1, + lastHeartbeat: new Date(), + isActive: true, + createdAt: new Date(), + updatedAt: new Date() + } + const inactiveRow = { + uuid: 'inactive-uuid', + host: 'controller-pod-b', + processId: 1, + lastHeartbeat: new Date(), + isActive: false, + createdAt: new Date(), + updatedAt: new Date() + } + + beforeEach(() => { + $sandbox.stub(ClusterControllerManager, 'findAll').callsFake((where) => { + const rows = [activeRow, inactiveRow] + if (where && where.isActive === true) { + return Promise.resolve(rows.filter((row) => row.isActive)) + } + return Promise.resolve(rows) + }) + }) + + it('returns only active controllers by default', async () => { + const result = await ClusterControllerService.listClusterControllers(false, transaction) + + 
expect(ClusterControllerManager.findAll).to.have.been.calledWith({ isActive: true }, transaction) + expect(result).to.have.length(1) + expect(result[0].uuid).to.equal('active-uuid') + expect(result[0].isActive).to.equal(true) + }) + + it('returns all controllers when includeInactive is true', async () => { + const result = await ClusterControllerService.listClusterControllers(true, transaction) + + expect(ClusterControllerManager.findAll).to.have.been.calledWith({}, transaction) + expect(result).to.have.length(2) + expect(result.map((row) => row.uuid)).to.deep.equal(['active-uuid', 'inactive-uuid']) + }) + }) +}) From 0ed285e5a353f8693f20bf570fa86f773c5f473d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Sat, 4 Jul 2026 00:42:41 +0300 Subject: [PATCH 31/32] Bump embedded EdgeOps Console default version to v1.0.8. Refresh UBI nodejs-24-minimal manifest-list digest pin for reproducible multi-arch builds. --- .env.example | 2 +- .github/actions/set-build-env/action.yml | 2 +- Dockerfile | 6 +++--- Makefile | 4 ++-- scripts/build-console-dev.js | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.env.example b/.env.example index a9970e67..02da5b63 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ NODE_ENV=development # EdgeOps Console static embed (npm run build:console → dev/console/build) EDGEOPS_CONSOLE_PATH=dev/console/build # must be absolute path -EDGEOPS_CONSOLE_VERSION=v1.0.7 +EDGEOPS_CONSOLE_VERSION=v1.0.8 # EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console # EDGEOPS_CONSOLE_FLAVOR=datasance diff --git a/.github/actions/set-build-env/action.yml b/.github/actions/set-build-env/action.yml index 3413948a..5d0cfee2 100644 --- a/.github/actions/set-build-env/action.yml +++ b/.github/actions/set-build-env/action.yml @@ -8,7 +8,7 @@ runs: shell: bash run: | VERSION="${{ env.EDGEOPS_CONSOLE_VERSION }}" - if [ -z "$VERSION" ]; then VERSION="1.0.7"; fi + if [ -z "$VERSION" ]; then VERSION="1.0.8"; fi 
echo "EDGEOPS_CONSOLE_VERSION=$VERSION" >> "${GITHUB_ENV}" REPO="${{ env.EDGEOPS_CONSOLE_REPO }}" diff --git a/Dockerfile b/Dockerfile index 0d7581eb..c3232d23 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM node:24-bookworm@sha256:fdddfb3e688158251943d52eba361de991548f6814007acba4917ae6b512d6be AS console-builder ARG EDGEOPS_CONSOLE_REPO=https://github.com/Datasance/edgeops-console -ARG EDGEOPS_CONSOLE_VERSION=v1.0.7 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.8 ARG EDGEOPS_CONSOLE_FLAVOR=datasance RUN apt-get update \ @@ -48,9 +48,9 @@ RUN npm pack # ubi9/nodejs-24-minimal:latest — pin manifest list digest for reproducible multi-arch builds -FROM registry.access.redhat.com/ubi9/nodejs-24-minimal@sha256:cc7648f8e1c7d628e4334328a712f30ea0820787bb92836cc93e349674c689bf +FROM registry.access.redhat.com/ubi9/nodejs-24-minimal@sha256:5f1ac8eab93c93eb2227f4ee7822668b312ee292d122dddd580bee8f17359c2f -ARG EDGEOPS_CONSOLE_VERSION=v1.0.7 +ARG EDGEOPS_CONSOLE_VERSION=v1.0.8 ARG IMAGE_REGISTRY ARG OCI_SOURCE_REPO ARG CONTROLLER_DISTRIBUTION=iofog diff --git a/Makefile b/Makefile index 47ffe50c..c02f42b7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Local Docker build — mirrors CI/release build-args (see .github/actions/set-build-env). 
-# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.7 +# Override any variable: make build FLAVOR=iofog EDGEOPS_CONSOLE_VERSION=v1.0.8 FLAVOR ?= datasance IMAGE_NAME ?= controller @@ -25,7 +25,7 @@ else $(error FLAVOR must be "datasance" or "iofog", got "$(FLAVOR)") endif -EDGEOPS_CONSOLE_VERSION ?= v1.0.7 +EDGEOPS_CONSOLE_VERSION ?= v1.0.8 IMAGE_REF = $(IMAGE_REGISTRY)/$(IMAGE_NAME):$(DOCKER_TAG) diff --git a/scripts/build-console-dev.js b/scripts/build-console-dev.js index 2b6bde1a..45235b97 100644 --- a/scripts/build-console-dev.js +++ b/scripts/build-console-dev.js @@ -9,7 +9,7 @@ const CONSOLE_DIR = path.join(DEV_DIR, 'console') const BUILD_OUT = path.join(CONSOLE_DIR, 'build') const REPO = process.env.EDGEOPS_CONSOLE_REPO || 'https://github.com/Datasance/edgeops-console' -const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.7' +const VERSION = process.env.EDGEOPS_CONSOLE_VERSION || 'v1.0.8' const FLAVOR = process.env.EDGEOPS_CONSOLE_FLAVOR || 'datasance' function normalizeTag (version) { From 7ed0b2cf0db1df358b666dbf8eda6ff4f3bb096c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Sat, 4 Jul 2026 00:42:45 +0300 Subject: [PATCH 32/32] Add skopeo-based script to verify Dockerfile base image digest pins. Compare pinned manifest-list digests against registry tags and report multi-arch platform coverage. --- scripts/check-dockerfile-digests.sh | 249 ++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100755 scripts/check-dockerfile-digests.sh diff --git a/scripts/check-dockerfile-digests.sh b/scripts/check-dockerfile-digests.sh new file mode 100755 index 00000000..01f0a282 --- /dev/null +++ b/scripts/check-dockerfile-digests.sh @@ -0,0 +1,249 @@ +#!/usr/bin/env bash +# Compare digest-pinned base images in a Dockerfile against registry tags via skopeo. 
+set -euo pipefail
+
+# Print command-line help to stdout.
+usage() {
+  cat <<'EOF'
+Usage: check-dockerfile-digests.sh [OPTIONS] DOCKERFILE
+
+Check digest-pinned base images in a Dockerfile against registry tags.
+
+Options:
+ -h, --help Show this help
+ --min-archs N Minimum platform count for multi-arch (default: 2)
+ --require PLATFORMS Comma-separated required platforms (e.g. linux/amd64,linux/arm64)
+
+Environment:
+ MIN_ARCHES Same as --min-archs
+ REQUIRE_PLATFORMS Same as --require
+
+Requires: skopeo, jq
+EOF
+}
+
+DOCKERFILE=""
+MIN_ARCHES="${MIN_ARCHES:-2}"
+REQUIRE_PLATFORMS="${REQUIRE_PLATFORMS:-}"
+
+# require_value OPTION ARGC
+# Abort with a readable message when a value-taking option is the last argument;
+# without this, `set -u` reports a cryptic "unbound variable" for $2.
+require_value() {
+  if [[ "$2" -lt 2 ]]; then
+    echo "error: option $1 requires a value" >&2
+    exit 1
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -h|--help) usage; exit 0 ;;
+    --min-archs) require_value "$1" "$#"; MIN_ARCHES="$2"; shift 2 ;;
+    --require) require_value "$1" "$#"; REQUIRE_PLATFORMS="$2"; shift 2 ;;
+    -*) echo "error: unknown option $1" >&2; exit 1 ;;
+    *)
+      if [[ -n "$DOCKERFILE" ]]; then
+        echo "error: only one Dockerfile path allowed" >&2
+        exit 1
+      fi
+      DOCKERFILE="$1"
+      shift
+      ;;
+  esac
+done
+
+DOCKERFILE="${DOCKERFILE:-Dockerfile}"
+
+# MIN_ARCHES feeds a numeric comparison later; reject anything that is not a plain
+# non-negative integer up front instead of letting [[ -lt ]] evaluate it as an expression.
+if [[ ! "$MIN_ARCHES" =~ ^[0-9]+$ ]]; then
+  echo "error: --min-archs / MIN_ARCHES must be a non-negative integer, got: ${MIN_ARCHES}" >&2
+  exit 1
+fi
+# Force base 10 so a value such as "08" is not parsed as invalid octal.
+MIN_ARCHES="$((10#$MIN_ARCHES))"
+
+if ! command -v skopeo >/dev/null 2>&1; then
+  cat >&2 <<'EOF'
+error: skopeo is required but not found in PATH.
+
+Install:
+ macOS: brew install skopeo
+ Fedora: dnf install skopeo
+ Ubuntu: apt install skopeo
+
+Skopeo reads registry auth from ~/.docker/config.json (docker login / podman login).
+EOF
+  exit 1
+fi
+
+if ! command -v jq >/dev/null 2>&1; then
+  echo "error: jq is required" >&2
+  exit 1
+fi
+
+if [[ ! -f "$DOCKERFILE" ]]; then
+  echo "error: Dockerfile not found: $DOCKERFILE" >&2
+  exit 1
+fi
+
+# Matches a comment of the form "# <image>[:tag] — pin manifest list digest" placed on the
+# line directly above a FROM; capture group 1 is the hinted image reference.
+COMMENT_TAG_RE='^[[:space:]]*#[[:space:]]*(.+)[[:space:]]—[[:space:]]*pin manifest list digest'
+
+# Matches a digest-pinned FROM line. Dockerfile instructions are case-insensitive and FROM
+# accepts leading flags such as --platform=..., so both are tolerated.
+# Capture group 2 is the image reference, group 3 the 64-hex-digit sha256 value.
+FROM_PIN_RE='^[[:space:]]*[Ff][Rr][Oo][Mm][[:space:]]+(--[^[:space:]]+[[:space:]]+)*([^[:space:]]+)@sha256:([a-f0-9]{64})'
+
+# has_tag IMAGE_REF
+# Succeeds when the last path component of IMAGE_REF carries a ":tag". Only the last
+# component is inspected so a registry port (host:5000/repo) is not mistaken for a tag.
+has_tag() {
+  local last="${1##*/}"
+  [[ "$last" == *:* ]]
+}
+
+# resolve_tag IMAGE_REF PREV_LINE
+# Print the tag to compare the pinned digest against: the tag embedded in IMAGE_REF if any,
+# otherwise the tag from a hint comment on PREV_LINE, otherwise "latest".
+resolve_tag() {
+  local image_ref="$1" prev_line="$2"
+
+  if has_tag "$image_ref"; then
+    echo "${image_ref##*:}"
+    return 0
+  fi
+
+  if [[ "$prev_line" =~ $COMMENT_TAG_RE ]]; then
+    local hinted="${BASH_REMATCH[1]}"
+    if has_tag "$hinted"; then
+      echo "${hinted##*:}"
+    else
+      echo "latest"
+    fi
+    return 0
+  fi
+
+  echo "latest"
+}
+
+# image_without_tag IMAGE_REF
+# Print IMAGE_REF with a trailing ":tag" removed; a registry port is left intact.
+image_without_tag() {
+  local image_ref="$1"
+  if has_tag "$image_ref"; then
+    # Shortest-suffix removal strips only the final ":tag".
+    echo "${image_ref%:*}"
+  else
+    echo "$image_ref"
+  fi
+}
+
+# Read a raw manifest (index or single image) on stdin and print one "os/arch" per line.
+# Index entries whose architecture is missing or "unknown" are dropped.
+platforms_from_raw() {
+  jq -r '
+    if (.manifests // .Manifests) then
+      [(.manifests // .Manifests)[]
+        | (.platform // .Platform)
+        | select(.architecture != null and .architecture != "unknown")
+        | "\(.os)/\(.architecture)"
+      ] | unique | .[]
+    elif (.architecture // .Architecture) then
+      "\(.os // .Os)/\(.architecture // .Architecture)"
+    else
+      empty
+    end
+  '
+}
+
+# is_index MEDIA_TYPE
+# Succeeds when MEDIA_TYPE names a Docker manifest list or an OCI image index.
+is_index() {
+  case "$1" in
+    application/vnd.docker.distribution.manifest.list.v2+json|application/vnd.oci.image.index.v1+json)
+      return 0
+      ;;
+    *)
+      return 1
+      ;;
+  esac
+}
+
+# report_multi_arch LABEL RAW_FILE
+# Print the platform coverage of the manifest stored in RAW_FILE. Returns non-zero when the
+# manifest is single-platform, has fewer than MIN_ARCHES platforms, or lacks a platform
+# listed in REQUIRE_PLATFORMS.
+report_multi_arch() {
+  local label="$1" raw_file="$2"
+  local media_type platforms count
+
+  media_type="$(jq -r '.mediaType // .MediaType // empty' "$raw_file")"
+
+  if is_index "$media_type"; then
+    platforms="$(platforms_from_raw < "$raw_file" | sort -u)"
+    # sed drops the empty line printf emits when no platform was found.
+    count="$(printf '%s\n' "$platforms" | sed '/^$/d' | wc -l | tr -d ' ')"
+    echo " ${label}: multi-arch yes (${count} platforms)"
+    while IFS= read -r platform; do
+      [[ -n "$platform" ]] && echo " - ${platform}"
+    done <<< "$platforms"
+
+    if [[ "$count" -lt "$MIN_ARCHES" ]]; then
+      echo " WARNING: ${label} has fewer than ${MIN_ARCHES} platforms."
+      return 1
+    fi
+  else
+    local single
+    single="$(platforms_from_raw < "$raw_file" | head -n1)"
+    echo " ${label}: multi-arch no (single platform: ${single:-unknown})"
+    return 1
+  fi
+
+  if [[ -n "$REQUIRE_PLATFORMS" ]]; then
+    IFS=',' read -ra required <<< "$REQUIRE_PLATFORMS"
+    for req in "${required[@]}"; do
+      # xargs trims surrounding whitespace from each comma-separated entry.
+      req="$(echo "$req" | xargs)"
+      if ! printf '%s\n' "$platforms" | grep -qx "$req"; then
+        echo " WARNING: ${label} missing required platform: ${req}"
+        return 1
+      fi
+    done
+  fi
+
+  return 0
+}
+
+# skopeo_tag_ref IMAGE_REF TAG
+# Print the skopeo transport reference for IMAGE_REF at TAG.
+skopeo_tag_ref() {
+  local image_ref="$1" tag="$2"
+  local image
+  image="$(image_without_tag "$image_ref")"
+  echo "docker://${image}:${tag}"
+}
+
+# skopeo_digest_ref IMAGE_REF DIGEST
+# Print the skopeo transport reference for IMAGE_REF pinned at DIGEST.
+skopeo_digest_ref() {
+  local image_ref="$1" digest="$2"
+  echo "docker://$(image_without_tag "$image_ref")@${digest}"
+}
+
+# skopeo_digest REF
+# Print the digest skopeo reports for REF.
+skopeo_digest() {
+  # Override host OS/arch so multi-arch tags resolve to the manifest-list digest on macOS too.
+  skopeo inspect "$1" \
+    --override-os linux \
+    --override-arch amd64 \
+    --format '{{.Digest}}'
+}
+
+# skopeo_raw_to_file DEST REF
+# Write the raw manifest of REF to the file DEST.
+skopeo_raw_to_file() {
+  local dest="$1"
+  skopeo inspect --raw "$2" > "$dest"
+}
+
+# Scratch directory for raw manifests. The EXIT trap removes it on every exit path, so a run
+# aborted by `set -e` on a failed skopeo call does not leave temp files behind.
+WORK_DIR="$(mktemp -d)"
+trap 'rm -rf "$WORK_DIR"' EXIT
+latest_raw="${WORK_DIR}/latest.json"
+pinned_raw="${WORK_DIR}/pinned.json"
+
+exit_code=0
+seen=()
+prev_line=""
+
+# Walk the Dockerfile; prev_line carries the preceding line so a tag hint comment placed
+# directly above a FROM can be picked up by resolve_tag.
+while IFS= read -r line || [[ -n "$line" ]]; do
+  if [[ "$line" =~ $FROM_PIN_RE ]]; then
+    image_ref="${BASH_REMATCH[2]}"
+    pinned="sha256:${BASH_REMATCH[3]}"
+
+    # Each distinct image reference is checked once, even if several stages use it.
+    if [[ " ${seen[*]:-} " == *" ${image_ref} "* ]]; then
+      prev_line="$line"
+      continue
+    fi
+    seen+=("$image_ref")
+
+    tag="$(resolve_tag "$image_ref" "$prev_line")"
+    tag_ref="$(skopeo_tag_ref "$image_ref" "$tag")"
+    digest_ref="$(skopeo_digest_ref "$image_ref" "$pinned")"
+
+    echo "==> ${image_ref} (tag: ${tag})"
+    echo " pinned: ${pinned}"
+
+    latest_digest="$(skopeo_digest "$tag_ref")"
+    echo " latest: ${latest_digest}"
+
+    # The scratch files are overwritten for every image.
+    skopeo_raw_to_file "$latest_raw" "$tag_ref"
+    skopeo_raw_to_file "$pinned_raw" "$digest_ref"
+
+    if [[ "$pinned" == "$latest_digest" ]]; then
+      echo " status: OK"
+    else
+      echo " status: OUTDATED"
+      echo
+      echo " Suggestion — update Dockerfile pin to latest multi-arch manifest list:"
+      echo " FROM ${image_ref}@${latest_digest}"
+      echo
+      exit_code=1
+    fi
+
+    report_multi_arch "Pinned digest" "$pinned_raw" || exit_code=1
+    report_multi_arch "Latest tag" "$latest_raw" || exit_code=1
+
+    echo
+  fi
+  prev_line="$line"
+done < "$DOCKERFILE"
+
+if [[ "${#seen[@]}" -eq 0 ]]; then
+  echo "error: no digest-pinned FROM lines in ${DOCKERFILE}" >&2
+  exit 1
+fi
+
+exit "$exit_code"