From 8afbbe1204236e6bfed109c8e8ed35d62f3bfdee Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Fri, 13 Mar 2026 13:46:56 +0000 Subject: [PATCH 01/18] LedgerSecrets: take_dependency_on_secrets **before** taking locking local Mutex --- src/node/ledger_secrets.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/node/ledger_secrets.h b/src/node/ledger_secrets.h index c7f0c13f8dd1..a2f31df5334c 100644 --- a/src/node/ledger_secrets.h +++ b/src/node/ledger_secrets.h @@ -170,10 +170,10 @@ namespace ccf VersionedLedgerSecret get_latest(ccf::kv::ReadOnlyTx& tx) { - std::lock_guard guard(lock); - take_dependency_on_secrets(tx); + std::lock_guard guard(lock); + if (ledger_secrets.empty()) { throw std::logic_error( @@ -186,10 +186,10 @@ namespace ccf std::pair> get_latest_and_penultimate(ccf::kv::ReadOnlyTx& tx) { - std::lock_guard guard(lock); - take_dependency_on_secrets(tx); + std::lock_guard guard(lock); + if (ledger_secrets.empty()) { throw std::logic_error( @@ -209,10 +209,10 @@ namespace ccf ccf::kv::ReadOnlyTx& tx, std::optional up_to = std::nullopt) { - std::lock_guard guard(lock); - take_dependency_on_secrets(tx); + std::lock_guard guard(lock); + if (!up_to.has_value()) { return ledger_secrets; From f2127ea7752bedc89b9f489c4279fc8825b4cf3d Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Fri, 13 Mar 2026 14:32:33 +0000 Subject: [PATCH 02/18] Don't call force_become_primary (locking) immediately after construction - become primary _during_ construction! --- src/consensus/aft/raft.h | 34 ++++++++++++++++++++++++++++++++-- src/node/node_state.h | 26 +++++++++++++++++--------- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/src/consensus/aft/raft.h b/src/consensus/aft/raft.h index 8754e430945e..9fddebaeacce 100644 --- a/src/consensus/aft/raft.h +++ b/src/consensus/aft/raft.h @@ -207,6 +207,16 @@ namespace aft std::unique_ptr ledger; std::shared_ptr channels; + // Describes how this node should force leadership at construction time, + // before any other thread can observe it (so no lock is needed). + struct StartupPrimaryInfo + { + // For recovery, these fields are populated from the recovered ledger + std::optional index = std::nullopt; + std::optional term = std::nullopt; + std::optional> view_history = std::nullopt; + }; + Aft( const ccf::consensus::Configuration& settings_, std::unique_ptr store_, @@ -214,7 +224,8 @@ namespace aft std::shared_ptr channels_, std::shared_ptr state_, std::shared_ptr rpc_request_context_, - bool public_only_ = false) : + bool public_only_ = false, + std::optional startup_primary = std::nullopt) : store(std::move(store_)), timeout_elapsed(0), @@ -236,7 +247,26 @@ namespace aft ledger(std::move(ledger_)), channels(std::move(channels_)) - {} + { + if (startup_primary.has_value()) + { + // Force leadership at construction time. No lock needed — this + // object is not yet visible to other threads. + const auto& sp = startup_primary.value(); + if (sp.index.has_value()) + { + // Recovery path + state->current_view = sp.term.value(); + state->last_idx = sp.index.value(); + state->commit_idx = sp.index.value(); + state->view_history.initialise(sp.view_history.value()); + state->view_history.update( + sp.index.value(), sp.term.value()); + } + state->current_view += starting_view_change; + become_leader(true); + } + } ~Aft() override = default; diff --git a/src/node/node_state.h b/src/node/node_state.h index 7ad1893e9c63..259e0a8eaa9e 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -365,6 +365,10 @@ namespace ccf NodeId self; std::shared_ptr node_encrypt_kp; ccf::crypto::Pem self_signed_node_cert; + + // Protects endorsed_node_cert only, to avoid taking the main lock in map + // hooks (which would create a lock cycle with consensus/snapshot locks) + pal::Mutex endorsed_cert_lock; std::optional endorsed_node_cert = std::nullopt; QuoteInfo quote_info; pal::PlatformAttestationMeasurement node_measurement; @@ -925,10 +929,10 @@ namespace ccf history->set_service_signing_identity( network.identity->get_key_pair(), config.cose_signatures); - setup_consensus(false, endorsed_node_cert); - - // Become the primary and force replication - consensus->force_become_primary(); + setup_consensus( + false, + endorsed_node_cert, + RaftType::StartupPrimaryInfo{}); LOG_INFO_FMT("Created new node {}", self); return {self_signed_node_cert, network.identity->cert}; @@ -1611,13 +1615,14 @@ namespace ccf } } - setup_consensus(true); + setup_consensus( + true, + std::nullopt, + RaftType::StartupPrimaryInfo{index, view, view_history}); auto_refresh_jwt_keys(); LOG_DEBUG_FMT("Restarting consensus at view: {} seqno: {}", view, index); - consensus->force_become_primary(index, view, view_history, index); - create_and_send_boot_request( new_term, false /* Restore consortium from ledger */); } @@ -2758,7 +2763,7 @@ namespace ccf "Could not find endorsed node certificate for {}", self)); } - std::lock_guard guard(lock); + std::lock_guard guard(endorsed_cert_lock); if (endorsed_node_cert.has_value()) { @@ -2999,6 +3004,8 @@ namespace ccf void setup_consensus( bool public_only = false, const std::optional& endorsed_node_certificate_ = + std::nullopt, + std::optional startup_primary = std::nullopt) { setup_n2n_channels(endorsed_node_certificate_); @@ -3016,7 +3023,8 @@ namespace ccf n2n_channels, shared_state, node_client, - public_only); + public_only, + startup_primary); network.tables->set_consensus(consensus); network.tables->set_snapshotter(snapshotter); From e50b4535b0a287afbd9561aecedfe507de388cf8 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Fri, 13 Mar 2026 15:21:26 +0000 Subject: [PATCH 03/18] Same pattern for init_as_backup --- src/consensus/aft/raft.h | 72 +++++++++++++++++++++++++++------------- src/node/node_state.h | 51 +++++++++++++++++----------- 2 files changed, 80 insertions(+), 43 deletions(-) diff --git a/src/consensus/aft/raft.h b/src/consensus/aft/raft.h index 9fddebaeacce..d1d38c262710 100644 --- a/src/consensus/aft/raft.h +++ b/src/consensus/aft/raft.h @@ -207,14 +207,28 @@ namespace aft std::unique_ptr ledger; std::shared_ptr channels; - // Describes how this node should force leadership at construction time, - // before any other thread can observe it (so no lock is needed). - struct StartupPrimaryInfo - { - // For recovery, these fields are populated from the recovered ledger - std::optional index = std::nullopt; - std::optional term = std::nullopt; - std::optional> view_history = std::nullopt; + enum class StartupRole + { + Primary, + Backup, + }; + + // Describes the initial role and state for this node at construction + // time, before any other thread can observe it (so no lock is needed). + struct StartupState + { + StartupRole role; + + // State to apply before becoming primary/backup. When nullopt for + // a primary, the node starts from scratch (genesis). + struct StateInfo + { + Index index; + Term term; + std::vector view_history; + Index recovery_start_index = 0; + }; + std::optional info = std::nullopt; }; Aft( @@ -225,7 +239,7 @@ namespace aft std::shared_ptr state_, std::shared_ptr rpc_request_context_, bool public_only_ = false, - std::optional startup_primary = std::nullopt) : + std::optional startup = std::nullopt) : store(std::move(store_)), timeout_elapsed(0), @@ -248,23 +262,35 @@ namespace aft ledger(std::move(ledger_)), channels(std::move(channels_)) { - if (startup_primary.has_value()) + if (startup.has_value()) { - // Force leadership at construction time. No lock needed — this - // object is not yet visible to other threads. - const auto& sp = startup_primary.value(); - if (sp.index.has_value()) + const auto& s = startup.value(); + if (s.info.has_value()) + { + const auto& si = s.info.value(); + if (s.role == StartupRole::Primary) + { + state->current_view = si.term; + state->last_idx = si.index; + state->commit_idx = si.index; + state->view_history.initialise(si.view_history); + state->view_history.update(si.index, si.term); + } + else + { + state->last_idx = si.index; + state->commit_idx = si.index; + state->view_history.initialise(si.view_history); + ledger->init(si.index, si.recovery_start_index); + become_aware_of_new_term(si.term); + } + } + + if (s.role == StartupRole::Primary) { - // Recovery path - state->current_view = sp.term.value(); - state->last_idx = sp.index.value(); - state->commit_idx = sp.index.value(); - state->view_history.initialise(sp.view_history.value()); - state->view_history.update( - sp.index.value(), sp.term.value()); + state->current_view += starting_view_change; + become_leader(true); } - state->current_view += starting_view_change; - become_leader(true); } } diff --git a/src/node/node_state.h b/src/node/node_state.h index 259e0a8eaa9e..72a36bad3fb4 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -932,7 +932,7 @@ namespace ccf setup_consensus( false, endorsed_node_cert, - RaftType::StartupPrimaryInfo{}); + RaftType::StartupState{RaftType::StartupRole::Primary}); LOG_INFO_FMT("Created new node {}", self); return {self_signed_node_cert, network.identity->cert}; @@ -1149,9 +1149,6 @@ namespace ccf } n2n_channels_cert = resp.network_info->endorsed_certificate.value(); - setup_consensus(resp.network_info->public_only, n2n_channels_cert); - auto_refresh_jwt_keys(); - if (resp.network_info->public_only) { last_recovered_signed_idx = @@ -1162,23 +1159,18 @@ namespace ccf View view = VIEW_UNKNOWN; std::vector view_history_ = {}; + ccf::kv::ConsensusHookPtrs snapshot_hooks; if (startup_snapshot_info) { // It is only possible to deserialise the entire snapshot now, // once the ledger secrets have been passed in by the network - ccf::kv::ConsensusHookPtrs hooks; deserialise_snapshot( network.tables, startup_snapshot_info->raw, - hooks, + snapshot_hooks, &view_history_, resp.network_info->public_only); - for (auto& hook : hooks) - { - hook->call(consensus.get()); - } - auto tx = network.tables->create_read_only_tx(); auto* signatures = tx.ro(network.signatures); auto sig = signatures->get(); @@ -1204,11 +1196,28 @@ namespace ccf view); } - consensus->init_as_backup( - network.tables->current_version(), - view, - view_history_, - last_recovered_signed_idx); + // Create consensus with backup init info baked in, so + // init_as_backup runs in the constructor before any other + // thread can see the consensus object. + setup_consensus( + resp.network_info->public_only, + n2n_channels_cert, + RaftType::StartupState{ + RaftType::StartupRole::Backup, + RaftType::StartupState::StateInfo{ + network.tables->current_version(), + view, + view_history_, + last_recovered_signed_idx}}); + + // Now that consensus exists, execute any hooks from the + // snapshot (e.g. ConfigurationChangeHook) + for (auto& hook : snapshot_hooks) + { + hook->call(consensus.get()); + } + + auto_refresh_jwt_keys(); snapshotter->set_last_snapshot_idx( network.tables->current_version()); @@ -1618,7 +1627,10 @@ namespace ccf setup_consensus( true, std::nullopt, - RaftType::StartupPrimaryInfo{index, view, view_history}); + RaftType::StartupState{ + RaftType::StartupRole::Primary, + RaftType::StartupState::StateInfo{ + index, view, view_history}}); auto_refresh_jwt_keys(); LOG_DEBUG_FMT("Restarting consensus at view: {} seqno: {}", view, index); @@ -3005,8 +3017,7 @@ namespace ccf bool public_only = false, const std::optional& endorsed_node_certificate_ = std::nullopt, - std::optional startup_primary = - std::nullopt) + std::optional startup = std::nullopt) { setup_n2n_channels(endorsed_node_certificate_); setup_cmd_forwarder(); @@ -3024,7 +3035,7 @@ namespace ccf shared_state, node_client, public_only, - startup_primary); + startup); network.tables->set_consensus(consensus); network.tables->set_snapshotter(snapshotter); From f624aae22bec4379381a4f1aaa21dfa2eaa9a38b Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Fri, 13 Mar 2026 16:32:15 +0000 Subject: [PATCH 04/18] Expand new mutex to cover self-signed cert too. Unpick some knots --- src/node/node_state.h | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 72a36bad3fb4..576b27c2331d 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -366,8 +366,9 @@ namespace ccf std::shared_ptr node_encrypt_kp; ccf::crypto::Pem self_signed_node_cert; - // Protects endorsed_node_cert only, to avoid taking the main lock in map - // hooks (which would create a lock cycle with consensus/snapshot locks) + // Protects endorsed_node_cert and self_signed_node_cert. This lock is + // used instead of the main NodeState lock in map/global hooks to avoid + // lock-order-inversion with KV maps_lock and consensus state->lock. pal::Mutex endorsed_cert_lock; std::optional endorsed_node_cert = std::nullopt; QuoteInfo quote_info; @@ -889,6 +890,7 @@ namespace ccf StartType start_type_, const ccf::StartupConfig& config_) { std::lock_guard guard(lock); + std::lock_guard cert_guard(endorsed_cert_lock); sm.expect(NodeStartupState::initialized); start_type = start_type_; @@ -987,7 +989,7 @@ namespace ccf auto join_client_cert = std::make_unique<::tls::Cert>( network_ca, - self_signed_node_cert, + get_self_signed_certificate_unsafe(), node_sign_kp->private_key_pem(), target_host); @@ -1346,7 +1348,7 @@ namespace ccf rpcsessions, rpc_map, node_sign_kp, - self_signed_node_cert); + get_self_signed_certificate_unsafe()); jwt_key_auto_refresh->start(); network.tables->set_map_hook( @@ -2327,7 +2329,12 @@ namespace ccf ccf::crypto::Pem get_self_signed_certificate() override { - std::lock_guard guard(lock); + std::lock_guard guard(endorsed_cert_lock); + return self_signed_node_cert; + } + + ccf::crypto::Pem get_self_signed_certificate_unsafe() + { return self_signed_node_cert; } @@ -2548,8 +2555,9 @@ namespace ccf bool send_create_request(const std::vector& packed) { + auto self_signed_cert = get_self_signed_certificate(); auto node_session = std::make_shared( - InvalidSessionId, self_signed_node_cert.raw()); + InvalidSessionId, self_signed_cert.raw()); auto ctx = make_rpc_context(node_session, packed); std::shared_ptr search = @@ -2825,7 +2833,7 @@ namespace ccf "Could not find endorsed node certificate for {}", self)); } - std::lock_guard guard(lock); + std::lock_guard guard(endorsed_cert_lock); LOG_INFO_FMT("[global] Accepting network connections"); accept_network_tls_connections(); @@ -3024,6 +3032,8 @@ namespace ccf auto shared_state = std::make_shared(self); + // Caller must ensure endorsed_cert_lock is held, or that the cert + // fields are stable (e.g. during single-threaded startup). auto node_client = std::make_shared( rpc_map, node_sign_kp, self_signed_node_cert, endorsed_node_cert); @@ -3199,6 +3209,7 @@ namespace ccf std::optional client_cert_key = std::nullopt; if (authenticate_as_node_client_certificate) { + std::lock_guard cert_guard(endorsed_cert_lock); client_cert = endorsed_node_cert ? *endorsed_node_cert : self_signed_node_cert; client_cert_key = node_sign_kp->private_key_pem(); From edbb79327a365c1a1eec94bf017f341f280d0d74 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 11:14:24 +0000 Subject: [PATCH 05/18] Flatten open_frontend calls --- src/node/node_state.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 576b27c2331d..86847e841ebc 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2437,11 +2437,6 @@ namespace ccf find_frontend(actor)->open(); } - void open_user_frontend() - { - open_frontend(ActorsType::users); - } - bool is_member_frontend_open_unsafe() { return find_frontend(ActorsType::members)->is_open(); @@ -2912,7 +2907,7 @@ namespace ccf network.identity->set_certificate(w->cert); if (w->status == ServiceStatus::OPEN) { - open_user_frontend(); + open_frontend(ActorsType::users); RINGBUFFER_WRITE_MESSAGE(::consensus::ledger_open, to_host); LOG_INFO_FMT("Service open at seqno {}", hook_version); From a62992554d7ceb0a8f0395f0a9fe094757921449 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 11:31:44 +0000 Subject: [PATCH 06/18] Implement asynchronous frontend opening to avoid locking issues during KV or consensus operations --- src/node/node_state.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 86847e841ebc..285c59d28ae6 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -906,7 +906,7 @@ namespace ccf config.node_certificate.initial_validity_days); accept_node_tls_connections(); - open_frontend(ActorsType::nodes); + open_frontend_async(ActorsType::nodes); // Signatures are only emitted on a timer once the public ledger has been // recovered @@ -1631,8 +1631,7 @@ namespace ccf std::nullopt, RaftType::StartupState{ RaftType::StartupRole::Primary, - RaftType::StartupState::StateInfo{ - index, view, view_history}}); + RaftType::StartupState::StateInfo{index, view, view_history}}); auto_refresh_jwt_keys(); LOG_DEBUG_FMT("Restarting consensus at view: {} seqno: {}", view, index); @@ -2437,6 +2436,16 @@ namespace ccf find_frontend(actor)->open(); } + void open_frontend_async(ActorsType actor) + { + // Schedule frontend opening on a task to avoid calling open() (which + // may take locks to set up frontend-specific systems) while KV or + // consensus locks are held — e.g. from global hooks during + // post_compact(). + ccf::tasks::add_task( + ccf::tasks::make_basic_task([this, actor]() { open_frontend(actor); })); + } + bool is_member_frontend_open_unsafe() { return find_frontend(ActorsType::members)->is_open(); @@ -2869,7 +2878,7 @@ namespace ccf } LOG_INFO_FMT("[global] Opening members frontend"); - open_frontend(ActorsType::members); + open_frontend_async(ActorsType::members); } })); @@ -2907,7 +2916,7 @@ namespace ccf network.identity->set_certificate(w->cert); if (w->status == ServiceStatus::OPEN) { - open_frontend(ActorsType::users); + open_frontend_async(ActorsType::users); RINGBUFFER_WRITE_MESSAGE(::consensus::ledger_open, to_host); LOG_INFO_FMT("Service open at seqno {}", hook_version); From c39a9aa5269dda42beb502099ea2faffb79aa56b Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 12:02:01 +0000 Subject: [PATCH 07/18] Remove transition_service_to_open lock, explain why --- src/node/node_state.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 285c59d28ae6..7293b821911e 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -1947,7 +1947,14 @@ namespace ccf ccf::kv::Tx& tx, AbstractGovernanceEffects::ServiceIdentities identities) override { - std::lock_guard guard(lock); + // NB: NodeState::lock is deliberately not held here. All member + // accesses in this function are either via the passed-in tx (which has + // its own KV-level protection), read-only on effectively-immutable + // fields (config, network.identity, sm), or on fields with their own + // locks (share_manager, LedgerSecrets). Holding NodeState::lock would + // create a lock-order-inversion with KV maps_lock, since this function + // is called from governance endpoints that may hold maps_lock (via + // apply_changes) or be called under it indirectly. auto* service = tx.rw(Tables::SERVICE); auto service_info = service->get(); From b208158b847b183e2038e073326ec5928235ad88 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 12:02:11 +0000 Subject: [PATCH 08/18] Remove unnecessary suppressions --- tsan_env_suppressions | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tsan_env_suppressions b/tsan_env_suppressions index 3ee6d5ecdee7..c2535499f0c9 100644 --- a/tsan_env_suppressions +++ b/tsan_env_suppressions @@ -4,13 +4,6 @@ # Awkward usages of '*' in this file like '/ds/*ring_buffer.h' are necessary to handle the cases where tsan thinks # src/ds/ring_buffer.h as src/ds/test/../ring_buffer.h for example -# For partitions_test -deadlock:*/store.h -deadlock:*/untyped_map.h - -# For governance_test -race:*/node/*rpc/*frontend.h - # Race between closedir and epoll_ctl. race:closedir race:epoll_ctl From 566c030e932fb6efab3e72b4ef20aca810097cd5 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 15:02:55 +0000 Subject: [PATCH 09/18] Tidy --- src/consensus/aft/raft.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/consensus/aft/raft.h b/src/consensus/aft/raft.h index d1d38c262710..6f9edf84139f 100644 --- a/src/consensus/aft/raft.h +++ b/src/consensus/aft/raft.h @@ -207,7 +207,7 @@ namespace aft std::unique_ptr ledger; std::shared_ptr channels; - enum class StartupRole + enum class StartupRole : std::uint8_t { Primary, Backup, @@ -223,8 +223,8 @@ namespace aft // a primary, the node starts from scratch (genesis). struct StateInfo { - Index index; - Term term; + Index index = 0; + Term term = 0; std::vector view_history; Index recovery_start_index = 0; }; From e18333363334d7c32ebc0b3c460bff7f9ba85116 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 16:44:58 +0000 Subject: [PATCH 10/18] Fix data race in update_consensus() --- src/enclave/rpc_handler.h | 6 +++++- src/node/node_state.h | 11 ++++++++--- src/node/rpc/frontend.h | 36 ++++++++++-------------------------- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/src/enclave/rpc_handler.h b/src/enclave/rpc_handler.h index 2005d3bd1b44..663fc910dadb 100644 --- a/src/enclave/rpc_handler.h +++ b/src/enclave/rpc_handler.h @@ -14,6 +14,8 @@ namespace ccf::kv { class CommittableTx; + class Consensus; + class TxHistory; } namespace ccf @@ -31,7 +33,9 @@ namespace ccf virtual void set_cmd_forwarder( std::shared_ptr cmd_forwarder_) = 0; virtual void tick(std::chrono::milliseconds /*elapsed*/) {} - virtual void open() = 0; + virtual void open( + ccf::kv::Consensus* consensus = nullptr, + ccf::kv::TxHistory* history = nullptr) = 0; virtual bool is_open() = 0; // Used by rpcendpoint to process incoming client RPCs diff --git a/src/node/node_state.h b/src/node/node_state.h index 7293b821911e..bcaa34cfcd87 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -906,7 +906,6 @@ namespace ccf config.node_certificate.initial_validity_days); accept_node_tls_connections(); - open_frontend_async(ActorsType::nodes); // Signatures are only emitted on a timer once the public ledger has been // recovered @@ -936,6 +935,8 @@ namespace ccf endorsed_node_cert, RaftType::StartupState{RaftType::StartupRole::Primary}); + open_frontend(ActorsType::nodes); + LOG_INFO_FMT("Created new node {}", self); return {self_signed_node_cert, network.identity->cert}; } @@ -1212,6 +1213,8 @@ namespace ccf view_history_, last_recovered_signed_idx}}); + open_frontend(ActorsType::nodes); + // Now that consensus exists, execute any hooks from the // snapshot (e.g. ConfigurationChangeHook) for (auto& hook : snapshot_hooks) @@ -1632,6 +1635,8 @@ namespace ccf RaftType::StartupState{ RaftType::StartupRole::Primary, RaftType::StartupState::StateInfo{index, view, view_history}}); + + open_frontend(ActorsType::nodes); auto_refresh_jwt_keys(); LOG_DEBUG_FMT("Restarting consensus at view: {} seqno: {}", view, index); @@ -2440,7 +2445,7 @@ namespace ccf void open_frontend(ActorsType actor) { - find_frontend(actor)->open(); + find_frontend(actor)->open(consensus.get(), history.get()); } void open_frontend_async(ActorsType actor) @@ -2885,7 +2890,7 @@ namespace ccf } LOG_INFO_FMT("[global] Opening members frontend"); - open_frontend_async(ActorsType::members); + open_frontend(ActorsType::members); } })); diff --git a/src/node/rpc/frontend.h b/src/node/rpc/frontend.h index 9b41847d3aa8..62ae51d81c86 100644 --- a/src/node/rpc/frontend.h +++ b/src/node/rpc/frontend.h @@ -52,23 +52,6 @@ namespace ccf std::shared_ptr node_configuration_subsystem = nullptr; - void update_consensus() - { - auto* c = tables.get_consensus().get(); - - if (consensus != c) - { - consensus = c; - endpoints.set_consensus(consensus); - } - } - - void update_history() - { - history = tables.get_history().get(); - endpoints.set_history(history); - } - endpoints::EndpointDefinitionPtr find_endpoint( std::shared_ptr ctx, ccf::kv::CommittableTx& tx) { @@ -687,8 +670,6 @@ namespace ccf } ++attempts; - update_history(); - endpoint = find_endpoint(ctx, *tx_p); if (endpoint == nullptr) { @@ -972,13 +953,22 @@ namespace ccf cmd_forwarder = cmd_forwarder_; } - void open() override + void open( + ccf::kv::Consensus* consensus_ = nullptr, + ccf::kv::TxHistory* history_ = nullptr) override { std::lock_guard mguard(open_lock); if (!is_open_) { LOG_INFO_FMT("Opening frontend"); is_open_ = true; + + consensus = consensus_; + endpoints.set_consensus(consensus); + + history = history_; + endpoints.set_history(history); + endpoints.init_handlers(); } } @@ -994,7 +984,6 @@ namespace ccf { if (endpoints.request_needs_root(ctx)) { - update_history(); if (history != nullptr) { // Warning: Retrieving the current TxID and root from the history @@ -1019,8 +1008,6 @@ namespace ccf */ void process(std::shared_ptr ctx) override { - update_consensus(); - // NB: If we want to re-execute on backups, the original command could // be propagated from here process_command(ctx); @@ -1038,7 +1025,6 @@ namespace ccf "Processing forwarded command with unitialised forwarded context"); } - update_consensus(); process_command(ctx); if (ctx->response_is_pending) { @@ -1050,8 +1036,6 @@ namespace ccf void tick(std::chrono::milliseconds elapsed) override { - update_consensus(); - endpoints.tick(elapsed); } }; From a27b85d6b245927c78f2a3a669ddd217104148a1 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 19:19:11 +0000 Subject: [PATCH 11/18] setup_basic_hooks, independently and early --- src/node/node_state.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index bcaa34cfcd87..fb2e72546a86 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -910,6 +910,7 @@ namespace ccf // Signatures are only emitted on a timer once the public ledger has been // recovered setup_history(); + setup_basic_hooks(); setup_snapshotter(); setup_encryptor(); @@ -2890,7 +2891,7 @@ namespace ccf } LOG_INFO_FMT("[global] Opening members frontend"); - open_frontend(ActorsType::members); + open_frontend_async(ActorsType::members); } })); @@ -2911,7 +2912,7 @@ namespace ccf ->public_key_pem(); if (hook_pubk_pem != current_pubk_pem) { - LOG_TRACE_FMT( + LOG_INFO_FMT( "Ignoring historical service open at seqno {} for {}", hook_version, w->cert.str()); @@ -3160,7 +3161,9 @@ namespace ccf } })); - setup_basic_hooks(); + // Note: setup_basic_hooks() is intentionally NOT called here. It is + // called earlier in create(), before snapshot deserialization, to ensure + // global hooks are registered before any data is committed to the maps. } void setup_snapshotter() From faa669b342f02fd2a3ec3e9b2c4fb986ed8784a2 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Tue, 17 Mar 2026 19:37:50 +0000 Subject: [PATCH 12/18] Simpler - separate open and set_consensus_and_history --- src/enclave/rpc_handler.h | 6 +++--- src/node/node_state.h | 13 +++++++------ src/node/rpc/frontend.h | 21 +++++++++++---------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/enclave/rpc_handler.h b/src/enclave/rpc_handler.h index 663fc910dadb..92cd49e60efb 100644 --- a/src/enclave/rpc_handler.h +++ b/src/enclave/rpc_handler.h @@ -33,10 +33,10 @@ namespace ccf virtual void set_cmd_forwarder( std::shared_ptr cmd_forwarder_) = 0; virtual void tick(std::chrono::milliseconds /*elapsed*/) {} - virtual void open( - ccf::kv::Consensus* consensus = nullptr, - ccf::kv::TxHistory* history = nullptr) = 0; + virtual void open() = 0; virtual bool is_open() = 0; + virtual void set_consensus_and_history( + ccf::kv::Consensus* consensus, ccf::kv::TxHistory* history) = 0; // Used by rpcendpoint to process incoming client RPCs virtual void process(std::shared_ptr ctx) = 0; diff --git a/src/node/node_state.h b/src/node/node_state.h index fb2e72546a86..ae21e88d3e32 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -906,6 +906,7 @@ namespace ccf config.node_certificate.initial_validity_days); accept_node_tls_connections(); + open_frontend(ActorsType::nodes); // Signatures are only emitted on a timer once the public ledger has been // recovered @@ -936,8 +937,6 @@ namespace ccf endorsed_node_cert, RaftType::StartupState{RaftType::StartupRole::Primary}); - open_frontend(ActorsType::nodes); - LOG_INFO_FMT("Created new node {}", self); return {self_signed_node_cert, network.identity->cert}; } @@ -1214,8 +1213,6 @@ namespace ccf view_history_, last_recovered_signed_idx}}); - open_frontend(ActorsType::nodes); - // Now that consensus exists, execute any hooks from the // snapshot (e.g. ConfigurationChangeHook) for (auto& hook : snapshot_hooks) @@ -1637,7 +1634,6 @@ namespace ccf RaftType::StartupRole::Primary, RaftType::StartupState::StateInfo{index, view, view_history}}); - open_frontend(ActorsType::nodes); auto_refresh_jwt_keys(); LOG_DEBUG_FMT("Restarting consensus at view: {} seqno: {}", view, index); @@ -2446,7 +2442,7 @@ namespace ccf void open_frontend(ActorsType actor) { - find_frontend(actor)->open(consensus.get(), history.get()); + find_frontend(actor)->open(); } void open_frontend_async(ActorsType actor) @@ -3067,6 +3063,11 @@ namespace ccf network.tables->set_consensus(consensus); network.tables->set_snapshotter(snapshotter); + for (auto& [actor, fe] : rpc_map->frontends()) + { + fe->set_consensus_and_history(consensus.get(), history.get()); + } + // When a node is added, even locally, inform consensus so that it // can add a new active configuration. network.tables->set_map_hook( diff --git a/src/node/rpc/frontend.h b/src/node/rpc/frontend.h index 62ae51d81c86..5040d03aaf94 100644 --- a/src/node/rpc/frontend.h +++ b/src/node/rpc/frontend.h @@ -953,26 +953,27 @@ namespace ccf cmd_forwarder = cmd_forwarder_; } - void open( - ccf::kv::Consensus* consensus_ = nullptr, - ccf::kv::TxHistory* history_ = nullptr) override + void open() override { std::lock_guard mguard(open_lock); if (!is_open_) { LOG_INFO_FMT("Opening frontend"); is_open_ = true; - - consensus = consensus_; - endpoints.set_consensus(consensus); - - history = history_; - endpoints.set_history(history); - endpoints.init_handlers(); } } + void set_consensus_and_history( + ccf::kv::Consensus* consensus_, ccf::kv::TxHistory* history_) override + { + consensus = consensus_; + endpoints.set_consensus(consensus); + + history = history_; + endpoints.set_history(history); + } + bool is_open() override { std::lock_guard mguard(open_lock); From 8bb4a70c6db9d09e9acfafff489e5035cef5094e Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Wed, 18 Mar 2026 10:18:46 +0000 Subject: [PATCH 13/18] set_consensus_and_history in unit tests --- src/node/rpc/test/frontend_test.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/node/rpc/test/frontend_test.cpp b/src/node/rpc/test/frontend_test.cpp index 82f0ce9a3289..1c878b724b89 100644 --- a/src/node/rpc/test/frontend_test.cpp +++ b/src/node/rpc/test/frontend_test.cpp @@ -1055,6 +1055,8 @@ TEST_CASE("Forwarding" * doctest::test_suite("forwarding")) auto primary_consensus = std::make_shared(); network_primary.tables->set_consensus(primary_consensus); + user_frontend_primary.set_consensus_and_history( + primary_consensus.get(), nullptr); auto channel_stub = std::make_shared(); auto rpc_responder = std::weak_ptr(); @@ -1064,6 +1066,8 @@ TEST_CASE("Forwarding" * doctest::test_suite("forwarding")) auto backup_consensus = std::make_shared(); network_backup.tables->set_consensus(backup_consensus); + user_frontend_backup.set_consensus_and_history( + backup_consensus.get(), nullptr); auto simple_call = create_simple_request(); auto serialized_call = simple_call.build_request(); @@ -1089,6 +1093,8 @@ TEST_CASE("Forwarding" * doctest::test_suite("forwarding")) { INFO("Read command is not forwarded to primary"); TestUserFrontend user_frontend_backup_read(*network_backup.tables); + user_frontend_backup_read.set_consensus_and_history( + backup_consensus.get(), nullptr); REQUIRE(channel_stub->is_empty()); user_frontend_backup_read.process(backup_ctx); @@ -1161,6 +1167,8 @@ TEST_CASE("Forwarding" * doctest::test_suite("forwarding")) INFO("Read command is now forwarded to primary on this session"); TestUserFrontend user_frontend_backup_read(*network_backup.tables); + user_frontend_backup_read.set_consensus_and_history( + backup_consensus.get(), nullptr); user_frontend_backup_read.set_cmd_forwarder(backup_forwarder); REQUIRE(channel_stub->is_empty()); @@ -1203,6 +1211,8 @@ TEST_CASE("Nodefrontend forwarding" * doctest::test_suite("forwarding")) auto primary_consensus = std::make_shared(); network_primary.tables->set_consensus(primary_consensus); + node_frontend_primary.set_consensus_and_history( + primary_consensus.get(), nullptr); auto rpc_responder = std::weak_ptr(); auto rpc_map = std::weak_ptr(); @@ -1212,6 +1222,8 @@ TEST_CASE("Nodefrontend forwarding" * doctest::test_suite("forwarding")) auto backup_consensus = std::make_shared(); network_backup.tables->set_consensus(backup_consensus); + node_frontend_backup.set_consensus_and_history( + backup_consensus.get(), nullptr); auto write_req = create_simple_request(); auto serialized_call = write_req.build_request(); @@ -1254,6 +1266,8 @@ TEST_CASE("Userfrontend forwarding" * doctest::test_suite("forwarding")) auto primary_consensus = std::make_shared(); network_primary.tables->set_consensus(primary_consensus); + user_frontend_primary.set_consensus_and_history( + primary_consensus.get(), nullptr); auto rpc_responder = std::weak_ptr(); auto rpc_map = std::weak_ptr(); @@ -1263,6 +1277,8 @@ TEST_CASE("Userfrontend forwarding" * doctest::test_suite("forwarding")) auto backup_consensus = std::make_shared(); network_backup.tables->set_consensus(backup_consensus); + user_frontend_backup.set_consensus_and_history( + backup_consensus.get(), nullptr); auto write_req = create_simple_request(); auto serialized_call = write_req.build_request(); @@ -1305,6 +1321,8 @@ TEST_CASE("Memberfrontend forwarding" * doctest::test_suite("forwarding")) auto primary_consensus = std::make_shared(); network_primary.tables->set_consensus(primary_consensus); + member_frontend_primary.set_consensus_and_history( + primary_consensus.get(), nullptr); auto rpc_responder = std::weak_ptr(); auto rpc_map = std::weak_ptr(); @@ -1314,6 +1332,8 @@ TEST_CASE("Memberfrontend forwarding" * doctest::test_suite("forwarding")) auto backup_consensus = std::make_shared(); network_backup.tables->set_consensus(backup_consensus); + member_frontend_backup.set_consensus_and_history( + backup_consensus.get(), nullptr); auto write_req = create_simple_request(); auto serialized_call = write_req.build_request(); From cd9ec9a36bb99f933b1918a62acf4c47f7ea27e2 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Wed, 18 Mar 2026 13:03:19 +0000 Subject: [PATCH 14/18] Consistently set up hooks before deser --- src/node/node_state.h | 194 +++++++++++++++++++++--------------------- 1 file changed, 95 insertions(+), 99 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index ae21e88d3e32..6a2d06b26e25 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -911,8 +911,8 @@ namespace ccf // Signatures are only emitted on a timer once the public ledger has been // recovered setup_history(); - setup_basic_hooks(); setup_snapshotter(); + setup_basic_hooks(); setup_encryptor(); initiate_quote_generation(); @@ -2931,6 +2931,100 @@ namespace ccf LOG_INFO_FMT("Service open at seqno {}", hook_version); } })); + + // When a node is added, even locally, inform consensus so that it + // can add a new active configuration. + network.tables->set_map_hook( + network.nodes.get_name(), + Nodes::wrap_map_hook( + [](ccf::kv::Version version, const Nodes::Write& w) + -> ccf::kv::ConsensusHookPtr { + return std::make_unique(version, w); + })); + + // Note: The Signatures hook and SerialisedMerkleTree hook are separate + // because the signature and the Merkle tree are recorded in distinct + // tables (for serialisation performance reasons). However here, they are + // expected to always be called together and for the same version as they + // are always written by each signature transaction. + + network.tables->set_map_hook( + network.cose_signatures.get_name(), + CoseSignatures::wrap_map_hook( + [s = this->snapshotter]( + ccf::kv::Version version, + const CoseSignatures::Write& w) -> ccf::kv::ConsensusHookPtr { + assert(w.has_value()); + s->record_cose_signature(version, w.value()); + return {nullptr}; + })); + + network.tables->set_map_hook( + network.serialise_tree.get_name(), + SerialisedMerkleTree::wrap_map_hook( + [s = this->snapshotter]( + ccf::kv::Version version, + const SerialisedMerkleTree::Write& w) -> ccf::kv::ConsensusHookPtr { + assert(w.has_value()); + const auto& tree = w.value(); + s->record_serialised_tree(version, tree); + return {nullptr}; + })); + + network.tables->set_map_hook( + network.snapshot_evidence.get_name(), + SnapshotEvidence::wrap_map_hook( + [s = this->snapshotter]( + ccf::kv::Version version, + const SnapshotEvidence::Write& w) -> ccf::kv::ConsensusHookPtr { + assert(w.has_value()); + auto snapshot_evidence = w.value(); + s->record_snapshot_evidence_idx(version, snapshot_evidence); + return {nullptr}; + })); + + network.tables->set_global_hook( + network.snapshot_evidence.get_name(), + SnapshotEvidence::wrap_commit_hook( + [this]( + [[maybe_unused]] ccf::kv::Version version, + const SnapshotEvidence::Write& w) { + if (!w.has_value()) + { + return; + } + + auto snapshot_evidence = w.value(); + + // If backup snapshot fetching is enabled and this node is a + // backup, schedule a fetch task + if ( + config.snapshots.backup_fetch.enabled && consensus != nullptr && + !consensus->is_primary()) + { + std::lock_guard guard(lock); + if ( + backup_snapshot_fetch_task != nullptr && + !backup_snapshot_fetch_task->is_cancelled()) + { + LOG_DEBUG_FMT( + "Backup snapshot fetch already in progress, skipping"); + } + else + { + LOG_INFO_FMT( + "Snapshot evidence detected on backup - scheduling " + "snapshot fetch from primary (since seqno: {})", + snapshot_evidence.version); + backup_snapshot_fetch_task = + std::make_shared( + config.snapshots, + snapshot_evidence.version - 1 /* YIKES */, + this); + ccf::tasks::add_task(backup_snapshot_fetch_task); + } + } + })); } ccf::kv::Version get_last_recovered_signed_idx() override @@ -3067,104 +3161,6 @@ namespace ccf { fe->set_consensus_and_history(consensus.get(), history.get()); } - - // When a node is added, even locally, inform consensus so that it - // can add a new active configuration. - network.tables->set_map_hook( - network.nodes.get_name(), - Nodes::wrap_map_hook( - [](ccf::kv::Version version, const Nodes::Write& w) - -> ccf::kv::ConsensusHookPtr { - return std::make_unique(version, w); - })); - - // Note: The Signatures hook and SerialisedMerkleTree hook are separate - // because the signature and the Merkle tree are recorded in distinct - // tables (for serialisation performance reasons). However here, they are - // expected to always be called together and for the same version as they - // are always written by each signature transaction. - - network.tables->set_map_hook( - network.cose_signatures.get_name(), - CoseSignatures::wrap_map_hook( - [s = this->snapshotter]( - ccf::kv::Version version, - const CoseSignatures::Write& w) -> ccf::kv::ConsensusHookPtr { - assert(w.has_value()); - s->record_cose_signature(version, w.value()); - return {nullptr}; - })); - - network.tables->set_map_hook( - network.serialise_tree.get_name(), - SerialisedMerkleTree::wrap_map_hook( - [s = this->snapshotter]( - ccf::kv::Version version, - const SerialisedMerkleTree::Write& w) -> ccf::kv::ConsensusHookPtr { - assert(w.has_value()); - const auto& tree = w.value(); - s->record_serialised_tree(version, tree); - return {nullptr}; - })); - - network.tables->set_map_hook( - network.snapshot_evidence.get_name(), - SnapshotEvidence::wrap_map_hook( - [s = this->snapshotter]( - ccf::kv::Version version, - const SnapshotEvidence::Write& w) -> ccf::kv::ConsensusHookPtr { - assert(w.has_value()); - auto snapshot_evidence = w.value(); - s->record_snapshot_evidence_idx(version, snapshot_evidence); - return {nullptr}; - })); - - network.tables->set_global_hook( - network.snapshot_evidence.get_name(), - SnapshotEvidence::wrap_commit_hook( - [this]( - [[maybe_unused]] ccf::kv::Version version, - const SnapshotEvidence::Write& w) { - if (!w.has_value()) - { - return; - } - - auto snapshot_evidence = w.value(); - - // If backup snapshot fetching is enabled and this node is a - // backup, schedule a fetch task - if ( - config.snapshots.backup_fetch.enabled && consensus != nullptr && - !consensus->is_primary()) - { - std::lock_guard guard(lock); - if ( - backup_snapshot_fetch_task != nullptr && - !backup_snapshot_fetch_task->is_cancelled()) - { - LOG_DEBUG_FMT( - "Backup snapshot fetch already in progress, skipping"); - } - else - { - LOG_INFO_FMT( - "Snapshot evidence detected on backup - scheduling " - "snapshot fetch from primary (since seqno: {})", - snapshot_evidence.version); - backup_snapshot_fetch_task = - std::make_shared( - config.snapshots, - snapshot_evidence.version - 1 /* YIKES */, - this); - ccf::tasks::add_task(backup_snapshot_fetch_task); - } - } - })); - - // Note: setup_basic_hooks() is intentionally NOT called here. It is - // called earlier in create(), before snapshot deserialization, to ensure - // global hooks are registered before any data is committed to the maps. } void setup_snapshotter() From 7a9a85c80642f9a76408da96863e9387f26422f5 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Wed, 18 Mar 2026 14:48:13 +0000 Subject: [PATCH 15/18] Avoid nullptr deref on hooks pre-consensus --- src/node/node_state.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 94f8a1682adf..116b535c7fbd 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -1424,10 +1424,15 @@ namespace ccf } ++last_recovered_idx; - // Not synchronised because consensus isn't effectively running then - for (auto& hook : r->get_hooks()) + // Consensus may not exist yet, in which case there's nothing for + // these hooks to do + if (consensus.get()) { - hook->call(consensus.get()); + // Not synchronised because consensus isn't effectively running then + for (auto& hook : r->get_hooks()) + { + hook->call(consensus.get()); + } } } catch (const std::exception& e) From da6ff7d53fd5e3b419098391182684653ce28d40 Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Wed, 29 Apr 2026 10:07:29 +0000 Subject: [PATCH 16/18] tidy --- src/node/node_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 3977827b2880..81b397556b70 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -1487,7 +1487,7 @@ namespace ccf // Consensus may not exist yet, in which case there's nothing for // these hooks to do - if (consensus.get()) + if (consensus != nullptr) { // Not synchronised because consensus isn't effectively running then for (auto& hook : r->get_hooks()) From ab897393ae67b1a6a46a5d8c7ba9bc9a0a325b3d Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Wed, 29 Apr 2026 13:00:15 +0000 Subject: [PATCH 17/18] atomics for consensus and history ownership, narrower lock scope in create, avoid refs to certs --- src/node/node_client.h | 4 +-- src/node/node_state.h | 51 ++++++++++++++++++++++++------------ src/node/rpc/frontend.h | 57 +++++++++++++++++++++++------------------ 3 files changed, 69 insertions(+), 43 deletions(-) diff --git a/src/node/node_client.h b/src/node/node_client.h index cf74b8df37a6..6b5c95901f4b 100644 --- a/src/node/node_client.h +++ b/src/node/node_client.h @@ -14,8 +14,8 @@ namespace ccf protected: std::shared_ptr rpc_map; ccf::crypto::ECKeyPairPtr node_sign_kp; - const ccf::crypto::Pem& self_signed_node_cert; - const std::optional& endorsed_node_cert = std::nullopt; + ccf::crypto::Pem self_signed_node_cert; + std::optional endorsed_node_cert; public: NodeClient( diff --git a/src/node/node_state.h b/src/node/node_state.h index 81b397556b70..86c7206a37d4 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -950,7 +950,6 @@ namespace ccf StartType start_type_, const ccf::StartupConfig& config_) { std::lock_guard guard(lock); - std::lock_guard cert_guard(endorsed_cert_lock); sm.expect(NodeStartupState::initialized); start_type = start_type_; @@ -958,12 +957,23 @@ namespace ccf subject_alt_names = get_subject_alternative_names(); js::register_class_ids(); - self_signed_node_cert = create_self_signed_cert( - node_sign_kp, - config.node_certificate.subject_name, - subject_alt_names, - config.startup_host_time, - config.node_certificate.initial_validity_days); + + // Hold endorsed_cert_lock only while mutating self_signed_node_cert. + // Must be released before initiate_quote_generation(), which may call + // Store::deserialise_snapshot and acquire maps_lock — holding both + // would invert the lock order with KV hooks that acquire + // endorsed_cert_lock under maps_lock. + ccf::crypto::Pem self_signed_cert_snapshot; + { + std::lock_guard cert_guard(endorsed_cert_lock); + self_signed_node_cert = create_self_signed_cert( + node_sign_kp, + config.node_certificate.subject_name, + subject_alt_names, + config.startup_host_time, + config.node_certificate.initial_validity_days); + self_signed_cert_snapshot = self_signed_node_cert; + } accept_node_tls_connections(); open_frontend(ActorsType::nodes); @@ -992,18 +1002,21 @@ namespace ccf history->set_service_signing_identity( network.identity->get_key_pair(), config.cose_signatures); + // No endorsed certificate exists yet on the Start path — it is + // created later by the boot request and applied via the + // node_endorsed_certificates hook. setup_consensus( false, - endorsed_node_cert, + std::nullopt, RaftType::StartupState{RaftType::StartupRole::Primary}); LOG_INFO_FMT("Created new node {}", self); - return {self_signed_node_cert, network.identity->cert}; + return {self_signed_cert_snapshot, network.identity->cert}; } case StartType::Join: { LOG_INFO_FMT("Created join node {}", self); - return {self_signed_node_cert, {}}; + return {self_signed_cert_snapshot, {}}; } case StartType::Recover: { @@ -1024,7 +1037,7 @@ namespace ccf config.initial_service_certificate_validity_days); LOG_INFO_FMT("Created recovery node {}", self); - return {self_signed_node_cert, network.identity->cert}; + return {self_signed_cert_snapshot, network.identity->cert}; } default: { @@ -2855,8 +2868,11 @@ namespace ccf retired_committed_nodes.push_back(node_id); } } - consensus->set_retired_committed( - hook_version, retired_committed_nodes); + if (consensus != nullptr) + { + consensus->set_retired_committed( + hook_version, retired_committed_nodes); + } })); // Service-endorsed certificate is passed to history as early as _local_ @@ -3238,10 +3254,13 @@ namespace ccf auto shared_state = std::make_shared(self); - // Caller must ensure endorsed_cert_lock is held, or that the cert - // fields are stable (e.g. during single-threaded startup). + // endorsed_node_certificate_ is the endorsed cert available at this + // point (from the join response, or nullopt on Start/Recover where it + // arrives later via the node_endorsed_certificates hook). + // self_signed_node_cert is stable here: all callers are either during + // single-threaded startup or holding NodeState::lock. auto node_client = std::make_shared( - rpc_map, node_sign_kp, self_signed_node_cert, endorsed_node_cert); + rpc_map, node_sign_kp, self_signed_node_cert, endorsed_node_certificate_); consensus = std::make_shared( consensus_config, diff --git a/src/node/rpc/frontend.h b/src/node/rpc/frontend.h index cd107db0aa81..6555d13ecda7 100644 --- a/src/node/rpc/frontend.h +++ b/src/node/rpc/frontend.h @@ -25,6 +25,7 @@ #define FMT_HEADER_ONLY +#include #include #include #include @@ -42,9 +43,9 @@ namespace ccf ccf::pal::Mutex open_lock; bool is_open_ = false; - ccf::kv::Consensus* consensus{nullptr}; + std::atomic consensus{nullptr}; std::shared_ptr cmd_forwarder; - ccf::kv::TxHistory* history{nullptr}; + std::atomic history{nullptr}; size_t sig_tx_interval = 5000; std::chrono::milliseconds sig_ms_interval = std::chrono::milliseconds(1000); @@ -108,7 +109,7 @@ namespace ccf const endpoints::EndpointDefinitionPtr& endpoint) { auto interface_id = ctx->get_session_context()->interface_id; - if ((consensus != nullptr) && interface_id) + if ((consensus.load() != nullptr) && interface_id) { if (!node_configuration_subsystem) { @@ -232,7 +233,7 @@ namespace ccf target_node_its; const auto nodes = InternalTablesAccess::get_trusted_nodes(tx); { - const auto primary_id = consensus->primary(); + const auto primary_id = consensus.load()->primary(); if (seeking_primary && primary_id.has_value()) { target_node_its.push_back(nodes.find(primary_id.value())); @@ -300,7 +301,7 @@ namespace ccf case (ccf::endpoints::RedirectionStrategy::ToPrimary): { const bool is_primary = - (consensus != nullptr) && consensus->can_replicate(); + (consensus.load() != nullptr) && consensus.load()->can_replicate(); if (!is_primary) { @@ -335,7 +336,7 @@ namespace ccf case (ccf::endpoints::RedirectionStrategy::ToBackup): { const bool is_backup = - (consensus != nullptr) && !consensus->can_replicate(); + (consensus.load() != nullptr) && !consensus.load()->can_replicate(); if (!is_backup) { @@ -405,9 +406,10 @@ namespace ccf bool check_session_consistency(std::shared_ptr ctx) { - if (consensus != nullptr) + auto c = consensus.load(); + if (c != nullptr) { - auto current_view = consensus->get_view(); + auto current_view = c->get_view(); auto session_ctx = ctx->get_session_context(); if (!session_ctx->active_view.has_value()) { @@ -528,7 +530,7 @@ namespace ccf return; } - if (!cmd_forwarder || (consensus == nullptr)) + if (!cmd_forwarder || (consensus.load() == nullptr)) { ctx->set_error( HTTP_STATUS_INTERNAL_SERVER_ERROR, @@ -557,7 +559,7 @@ namespace ccf return; } - auto primary_id = consensus->primary(); + auto primary_id = consensus.load()->primary(); if (!primary_id.has_value()) { ctx->set_error( @@ -637,11 +639,12 @@ namespace ccf constexpr auto max_attempts = 30; while (attempts < max_attempts) { - if (consensus != nullptr) + auto c = consensus.load(); + if (c != nullptr) { if ( endpoints.apply_uncommitted_tx_backpressure() && - consensus->is_at_max_capacity()) + c->is_at_max_capacity()) { ctx->set_error( HTTP_STATUS_SERVICE_UNAVAILABLE, @@ -707,9 +710,10 @@ namespace ccf } else { + auto c2 = consensus.load(); bool is_primary = - (consensus == nullptr) || consensus->can_replicate(); - const bool forwardable = (consensus != nullptr); + (c2 == nullptr) || c2->can_replicate(); + const bool forwardable = (c2 != nullptr); if (!is_primary && forwardable) { @@ -822,7 +826,7 @@ namespace ccf case ccf::kv::CommitResult::SUCCESS: { auto tx_id_opt = tx.get_txid(); - if (tx_id_opt.has_value() && consensus != nullptr) + if (tx_id_opt.has_value() && consensus.load() != nullptr) { ccf::TxID tx_id = tx_id_opt.value(); @@ -872,11 +876,13 @@ namespace ccf } } - if ( - consensus != nullptr && consensus->can_replicate() && - history != nullptr) { - history->try_emit_signature(); + auto c3 = consensus.load(); + auto h = history.load(); + if (c3 != nullptr && c3->can_replicate() && h != nullptr) + { + h->try_emit_signature(); + } } return; @@ -1002,11 +1008,11 @@ namespace ccf void set_consensus_and_history( ccf::kv::Consensus* consensus_, ccf::kv::TxHistory* history_) override { - consensus = consensus_; - endpoints.set_consensus(consensus); + consensus.store(consensus_); + endpoints.set_consensus(consensus_); - history = history_; - endpoints.set_history(history); + history.store(history_); + endpoints.set_history(history_); } bool is_open() override @@ -1020,14 +1026,15 @@ namespace ccf { if (endpoints.request_needs_root(ctx)) { - if (history != nullptr) + auto h = history.load(); + if (h != nullptr) { // Warning: Retrieving the current TxID and root from the history // should only ever be used for the proposal creation endpoint and // nothing else. Many bad things could happen otherwise (e.g. breaking // session consistency). const auto& [txid, root, term_of_next_version] = - history->get_replicated_state_txid_and_root(); + h->get_replicated_state_txid_and_root(); tx.set_read_txid(txid, term_of_next_version); tx.set_root_at_read_version(root); } From b7fddf5c207f8a91e9c3dabe028d2eaa5268dd1b Mon Sep 17 00:00:00 2001 From: Eddy Ashton Date: Wed, 29 Apr 2026 13:42:10 +0000 Subject: [PATCH 18/18] Tidy and format --- src/node/node_client.h | 8 ++++---- src/node/node_state.h | 5 ++++- src/node/rpc/frontend.h | 17 +++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/node/node_client.h b/src/node/node_client.h index 6b5c95901f4b..31bfa243b0c9 100644 --- a/src/node/node_client.h +++ b/src/node/node_client.h @@ -21,12 +21,12 @@ namespace ccf NodeClient( std::shared_ptr rpc_map_, ccf::crypto::ECKeyPairPtr node_sign_kp_, - const ccf::crypto::Pem& self_signed_node_cert_, - const std::optional& endorsed_node_cert_) : + ccf::crypto::Pem self_signed_node_cert_, + std::optional endorsed_node_cert_) : rpc_map(std::move(rpc_map_)), node_sign_kp(std::move(node_sign_kp_)), - self_signed_node_cert(self_signed_node_cert_), - endorsed_node_cert(endorsed_node_cert_) + self_signed_node_cert(std::move(self_signed_node_cert_)), + endorsed_node_cert(std::move(endorsed_node_cert_)) {} virtual ~NodeClient() = default; diff --git a/src/node/node_state.h b/src/node/node_state.h index 86c7206a37d4..da952eeb4d65 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -3260,7 +3260,10 @@ namespace ccf // self_signed_node_cert is stable here: all callers are either during // single-threaded startup or holding NodeState::lock. auto node_client = std::make_shared( - rpc_map, node_sign_kp, self_signed_node_cert, endorsed_node_certificate_); + rpc_map, + node_sign_kp, + self_signed_node_cert, + endorsed_node_certificate_); consensus = std::make_shared( consensus_config, diff --git a/src/node/rpc/frontend.h b/src/node/rpc/frontend.h index 6555d13ecda7..b67e4b75953e 100644 --- a/src/node/rpc/frontend.h +++ b/src/node/rpc/frontend.h @@ -406,7 +406,7 @@ namespace ccf bool check_session_consistency(std::shared_ptr ctx) { - auto c = consensus.load(); + auto* c = consensus.load(); if (c != nullptr) { auto current_view = c->get_view(); @@ -639,7 +639,7 @@ namespace ccf constexpr auto max_attempts = 30; while (attempts < max_attempts) { - auto c = consensus.load(); + auto* c = consensus.load(); if (c != nullptr) { if ( @@ -710,10 +710,8 @@ namespace ccf } else { - auto c2 = consensus.load(); - bool is_primary = - (c2 == nullptr) || c2->can_replicate(); - const bool forwardable = (c2 != nullptr); + bool is_primary = (c == nullptr) || c->can_replicate(); + const bool forwardable = (c != nullptr); if (!is_primary && forwardable) { @@ -877,9 +875,8 @@ namespace ccf } { - auto c3 = consensus.load(); - auto h = history.load(); - if (c3 != nullptr && c3->can_replicate() && h != nullptr) + auto* h = history.load(); + if (c != nullptr && c->can_replicate() && h != nullptr) { h->try_emit_signature(); } @@ -1026,7 +1023,7 @@ namespace ccf { if (endpoints.request_needs_root(ctx)) { - auto h = history.load(); + auto* h = history.load(); if (h != nullptr) { // Warning: Retrieving the current TxID and root from the history