Skip to content

Commit b871b34

Browse files
committed
feat: cherry-picked commits about HA from apache-ratis branch
1 parent 6b93231 commit b871b34

13 files changed

Lines changed: 234 additions & 48 deletions

File tree

engine/src/main/java/com/arcadedb/GlobalConfiguration.java

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -526,9 +526,16 @@ Enable diagnostic logging during vector graph build progress (heap/off-heap memo
526526
"Example: localhost:2434:2480:10,192.168.0.1:2434:2480:0",
527527
String.class, ""),
528528

529+
HA_SERVER_ROLE("arcadedb.ha.serverRole", SCOPE.SERVER,
530+
"Enforces a role in a cluster. 'any' (default) means this node can be elected leader. "
531+
+ "'replica' sets the Raft peer priority to 0 so the node is never elected leader "
532+
+ "(useful for read-scale or witness deployments).",
533+
String.class, "any", Set.of("any", "replica")),
534+
529535
HA_QUORUM("arcadedb.ha.quorum", SCOPE.SERVER,
530-
"Default quorum between 'none', 'one', 'two', 'three', 'majority' and 'all' servers. Default is majority", String.class, "majority",
531-
Set.of("none", "one", "two", "three", "majority", "all")),
536+
"Write quorum: 'majority' (standard Raft, default) or 'all' (every configured peer must acknowledge). "
537+
+ "Legacy values 'none', 'one', 'two', 'three' are no longer supported.",
538+
String.class, "majority", Set.of("majority", "all")),
532539

533540
HA_QUORUM_TIMEOUT("arcadedb.ha.quorumTimeout", SCOPE.SERVER, "Timeout waiting for the quorum", Long.class, 10000),
534541

@@ -544,6 +551,21 @@ Enable diagnostic logging during vector graph build progress (heap/off-heap memo
544551
HA_APPEND_BUFFER_SIZE("arcadedb.ha.appendBufferSize", SCOPE.SERVER,
545552
"AppendEntries batch byte limit for replication (e.g. '4MB')", String.class, "4MB"),
546553

554+
HA_WRITE_BUFFER_SIZE("arcadedb.ha.writeBufferSize", SCOPE.SERVER,
555+
"Raft log write buffer size (e.g. '8MB'). Must be at least appendBufferSize + 8 bytes, "
556+
+ "otherwise the server fails to start with ConfigurationException.",
557+
String.class, "8MB"),
558+
559+
HA_LOG_PURGE_GAP("arcadedb.ha.logPurgeGap", SCOPE.SERVER,
560+
"Number of Raft log entries retained after a snapshot as a buffer for slightly lagging followers. "
561+
+ "Lower values free disk faster but raise the chance a slow follower needs a full snapshot resync.",
562+
Integer.class, 1024),
563+
564+
HA_LOG_PURGE_UPTO_SNAPSHOT("arcadedb.ha.logPurgeUptoSnapshot", SCOPE.SERVER,
565+
"When true (default), deletes old Raft log segments after each snapshot to bound disk growth. "
566+
+ "Set to false to retain full log history for debugging/auditing.",
567+
Boolean.class, true),
568+
547569
HA_REPLICATION_CHUNK_MAXSIZE("arcadedb.ha.replicationChunkMaxSize", SCOPE.SERVER,
548570
"Maximum channel chunk size for replicating messages between servers. Default is 16777216", Integer.class, 16384 * 1024),
549571

@@ -574,19 +596,30 @@ Enable diagnostic logging during vector graph build progress (heap/off-heap memo
574596
"set to true for durable deployments.",
575597
Boolean.class, false),
576598

577-
HA_RAFT_SNAPSHOT_THRESHOLD("arcadedb.ha.raftSnapshotThreshold", SCOPE.SERVER,
578-
"Number of Raft log entries after which the leader automatically takes a snapshot. " +
579-
"Lower values cause more frequent snapshots and earlier log compaction.",
580-
Long.class, 10000L),
599+
HA_SNAPSHOT_THRESHOLD("arcadedb.ha.snapshotThreshold", SCOPE.SERVER,
600+
"Number of Raft log entries after which the leader automatically takes a snapshot. "
601+
+ "Lower values cause more frequent snapshots and earlier log compaction.",
602+
Long.class, 100_000L),
581603

582604
HA_LOG_VERBOSE("arcadedb.ha.logVerbose", SCOPE.SERVER,
583605
"HA verbose logging level: 0=off, 1=basic (elections, leader changes), 2=detailed (replication, forwarding), 3=trace (every state machine apply)",
584606
Integer.class, 0),
585607

586-
HA_RAFT_GROUP_COMMIT_BATCH_SIZE("arcadedb.ha.raftGroupCommitBatchSize", SCOPE.SERVER,
587-
"Maximum number of Raft log entries to batch in a single group commit flush. Higher values improve throughput under concurrent load.",
608+
HA_GROUP_COMMIT_BATCH_SIZE("arcadedb.ha.groupCommitBatchSize", SCOPE.SERVER,
609+
"Maximum number of Raft log entries to batch in a single group commit flush. "
610+
+ "Higher values improve throughput under concurrent load.",
588611
Integer.class, 500),
589612

613+
HA_GROUP_COMMIT_QUEUE_SIZE("arcadedb.ha.groupCommitQueueSize", SCOPE.SERVER,
614+
"Maximum pending transactions allowed in the Raft group-commit queue. "
615+
+ "When the queue is full, the server applies backpressure by throwing ReplicationQueueFullException "
616+
+ "(a NeedRetryException that clients can retry).",
617+
Integer.class, 10_000),
618+
619+
HA_GROUP_COMMIT_OFFER_TIMEOUT("arcadedb.ha.groupCommitOfferTimeout", SCOPE.SERVER,
620+
"Timeout in ms waiting for space in the group-commit queue before throwing ReplicationQueueFullException.",
621+
Integer.class, 100),
622+
590623
HA_CLUSTER_TOKEN("arcadedb.ha.clusterToken", SCOPE.SERVER,
591624
"Shared secret for inter-node request forwarding authentication. " +
592625
"Must be identical on all cluster nodes. " +
@@ -605,6 +638,10 @@ Enable diagnostic logging during vector graph build progress (heap/off-heap memo
605638
"Maximum number of concurrent snapshot downloads served by the leader. Requests over this limit receive HTTP 503.",
606639
Integer.class, 2),
607640

641+
HA_SNAPSHOT_DOWNLOAD_TIMEOUT("arcadedb.ha.snapshotDownloadTimeout", SCOPE.SERVER,
642+
"Read timeout in ms for downloading a database snapshot from the leader during follower resync.",
643+
Integer.class, 300_000),
644+
608645
HA_SNAPSHOT_INSTALL_RETRIES("arcadedb.ha.snapshotInstallRetries", SCOPE.SERVER,
609646
"Maximum retry attempts for snapshot download from the leader during snapshot installation.",
610647
Integer.class, 3),
@@ -633,10 +670,17 @@ Enable diagnostic logging during vector graph build progress (heap/off-heap memo
633670
"Delay in milliseconds between RemoteDatabase election retries.",
634671
Long.class, 2000L),
635672

673+
HA_RATIS_RESTART_MAX_RETRIES("arcadedb.ha.ratisRestartMaxRetries", SCOPE.SERVER,
674+
"Maximum consecutive Ratis restart attempts by the health monitor before the server shuts down "
675+
+ "for cluster-level recovery. Raise when partition-recovery scenarios cause legitimate rapid restarts.",
676+
Integer.class, 10),
677+
636678
HA_STOP_SERVER_ON_REPLICATION_FAILURE("arcadedb.ha.stopServerOnReplicationFailure", SCOPE.SERVER,
637-
"If true, stops the JVM after exhausting step-down retries on a phase-2 replication failure. "
638-
+ "If false, logs CRITICAL but leaves the server running (useful for debugging).",
639-
Boolean.class, true),
679+
"After a phase-2 local commit fails on the leader while followers have applied the entry, step-down "
680+
+ "is attempted first. If every step-down fails and this flag is true, the JVM exits so an "
681+
+ "orchestrator can restart and let Raft log replay correct the state. "
682+
+ "Default is false: the server keeps running and logs CRITICAL, useful for debugging without an orchestrator.",
683+
Boolean.class, false),
640684

641685
HA_SNAPSHOT_WRITE_TIMEOUT("arcadedb.ha.snapshotWriteTimeout", SCOPE.SERVER,
642686
"Timeout in milliseconds for writing a snapshot to a follower. "
@@ -664,6 +708,12 @@ Enable diagnostic logging during vector graph build progress (heap/off-heap memo
664708
"Maximum number of entries in the HTTP idempotency cache. Oldest entry is evicted when full.",
665709
Integer.class, 10_000),
666710

711+
HA_PEER_ALLOWLIST_ENABLED("arcadedb.ha.peerAllowlist.enabled", SCOPE.SERVER,
712+
"Reject inbound Raft gRPC connections whose remote address does not resolve to a host in "
713+
+ "arcadedb.ha.serverList. Loopback is always allowed. Does not provide peer identity or encryption: "
714+
+ "use mTLS on untrusted networks.",
715+
Boolean.class, true),
716+
667717
HA_GRPC_ALLOWLIST_REFRESH_MS("arcadedb.ha.grpcAllowlistRefreshMs", SCOPE.SERVER,
668718
"Rate-limiting interval in milliseconds for DNS re-resolution in the gRPC peer address allowlist filter.",
669719
Long.class, 30_000L),

ha-raft/src/main/java/com/arcadedb/server/ha/raft/RaftGroupCommitter.java

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import com.arcadedb.log.LogManager;
2222
import com.arcadedb.network.binary.QuorumNotReachedException;
23+
import com.arcadedb.network.binary.ReplicationQueueFullException;
2324
import org.apache.ratis.client.RaftClient;
2425
import org.apache.ratis.proto.RaftProtos;
2526
import org.apache.ratis.protocol.Message;
@@ -41,24 +42,32 @@
4142
*/
4243
class RaftGroupCommitter {
4344

44-
private final RaftClient raftClient;
45-
private final Quorum quorum;
46-
private final long quorumTimeout;
47-
private final int maxBatchSize;
48-
private final LinkedBlockingQueue<CancellablePendingEntry> queue = new LinkedBlockingQueue<>();
49-
private final Thread flusher;
50-
private volatile boolean running = true;
45+
private final RaftClient raftClient;
46+
private final Quorum quorum;
47+
private final long quorumTimeout;
48+
private final int maxBatchSize;
49+
private final int offerTimeoutMs;
50+
private final LinkedBlockingQueue<CancellablePendingEntry> queue;
51+
private final Thread flusher;
52+
private volatile boolean running = true;
5153

5254
RaftGroupCommitter(final RaftClient raftClient, final Quorum quorum, final long quorumTimeout) {
53-
this(raftClient, quorum, quorumTimeout, 500);
55+
this(raftClient, quorum, quorumTimeout, 500, 10_000, 100);
5456
}
5557

5658
RaftGroupCommitter(final RaftClient raftClient, final Quorum quorum, final long quorumTimeout,
5759
final int maxBatchSize) {
60+
this(raftClient, quorum, quorumTimeout, maxBatchSize, 10_000, 100);
61+
}
62+
63+
RaftGroupCommitter(final RaftClient raftClient, final Quorum quorum, final long quorumTimeout,
64+
final int maxBatchSize, final int maxQueueSize, final int offerTimeoutMs) {
5865
this.raftClient = raftClient;
5966
this.quorum = quorum;
6067
this.quorumTimeout = quorumTimeout;
6168
this.maxBatchSize = maxBatchSize;
69+
this.offerTimeoutMs = offerTimeoutMs;
70+
this.queue = new LinkedBlockingQueue<>(maxQueueSize);
6271
this.flusher = new Thread(this::flushLoop, "arcadedb-raft-group-committer");
6372
this.flusher.setDaemon(true);
6473
this.flusher.start();
@@ -67,7 +76,15 @@ class RaftGroupCommitter {
6776
void submitAndWait(final byte[] entry) {
6877
final long timeoutMs = 2 * quorumTimeout;
6978
final CancellablePendingEntry pending = new CancellablePendingEntry(entry);
70-
queue.add(pending);
79+
try {
80+
if (!queue.offer(pending, offerTimeoutMs, TimeUnit.MILLISECONDS))
81+
throw new ReplicationQueueFullException(
82+
"Replication queue is full (" + queue.remainingCapacity() + " remaining of " + (queue.size()
83+
+ queue.remainingCapacity()) + " max). Server is overloaded, retry later");
84+
} catch (final InterruptedException e) {
85+
Thread.currentThread().interrupt();
86+
throw new ReplicationQueueFullException("Interrupted while waiting for replication queue space");
87+
}
7188

7289
try {
7390
final Exception error = pending.future.get(timeoutMs, TimeUnit.MILLISECONDS);

ha-raft/src/main/java/com/arcadedb/server/ha/raft/RaftHAServer.java

Lines changed: 63 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ public class RaftHAServer implements HealthMonitor.HealthTarget {
106106
private volatile LifeCycle.State forcedStateForTesting = null;
107107
private HealthMonitor healthMonitor;
108108
private ClusterTokenProvider tokenProvider;
109+
private volatile int restartFailureCount = 0;
109110

110111
public RaftHAServer(final ArcadeDBServer arcadeServer, final ContextConfiguration configuration) {
111112
this.arcadeServer = arcadeServer;
@@ -118,11 +119,28 @@ public RaftHAServer(final ArcadeDBServer arcadeServer, final ContextConfiguratio
118119
final int raftPort = configuration.getValueAsInteger(GlobalConfiguration.HA_RAFT_PORT);
119120

120121
final RaftPeerAddressResolver.ParsedPeerList parsed = RaftPeerAddressResolver.parsePeerList(serverList, raftPort);
121-
final List<RaftPeer> peers = parsed.peers();
122+
List<RaftPeer> peers = parsed.peers();
122123
final String serverName = arcadeServer.getServerName();
123124

124125
this.httpAddresses.putAll(parsed.httpAddresses());
125126
this.localPeerId = RaftPeerAddressResolver.findLocalPeerId(peers, serverName, arcadeServer);
127+
128+
// If this node is configured as a replica, override its Raft peer priority to 0
129+
// so Ratis never elects it as leader (useful for read-scale or witness nodes).
130+
final String serverRole = configuration.getValueAsString(GlobalConfiguration.HA_SERVER_ROLE);
131+
if ("replica".equalsIgnoreCase(serverRole)) {
132+
final List<RaftPeer> rebuilt = new ArrayList<>(peers.size());
133+
for (final RaftPeer p : peers) {
134+
if (p.getId().equals(localPeerId)) {
135+
rebuilt.add(RaftPeer.newBuilder().setId(p.getId()).setAddress(p.getAddress()).setPriority(0).build());
136+
LogManager.instance().log(this, Level.INFO,
137+
"Node configured as replica (priority=0, will not become leader): %s", localPeerId);
138+
} else
139+
rebuilt.add(p);
140+
}
141+
peers = Collections.unmodifiableList(rebuilt);
142+
}
143+
126144
this.raftGroup = RaftGroup.valueOf(
127145
RaftGroupId.valueOf(UUID.nameUUIDFromBytes(clusterName.getBytes(StandardCharsets.UTF_8))),
128146
peers);
@@ -232,8 +250,10 @@ public void start() throws IOException {
232250
LogManager.instance()
233251
.log(this, Level.INFO, "Raft cluster joined: %d nodes %s", peerDisplayNames.size(), peerDisplayNames.values());
234252

235-
final int batchSize = configuration.getValueAsInteger(GlobalConfiguration.HA_RAFT_GROUP_COMMIT_BATCH_SIZE);
236-
transactionBroker = new RaftTransactionBroker(raftClient, quorum, quorumTimeout, batchSize);
253+
final int batchSize = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_BATCH_SIZE);
254+
final int queueSize = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_QUEUE_SIZE);
255+
final int offerTimeout = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_OFFER_TIMEOUT);
256+
transactionBroker = new RaftTransactionBroker(raftClient, quorum, quorumTimeout, batchSize, queueSize, offerTimeout);
237257

238258
// K8s auto-join: if running in Kubernetes with no existing storage, try to join an existing cluster
239259
if (configuration.getValueAsBoolean(GlobalConfiguration.HA_K8S) && !hadExistingStorage)
@@ -287,6 +307,19 @@ public void restartRatisIfNeeded() {
287307
return;
288308
}
289309

310+
final int maxRetries = configuration.getValueAsInteger(GlobalConfiguration.HA_RATIS_RESTART_MAX_RETRIES);
311+
if (restartFailureCount >= maxRetries) {
312+
LogManager.instance().log(this, Level.SEVERE,
313+
"Ratis restart failed %d consecutive times (max=%d). Stopping server for cluster-level recovery",
314+
restartFailureCount, maxRetries);
315+
final Thread stopThread = new Thread(() -> {
316+
try { arcadeServer.stop(); } catch (final Exception ignored) {}
317+
}, "arcadedb-restart-failure-stop");
318+
stopThread.setDaemon(true);
319+
stopThread.start();
320+
return;
321+
}
322+
290323
final RaftClient oldClient = this.raftClient;
291324
final RaftServer oldServer = this.raftServer;
292325
final RaftTransactionBroker oldBroker = this.transactionBroker;
@@ -330,12 +363,21 @@ public void restartRatisIfNeeded() {
330363
this.raftProperties = properties;
331364
this.raftClient = buildRaftClient(raftGroup, properties);
332365

333-
final int batchSize = configuration.getValueAsInteger(GlobalConfiguration.HA_RAFT_GROUP_COMMIT_BATCH_SIZE);
334-
this.transactionBroker = new RaftTransactionBroker(raftClient, quorum, quorumTimeout, batchSize);
366+
final int batchSize = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_BATCH_SIZE);
367+
final int queueSize = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_QUEUE_SIZE);
368+
final int offerTimeout = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_OFFER_TIMEOUT);
369+
this.transactionBroker = new RaftTransactionBroker(raftClient, quorum, quorumTimeout, batchSize, queueSize,
370+
offerTimeout);
335371

372+
restartFailureCount = 0;
336373
HALog.log(this, HALog.BASIC, "Ratis recovered successfully");
337374
} catch (final Throwable t) {
338-
LogManager.instance().log(this, Level.SEVERE, "HealthMonitor recovery failed: %s", t, t.getMessage());
375+
restartFailureCount++;
376+
LogManager.instance().log(this, Level.SEVERE,
377+
"HealthMonitor recovery failed (attempt %d/%d): %s",
378+
t, restartFailureCount,
379+
configuration.getValueAsInteger(GlobalConfiguration.HA_RATIS_RESTART_MAX_RETRIES),
380+
t.getMessage());
339381
}
340382
}
341383
}
@@ -491,9 +533,12 @@ public synchronized void refreshRaftClient(final RaftPeerId knownLeaderId) {
491533
raftClient = buildRaftClient(raftGroup, raftProperties, knownLeaderId);
492534

493535
if (transactionBroker != null) {
494-
final int batchSize = configuration.getValueAsInteger(GlobalConfiguration.HA_RAFT_GROUP_COMMIT_BATCH_SIZE);
536+
final int batchSize = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_BATCH_SIZE);
537+
final int queueSize = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_QUEUE_SIZE);
538+
final int offerTimeout = configuration.getValueAsInteger(GlobalConfiguration.HA_GROUP_COMMIT_OFFER_TIMEOUT);
495539
final RaftTransactionBroker oldBroker = transactionBroker;
496-
transactionBroker = new RaftTransactionBroker(raftClient, quorum, quorumTimeout, batchSize);
540+
transactionBroker = new RaftTransactionBroker(raftClient, quorum, quorumTimeout, batchSize, queueSize,
541+
offerTimeout);
497542
oldBroker.stop();
498543
}
499544

@@ -917,14 +962,20 @@ void stopLagMonitor() {
917962
* connections from IPs not listed in {@code arcadedb.ha.serverList}.
918963
*/
919964
private static Parameters buildParameters(final ContextConfiguration configuration) {
965+
final Parameters parameters = new Parameters();
966+
if (!configuration.getValueAsBoolean(GlobalConfiguration.HA_PEER_ALLOWLIST_ENABLED))
967+
return parameters;
968+
920969
final String serverList = configuration.getValueAsString(GlobalConfiguration.HA_SERVER_LIST);
921970
final long refreshMs = configuration.getValueAsLong(GlobalConfiguration.HA_GRPC_ALLOWLIST_REFRESH_MS);
922971
final List<String> peerHosts = PeerAddressAllowlistFilter.extractPeerHosts(serverList);
923-
final Parameters parameters = new Parameters();
924-
if (!peerHosts.isEmpty()) {
925-
final PeerAddressAllowlistFilter allowlistFilter = new PeerAddressAllowlistFilter(peerHosts, refreshMs);
926-
GrpcConfigKeys.Server.setServicesCustomizer(parameters, new RaftGrpcServicesCustomizer(allowlistFilter));
972+
if (peerHosts.isEmpty()) {
973+
LogManager.instance().log(RaftHAServer.class, Level.WARNING,
974+
"arcadedb.ha.peerAllowlist.enabled=true but arcadedb.ha.serverList is empty; allowlist not installed");
975+
return parameters;
927976
}
977+
final PeerAddressAllowlistFilter allowlistFilter = new PeerAddressAllowlistFilter(peerHosts, refreshMs);
978+
GrpcConfigKeys.Server.setServicesCustomizer(parameters, new RaftGrpcServicesCustomizer(allowlistFilter));
928979
return parameters;
929980
}
930981

0 commit comments

Comments
 (0)