Skip to content

Commit 588560a

Browse files
Julian Priestley authored and meta-codesync[bot] committed
Back off when hitting concurrency limit
Summary: If we hit the concurrency limit, all the free worker jobs spin dequeuing and requeuing jobs - let's add a bit more backoff in this case. Reviewed By: YousefSalama Differential Revision: D103868949 fbshipit-source-id: 85ef88fd2832e5b7d44df55cc74aad69620d1800
1 parent 403ce7c commit 588560a

3 files changed

Lines changed: 22 additions & 4 deletions

File tree

eden/mononoke/features/async_requests/worker_lib/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ rust_library(
1818
"fbsource//third-party/rust:async-stream",
1919
"fbsource//third-party/rust:async-trait",
2020
"fbsource//third-party/rust:futures",
21+
"fbsource//third-party/rust:rand",
2122
"fbsource//third-party/rust:tokio",
2223
"fbsource//third-party/rust:tracing",
2324
"//common/rust/shed/cloned:cloned",

eden/mononoke/features/async_requests/worker_lib/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ mononoke_api = { version = "0.1.0", path = "../../../mononoke_api" }
3131
mononoke_app = { version = "0.1.0", path = "../../../cmdlib/mononoke_app" }
3232
mononoke_macros = { version = "0.1.0", path = "../../../common/mononoke_macros" }
3333
mononoke_types = { version = "0.1.0", path = "../../../mononoke_types" }
34+
rand = "0.10.1"
3435
repo_authorization = { version = "0.1.0", path = "../../../repo_authorization" }
3536
repo_blobstore = { version = "0.1.0", path = "../../../repo_attributes/repo_blobstore" }
3637
repo_derived_data = { version = "0.1.0", path = "../../../repo_attributes/repo_derived_data" }

eden/mononoke/features/async_requests/worker_lib/src/worker.rs

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ use mononoke_app::MononokeReposManager;
5050
use mononoke_macros::mononoke;
5151
use mononoke_types::RepositoryId;
5252
use mononoke_types::Timestamp;
53+
use rand::RngExt as _;
5354
use stats::define_stats;
5455
use stats::prelude::*;
5556
use tracing::debug;
@@ -95,6 +96,8 @@ const DEQUEUE_STREAM_SLEEP_TIME: u64 = 1000;
9596
// if it hasn't updated inprogress timestamp
9697
const ABANDONED_REQUEST_THRESHOLD_SECS: i64 = 5 * 60;
9798
const KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(10);
99+
const CONCURRENCY_LIMIT_BACKOFF_BASE: Duration = Duration::from_secs(15);
100+
const CONCURRENCY_LIMIT_BACKOFF_MAX_JITTER_SECS: u64 = 15;
98101

99102
define_stats! {
100103
prefix = "async_requests.worker";
@@ -373,6 +376,11 @@ impl AsyncMethodRequestWorker {
373376
release_ondemand_repo_impl(repo_id, &self.ondemand_repo_refs, &self.repos_mgr);
374377
}
375378

379+
/// Computes how long to wait before requeueing a request that was rejected
/// because its concurrency limit was reached.
///
/// The result is `CONCURRENCY_LIMIT_BACKOFF_BASE` plus a random whole number
/// of seconds drawn from `0..=CONCURRENCY_LIMIT_BACKOFF_MAX_JITTER_SECS`
/// (inclusive), i.e. between 15 and 30 seconds with the constants declared in
/// this file. Jitter is applied at one-second granularity.
///
/// NOTE(review): the jitter is presumably there so that workers throttled at
/// the same time do not all requeue at the same instant — confirm intent.
fn concurrency_limit_backoff() -> Duration {
380+
let jitter_secs = rand::rng().random_range(0..=CONCURRENCY_LIMIT_BACKOFF_MAX_JITTER_SECS);
381+
CONCURRENCY_LIMIT_BACKOFF_BASE + Duration::from_secs(jitter_secs)
382+
}
383+
376384
/// Params into stored response. Doesn't mark it as "in progress" (as this is done during dequeueing).
377385
/// Returns true if the result was successfully stored. Returns false if we
378386
/// lost the race (the request table was updated).
@@ -415,17 +423,23 @@ impl AsyncMethodRequestWorker {
415423
root_request_id,
416424
created_by.as_deref(),
417425
);
418-
log_start(&ctx);
419426

420427
// Check concurrency limit for this request type. If exceeded,
421-
// requeue so another worker can try later when capacity frees up.
428+
// hold the claim briefly before requeueing so the same hot request
429+
// does not immediately churn across the whole worker fleet.
422430
match self.queue.concurrency_limit_reached(&ctx, &req_id.1).await {
423431
Ok(true) => {
424432
let row_id = req_id.0;
433+
let backoff = Self::concurrency_limit_backoff();
425434
info!(
426-
"[{}] Concurrency limit reached for {}, requeuing",
427-
&row_id, &req_id.1.0,
435+
"[{}] Concurrency limit reached for {}, backing off for {:?}",
436+
&row_id, &req_id.1.0, backoff,
428437
);
438+
ctx.scuba()
439+
.clone()
440+
.add("backoff_ms", backoff.as_millis() as i64)
441+
.log_with_msg("Request throttled by concurrency limit", None);
442+
tokio::time::sleep(backoff).await;
429443
if let Err(requeue_err) = self.queue.requeue(&ctx, req_id).await {
430444
error!(
431445
"[{}] Failed to requeue request after concurrency limit: {:?}",
@@ -441,6 +455,8 @@ impl AsyncMethodRequestWorker {
441455
_ => {}
442456
}
443457

458+
log_start(&ctx);
459+
444460
// Save refs for cleanup after self is partially moved.
445461
let ondemand_repo_refs = self.ondemand_repo_refs.clone();
446462
let repos_mgr_for_cleanup = self.repos_mgr.clone();

0 commit comments

Comments
 (0)