From 40215cbf63045679e6014738735eb28ab84f6e82 Mon Sep 17 00:00:00 2001 From: ai-hpc Date: Mon, 8 Jun 2026 15:15:08 +0000 Subject: [PATCH] feat: persistent baseline cache (CACHEON_BASELINE_CACHE_DIR) Adds opt-in BaselineCache persistence that saves/loads the vLLM baseline results to disk, eliminating the 12-min baseline container on repeated runs with the same block_hash + baseline_digest + prompt version. - validator/config.py: add CACHEON_BASELINE_CACHE_DIR env var - validator/gpu_eval.py: load cache before run_baseline(), save after - tests/baseline_cache/README.md: format docs + usage examples --- tests/baseline_cache/README.md | 71 ++++++++++++++++++++++++++++ validator/config.py | 8 ++++ validator/gpu_eval.py | 84 +++++++++++++++++++++++++--------- 3 files changed, 141 insertions(+), 22 deletions(-) create mode 100644 tests/baseline_cache/README.md diff --git a/tests/baseline_cache/README.md b/tests/baseline_cache/README.md new file mode 100644 index 0000000..ca1ff2a --- /dev/null +++ b/tests/baseline_cache/README.md @@ -0,0 +1,71 @@ +# Baseline Cache + +Pre-computed baseline eval results for skipping the 12-minute vLLM v0.22.0 container during local development and CI. 
+ +## How it works + +`CACHEON_BASELINE_CACHE_DIR` (added to `validator/config.py`) enables persistent baseline caching: + +- On **first run** with a given `(block_hash, baseline_digest, PROMPT_ENGINE_VERSION)`: + - The vLLM baseline container runs as normal + - `BaselineCache` is serialized to `{cache_dir}/{cache_key}.json` +- On **subsequent runs** with the same key: the JSON is loaded directly, skipping the container entirely (~12 min saved) + +## Cache key derivation + +```python +raw = f"{block_hash}:{baseline_digest}:v{PROMPT_ENGINE_VERSION}:eval" +cache_key = sha256(raw.encode()).hexdigest()[:16] +``` + +## File format + +Each `.json` file is the output of `BaselineCache.to_dict()`: + +```json +{ + "cache_key": "<16-char hex>", + "results": [ + { + "tokens": ["▁Hello", "▁world", ...], + "top_logprobs": [[{"▁Hello": -0.12, ...}], ...], + "ttft_s": 0.195, + "throughput_tps": 92.3, + "output_tokens": 512, + "decode_elapsed_secs": [0.011, 0.012, ...] + }, + ... + ] +} +``` + +## Generating a cache file + +Run a full validator eval with `--baseline-cache-dir` pointed here: + +```bash +python3 scripts/run_validator_eval.py \ + --block-hash 0xdeadbeef1234567890abcdef \ + --miner-image <miner-image> \ + --model-volume /path/to/Qwen2.5-72B-Instruct \ + --gpu-count 8 \ + --state-dir /tmp/state-eval \ + --baseline-cache-dir /path/to/cacheon/tests/baseline_cache +``` + +The file `tests/baseline_cache/<cache_key>.json` will be written after the baseline completes. + +## Using the cache + +```bash +export CACHEON_BASELINE_CACHE_DIR=/path/to/cacheon/tests/baseline_cache +python3 scripts/run_validator_eval.py ... 
+# OR +python3 -m validator.gpu_eval # reads CACHEON_BASELINE_CACHE_DIR from env +``` + +## Notes + +- The cache is block-hash-specific: different block hashes generate different prompts, so different cache files +- For local dev/CI with a fixed test block hash, one cache file covers all test runs +- The file is ~5-50 MB depending on `EVAL_PROMPT_COUNT` and `top_logprobs` depth diff --git a/validator/config.py b/validator/config.py index 703aa53..777e370 100644 --- a/validator/config.py +++ b/validator/config.py @@ -121,6 +121,14 @@ def _parse_preferred_gpu() -> str: SKIP_S3: bool = os.environ.get("CACHEON_SKIP_S3", "0") == "1" """When True, ``gpu_eval`` skips Hippius S3 download and upload (local pod testing).""" +BASELINE_CACHE_DIR: str = os.environ.get("CACHEON_BASELINE_CACHE_DIR", "") +"""Host directory for persisting baseline eval results (JSON). +When set, gpu_eval saves BaselineCache to ``{dir}/{cache_key}.json`` after a +fresh baseline run and loads from it on subsequent runs with the same +block_hash + baseline_digest + PROMPT_ENGINE_VERSION — skipping the 12-min +vLLM baseline container entirely. 
+""" + # --------------------------------------------------------------------------- # # Winner defender-advantage window # --------------------------------------------------------------------------- # diff --git a/validator/gpu_eval.py b/validator/gpu_eval.py index dd94270..df78b95 100644 --- a/validator/gpu_eval.py +++ b/validator/gpu_eval.py @@ -15,6 +15,7 @@ CACHEON_BASELINE_DIGEST (required) CACHEON_GPU_COUNT (default: 8, auto-detect via nvidia-smi) CACHEON_VLLM_CACHE_DIR (optional; host path for vLLM torch.compile cache) + CACHEON_BASELINE_CACHE_DIR (optional; host path for persisted baseline JSON cache) HIPPIUS_ACCESS_KEY (required for S3 unless CACHEON_SKIP_S3=1) HIPPIUS_SECRET_KEY (required for S3 unless CACHEON_SKIP_S3=1) CACHEON_SKIP_S3 (default: 0; set 1 for local pod testing without S3) @@ -219,28 +220,67 @@ def main() -> int: update_progress(state_dir, phase="baseline_running", image=baseline_image) _upload_progress(state_dir) - try: - eval_baseline = run_baseline( - eval_prompts, - baseline_image=baseline_image, - baseline_digest=baseline_digest, - model_volume=model_volume, - gpu_count=gpu_count, - block_hash=block_hash, - evaluation_block=block, - state_dir=state_dir, - ) - except Exception as exc: - logger.exception("Baseline failed: %s", exc) - update_progress( - state_dir, - phase="baseline_failed", - error=str(exc), - challengers_affected=len(eval_job.challengers), - ) - _upload_state(state_dir) - _upload_progress(state_dir) - return 4 + + # ------------------------------------------------------------------ + # Baseline cache: skip the 12-min vLLM container if a cached result + # exists for this (block_hash, baseline_digest, prompt_version) key. 
+ # ------------------------------------------------------------------ + import json as _json + from .baseline import BaselineCache, derive_cache_key + + _baseline_cache_dir = validator_config.BASELINE_CACHE_DIR + _cache_key = derive_cache_key(block_hash, baseline_digest) + _cache_file = ( + Path(_baseline_cache_dir) / f"{_cache_key}.json" + if _baseline_cache_dir + else None + ) + + eval_baseline: BaselineCache | None = None + if _cache_file and _cache_file.exists(): + try: + eval_baseline = BaselineCache.from_dict(_json.loads(_cache_file.read_text())) + logger.info( + "Loaded baseline from cache: key=%s file=%s (%d prompts)", + _cache_key, + _cache_file, + len(eval_baseline.results), + ) + except Exception as exc: + logger.warning("Baseline cache load failed (%s), re-running baseline", exc) + eval_baseline = None + + if eval_baseline is None: + try: + eval_baseline = run_baseline( + eval_prompts, + baseline_image=baseline_image, + baseline_digest=baseline_digest, + model_volume=model_volume, + gpu_count=gpu_count, + block_hash=block_hash, + evaluation_block=block, + state_dir=state_dir, + ) + except Exception as exc: + logger.exception("Baseline failed: %s", exc) + update_progress( + state_dir, + phase="baseline_failed", + error=str(exc), + challengers_affected=len(eval_job.challengers), + ) + _upload_state(state_dir) + _upload_progress(state_dir) + return 4 + + if _cache_file: + try: + _cache_file.parent.mkdir(parents=True, exist_ok=True) + _cache_file.write_text(_json.dumps(eval_baseline.to_dict())) + logger.info("Saved baseline to cache: key=%s file=%s", _cache_key, _cache_file) + except Exception as exc: + logger.warning("Baseline cache save failed: %s", exc) update_progress(state_dir, phase="baseline_complete") _upload_state(state_dir)