Skip to content

Commit 00ae13e

Browse files
LoopedBard3 and Copilot committed
Add pod-based crank scheduler prototype
Simplified alternative to PR aspnet#2106's full crank-scheduler. Uses a pod model where machines are fixed groups (SUT + load + DB) instead of individual machines with capability scoring and preferred partners.

Key simplifications:
- Pods define fixed machine groupings (no role priority/scoring)
- Shared machines between pods handled via collision detection
- Same greedy longest-job-first bin-packing algorithm
- Same Liquid template YAML generation
- ~570 lines vs ~2000 lines in the full scheduler

Includes:
- scripts/pod-scheduler/ (5 Python files + README)
- build/benchmarks_ci_pods.json (pod-based config for CI benchmarks)

Co-authored-by: Copilot <[email protected]>
1 parent 38eea61 commit 00ae13e

7 files changed

Lines changed: 1039 additions & 0 deletions

File tree

build/benchmarks_ci_pods.json

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
{
2+
"metadata": {
3+
"name": "CI Benchmarks Configuration",
4+
"description": "Pod-based scheduling for continuous integration benchmarks",
5+
"version": "2.0",
6+
"schedule": "0 3/12 * * *",
7+
"queues": [
8+
"citrine1",
9+
"citrine2",
10+
"citrine3",
11+
"mono"
12+
],
13+
"yaml_generation": {
14+
"target_yaml_count": 2,
15+
"schedule_offset_hours": 6
16+
}
17+
},
18+
"pods": [
19+
{
20+
"name": "intel-lin",
21+
"machines": { "sut": "intel-lin", "load": "intel-load", "db": "intel-db" },
22+
"profiles": { "sut": "intel-lin-app", "load": "intel-load-load", "db": "intel-db-db" }
23+
},
24+
{
25+
"name": "intel-win",
26+
"machines": { "sut": "intel-win", "load": "intel-load2", "db": "intel-db" },
27+
"profiles": { "sut": "intel-win-app", "load": "intel-load2-load", "db": "intel-db-db" }
28+
},
29+
{
30+
"name": "gold-lin",
31+
"machines": { "sut": "gold-lin", "load": "gold-load", "db": "gold-db" },
32+
"profiles": { "sut": "gold-lin-app", "load": "gold-load-load", "db": "gold-db-db" }
33+
},
34+
{
35+
"name": "gold-win",
36+
"machines": { "sut": "gold-win", "load": "gold-load2", "db": "gold-db" },
37+
"profiles": { "sut": "gold-win-app", "load": "gold-load2-load", "db": "gold-db-db" }
38+
},
39+
{
40+
"name": "amd-lin2",
41+
"machines": { "sut": "amd-lin2", "load": "gold-load", "db": "gold-db" },
42+
"profiles": { "sut": "amd-lin2-app", "load": "gold-load-load", "db": "gold-db-db" }
43+
},
44+
{
45+
"name": "intel-perflin",
46+
"machines": { "sut": "intel-perflin", "load": "intel-perfload" },
47+
"profiles": { "sut": "intel-perflin-app", "load": "intel-perfload-load" }
48+
}
49+
],
50+
"scenarios": [
51+
{
52+
"name": "Baselines",
53+
"template": "baselines-scenarios.yml",
54+
"type": 2,
55+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win", "amd-lin2"],
56+
"estimated_runtime": 30.0
57+
},
58+
{
59+
"name": "Baselines Database",
60+
"template": "baselines-database-scenarios.yml",
61+
"type": 3,
62+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win", "amd-lin2"],
63+
"estimated_runtime": 45.0
64+
},
65+
{
66+
"name": "Blazor",
67+
"template": "blazor-scenarios.yml",
68+
"type": 2,
69+
"pods": ["gold-lin", "intel-lin", "intel-perflin"],
70+
"estimated_runtime": 45.0
71+
},
72+
{
73+
"name": "Build",
74+
"template": "build-perf-scenarios.yml",
75+
"type": 1,
76+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
77+
"estimated_runtime": 1.0
78+
},
79+
{
80+
"name": "Containers",
81+
"template": "containers-scenarios.yml",
82+
"type": 3,
83+
"pods": ["gold-lin", "intel-lin"],
84+
"estimated_runtime": 90.0
85+
},
86+
{
87+
"name": "Crossgen",
88+
"template": "crossgen-scenarios.yml",
89+
"type": 2,
90+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win", "amd-lin2"],
91+
"estimated_runtime": 5.0
92+
},
93+
{
94+
"name": "Custom Proxies",
95+
"template": "custom-proxies-scenarios.yml",
96+
"type": 3,
97+
"pods": ["gold-lin", "intel-lin"],
98+
"estimated_runtime": 5.0
99+
},
100+
{
101+
"name": "EF Core",
102+
"template": "efcore-scenarios.yml",
103+
"type": 2,
104+
"pods": ["gold-win", "intel-win"],
105+
"estimated_runtime": 15.0
106+
},
107+
{
108+
"name": "Frameworks",
109+
"template": "frameworks-scenarios.yml",
110+
"type": 3,
111+
"pods": ["gold-lin", "intel-lin", "amd-lin2"],
112+
"estimated_runtime": 15.0
113+
},
114+
{
115+
"name": "Frameworks Database",
116+
"template": "frameworks-database-scenarios.yml",
117+
"type": 3,
118+
"pods": ["gold-lin", "intel-lin", "amd-lin2"],
119+
"estimated_runtime": 20.0
120+
},
121+
{
122+
"name": "GC",
123+
"template": "gc-scenarios.yml",
124+
"type": 1,
125+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
126+
"estimated_runtime": 15.0
127+
},
128+
{
129+
"name": "Grpc",
130+
"template": "grpc-scenarios.yml",
131+
"type": 2,
132+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
133+
"estimated_runtime": 70.0
134+
},
135+
{
136+
"name": "HttpClient",
137+
"template": "httpclient-scenarios.yml",
138+
"type": 2,
139+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
140+
"estimated_runtime": 45.0
141+
},
142+
{
143+
"name": "MVC",
144+
"template": "mvc-scenarios.yml",
145+
"type": 3,
146+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
147+
"estimated_runtime": 20.0
148+
},
149+
{
150+
"name": "NativeAOT",
151+
"template": "nativeaot-scenarios.yml",
152+
"type": 3,
153+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
154+
"estimated_runtime": 20.0
155+
},
156+
{
157+
"name": "PGO",
158+
"template": "pgo-scenarios.yml",
159+
"type": 3,
160+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
161+
"estimated_runtime": 90.0
162+
},
163+
{
164+
"name": "Proxies",
165+
"template": "proxies-scenarios.yml",
166+
"type": 3,
167+
"pods": ["gold-lin", "intel-lin"],
168+
"estimated_runtime": 150.0
169+
},
170+
{
171+
"name": "SignalR",
172+
"template": "signalr-scenarios.yml",
173+
"type": 2,
174+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
175+
"estimated_runtime": 30.0
176+
},
177+
{
178+
"name": "Single File",
179+
"template": "single-file-scenarios.yml",
180+
"type": 2,
181+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
182+
"estimated_runtime": 10.0
183+
},
184+
{
185+
"name": "SslStream",
186+
"template": "sslstream-scenarios.yml",
187+
"type": 2,
188+
"pods": ["gold-lin", "intel-lin"],
189+
"estimated_runtime": 45.0
190+
},
191+
{
192+
"name": "Trends",
193+
"template": "trend-scenarios.yml",
194+
"type": 2,
195+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win", "amd-lin2"],
196+
"estimated_runtime": 20.0
197+
},
198+
{
199+
"name": "Trends Database",
200+
"template": "trend-database-scenarios.yml",
201+
"type": 3,
202+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win", "amd-lin2"],
203+
"estimated_runtime": 15.0
204+
},
205+
{
206+
"name": "WebSockets",
207+
"template": "websockets-scenarios.yml",
208+
"type": 2,
209+
"pods": ["gold-lin", "gold-win", "intel-lin", "intel-win"],
210+
"estimated_runtime": 6.0
211+
}
212+
]
213+
}

scripts/pod-scheduler/README.md

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Pod-Based Crank Scheduler
2+
3+
A simplified scheduler for assigning benchmark scenarios to machine "pods" and
4+
generating Azure DevOps pipeline YAML files.
5+
6+
## Concept
7+
8+
A **pod** is a fixed group of machines that always run together:
9+
- **SUT** (System Under Test) — required
10+
- **Load** generator — optional, for dual/triple scenarios
11+
- **DB** (Database) — optional, for triple scenarios
12+
13+
Pods that share physical machines (e.g., two pods using the same DB machine)
14+
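cannot run in the same stage when both of their scenarios actually use the shared machine (see "Handling Shared Machines" below). The scheduler handles this automatically.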
15+
16+
## Quick Start
17+
18+
```bash
19+
# Show schedule summary
20+
python main.py --config ../../build/benchmarks_ci_pods.json
21+
22+
# Generate pipeline YAML files
23+
python main.py --config ../../build/benchmarks_ci_pods.json \
24+
--template ../../build/benchmarks.template.liquid \
25+
--yaml-output ../../build
26+
27+
# Show which pods share machines
28+
python main.py --config ../../build/benchmarks_ci_pods.json --show-conflicts
29+
30+
# List all runs without scheduling
31+
python main.py --config ../../build/benchmarks_ci_pods.json --list-runs
32+
```
33+
34+
## Configuration Format
35+
36+
```json
37+
{
38+
"metadata": {
39+
"name": "Config Name",
40+
"schedule": "0 3/12 * * *",
41+
"queues": ["citrine1", "citrine2", "citrine3", "mono"],
42+
"yaml_generation": {
43+
"target_yaml_count": 2,
44+
"schedule_offset_hours": 6
45+
}
46+
},
47+
"pods": [
48+
{
49+
"name": "gold-lin",
50+
"machines": { "sut": "gold-lin", "load": "gold-load", "db": "gold-db" },
51+
"profiles": { "sut": "gold-lin-app", "load": "gold-load-load", "db": "gold-db-db" }
52+
}
53+
],
54+
"scenarios": [
55+
{
56+
"name": "Baselines",
57+
"template": "baselines-scenarios.yml",
58+
"type": 2,
59+
"pods": ["gold-lin", "gold-win"],
60+
"estimated_runtime": 30.0
61+
}
62+
]
63+
}
64+
```
65+
66+
### Pod Definition
67+
68+
| Field | Description |
69+
|-------|-------------|
70+
| `name` | Unique identifier for the pod |
71+
| `machines.sut` | Physical machine name for SUT role |
72+
| `machines.load` | Physical machine name for Load role (optional) |
73+
| `machines.db` | Physical machine name for DB role (optional) |
74+
| `profiles.sut` | Crank profile name for SUT |
75+
| `profiles.load` | Crank profile name for Load (optional) |
76+
| `profiles.db` | Crank profile name for DB (optional) |
77+
78+
### Scenario Types
79+
80+
| Type | Machines Used | Example |
81+
|------|--------------|---------|
82+
| 1 (SINGLE) | SUT only | Build, GC |
83+
| 2 (DUAL) | SUT + Load | Baselines, Grpc, SignalR |
84+
| 3 (TRIPLE) | SUT + Load + DB | Baselines Database, PGO, Proxies |
85+
86+
### Handling Shared Machines
87+
88+
Two pods can share load/DB machines. For example:
89+
- `gold-lin` pod: SUT=gold-lin, Load=gold-load, DB=gold-db
90+
- `gold-win` pod: SUT=gold-win, Load=gold-load2, DB=gold-db
91+
92+
These pods share `gold-db`. When both run type-3 scenarios, they cannot be in
93+
the same stage. When `gold-win` runs a type-2 scenario (no DB), there's no
94+
conflict.
95+
96+
### Future: Multiple SUTs per Class
97+
98+
If you get 2 SUT machines of the same class (e.g., gold-lin-1 and gold-lin-2),
99+
create separate pods for each. They can share load/DB:
100+
101+
```json
102+
{"name": "gold-lin-1", "machines": {"sut": "gold-lin-1", "load": "gold-load", "db": "gold-db"}, ...},
103+
{"name": "gold-lin-2", "machines": {"sut": "gold-lin-2", "load": "gold-load", "db": "gold-db"}, ...}
104+
```
105+
106+
The scheduler automatically prevents them from running simultaneously when they
107+
share load/DB machines that the scheduled scenarios actually use (type-1, SUT-only runs never conflict).
108+
109+
## Algorithm
110+
111+
1. **Expand** each scenario × pod into individual "runs"
112+
2. **Sort** runs by runtime descending (longest-job-first)
113+
3. **Pack** into stages greedily — each run goes into the first stage where no
114+
physical machines conflict and the queue limit isn't exceeded
115+
4. **Split** stages across multiple YAML files using bin-packing for balanced
116+
runtime
117+
118+
## Files
119+
120+
| File | Lines | Purpose |
121+
|------|-------|---------|
122+
| `main.py` | ~160 | CLI entry point, summary display |
123+
| `models.py` | ~115 | Data classes (Pod, Scenario, Run, Stage, Schedule) |
124+
| `scheduler.py` | ~95 | Scheduling algorithm |
125+
| `config_loader.py` | ~50 | JSON config parser |
126+
| `generator.py` | ~150 | YAML generation |
127+
128+
Total: ~570 lines (vs ~2000 in the full crank-scheduler)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
JSON configuration loader for pod-based scheduling.
3+
"""
4+
5+
import json
6+
from typing import Any, Dict
7+
8+
from models import Pod, Scenario, ScenarioType, ScheduleConfig
9+
10+
11+
def load_config(path: str) -> ScheduleConfig:
    """Load a pod-scheduler JSON configuration file into a ScheduleConfig.

    The file must contain three top-level sections: ``metadata``, ``pods``
    and ``scenarios``. No cross-checking is performed beyond the presence of
    required keys (for example, scenario pod names are not checked against
    the defined pods).

    Args:
        path: Path to the JSON configuration file.

    Returns:
        A ScheduleConfig whose ``pods`` is a dict keyed by pod name and whose
        ``scenarios`` is a list in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required key is missing: the three top-level sections,
            ``metadata.schedule``, ``metadata.queues``, a pod's ``name``,
            ``machines``/``profiles`` (and their ``sut`` entry), or a
            scenario's ``name``, ``template``, ``type`` or ``pods``.

    NOTE(review): ``ScenarioType(...)`` is presumably an Enum lookup that
    raises ValueError for an unknown ``type`` -- confirm in models.py.
    """
    # JSON text is UTF-8 by specification; be explicit rather than relying on
    # the platform default encoding (which is not UTF-8 everywhere).
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    metadata = data["metadata"]
    # The whole yaml_generation section is optional; defaults applied below.
    yaml_gen = metadata.get("yaml_generation", {})

    pods = {}
    for pod_data in data["pods"]:
        machines = pod_data["machines"]
        profiles = pod_data["profiles"]
        pod = Pod(
            name=pod_data["name"],
            sut=machines["sut"],
            # Load and DB roles are optional; absent entries become None.
            load=machines.get("load"),
            db=machines.get("db"),
            sut_profile=profiles["sut"],
            load_profile=profiles.get("load"),
            db_profile=profiles.get("db"),
        )
        # A later pod with the same name replaces an earlier one.
        pods[pod.name] = pod

    scenarios = [
        Scenario(
            name=sc_data["name"],
            template=sc_data["template"],
            type=ScenarioType(sc_data["type"]),
            pods=sc_data["pods"],
            # Missing, null or zero runtimes all normalise to 0.0 so the
            # field is always a float.
            estimated_runtime=float(sc_data.get("estimated_runtime") or 0),
        )
        for sc_data in data["scenarios"]
    ]

    return ScheduleConfig(
        name=metadata.get("name", ""),
        schedule=metadata["schedule"],
        queues=metadata["queues"],
        target_yaml_count=yaml_gen.get("target_yaml_count", 1),
        schedule_offset_hours=yaml_gen.get("schedule_offset_hours", 6),
        pods=pods,
        scenarios=scenarios,
    )

0 commit comments

Comments
 (0)