Skip to content
This repository was archived by the owner on Mar 13, 2026. It is now read-only.

Commit ac99cf0

Browse files
Monitor queued jobs (#26)
1 parent 36750c3 commit ac99cf0

10 files changed

Lines changed: 272 additions & 77 deletions

File tree

.github/workflows/ci.yaml

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,33 @@ env:
1313
REGISTRY: ghcr.io
1414

1515
jobs:
16+
test:
17+
runs-on: ubuntu-latest
18+
strategy:
19+
fail-fast: false
20+
matrix:
21+
python-version: ["3.11"]
22+
23+
steps:
24+
- uses: actions/checkout@v3
25+
- name: Set up Python ${{ matrix.python-version }}
26+
uses: actions/setup-python@v4
27+
with:
28+
python-version: ${{ matrix.python-version }}
29+
- name: Install dependencies
30+
run: |
31+
python -m pip install --upgrade pip
32+
pip install -r tests/requirements.txt
33+
- name: Run pre-commit
34+
uses: pre-commit/[email protected]
35+
- name: Test with pytest
36+
run: |
37+
pytest --cov=src tests/
38+
1639
build:
1740
runs-on: ubuntu-latest
1841
name: Build
42+
needs: test
1943
permissions:
2044
contents: read
2145
packages: write
@@ -36,7 +60,7 @@ jobs:
3660
with:
3761
images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/github-workflows-monitoring
3862
flavor: |
39-
latest=true
63+
latest=${{ github.event_name != 'pull_request' }}
4064
tags: |
4165
type=ref,event=branch
4266
type=ref,event=pr
@@ -45,7 +69,7 @@ jobs:
4569
- name: Docker build and push
4670
uses: docker/build-push-action@v4
4771
with:
48-
push: ${{ github.event_name != 'pull_request' }}
72+
push: true
4973
tags: ${{ steps.meta.outputs.tags }}
5074
labels: ${{ steps.meta.outputs.labels }}
5175
context: .

.github/workflows/tests.yaml

Lines changed: 0 additions & 32 deletions
This file was deleted.

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ install_requires =
1010
Flask>=2.2,<3
1111
Flask-APScheduler==1.13.1
1212
datadog==0.49.1
13+
gql[all]==3.5.0
1314

1415
[flake8]
1516
max-line-length = 120

src/app.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from const import GithubHeaders, LOGGING_CONFIG
1111
from github import GithubJob
1212
from jobs import JobEventsHandler
13-
from utils import dict_to_logfmt
13+
from utils import dict_to_logfmt, parse_datetime
14+
from query_graphql import query_jobs
1415

1516
dictConfig(LOGGING_CONFIG)
1617

@@ -125,6 +126,36 @@ def process_workflow_job():
125126
return True
126127

127128

@scheduler.task("interval", id="monitor_jobs", seconds=15)
def monitor_jobs():
    """Poll the GitHub GraphQL API for the status of every tracked queued job.

    Jobs that are no longer QUEUED are removed from tracking, their final
    timestamps are recorded, and their queue-time metric is sent; jobs still
    queued emit an interim queue-time metric on every poll.
    """
    with scheduler.app.app_context():
        queued_nodes = [job.node_id for job in job_handler.queued.values()]

        # Nothing queued: skip the API round-trip entirely.
        if not queued_nodes:
            return

        jobs_data = query_jobs(queued_nodes)

        app.logger.info(f"Processing data for jobs {job_handler.queued.keys()}")

        for job_data in jobs_data["nodes"]:
            # GraphQL `nodes(ids:)` yields null for ids that no longer resolve
            # to a CheckRun; skip them instead of raising on job_data["id"].
            if not job_data:
                continue

            job = job_handler.queued.get(job_data["id"])
            if job_data["status"] != "QUEUED":
                # The job left the queue: stop tracking it and record its
                # final state/timestamps before sending the final metric.
                job = job_handler.queued.pop(job_data["id"], None)
                app.logger.info(
                    f"Job {job_data['id']} is no longer queued {job_data['status']}"
                )
                if job:
                    job.status = job_data["status"].lower()
                    job.in_progress_at = parse_datetime(job_data["startedAt"])
                    job.completed_at = parse_datetime(job_data["completedAt"])
                    job.final_queued_time_updated = True
            if job:
                # BUGFIX: original message had no separator between the two
                # f-string fragments ("status X,duration Y").
                app.logger.info(
                    f"Sending metric for {job_data['id']} with status {job_data['status']}, "
                    f"duration {job.seconds_in_queue}"
                )
                job.send_queued_metric()
            else:
                app.logger.info(f"No job for {job_data['id']}")
128159
@scheduler.task("interval", id="monitor_queued", seconds=30)
129160
def monitor_queued_jobs():
130161
"""Return the job that has been queued and not starting for long time."""

src/github.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,5 +77,13 @@ def runner_group_name(self):
7777
def runner_public(self):
7878
return self.runner_group_name == "GitHub Actions"
7979

80+
@property
81+
def runner_buildjet(self):
82+
return any(item.startswith("buildjet") for item in self.labels)
83+
84+
@property
85+
def labels(self):
86+
return self.data["workflow_job"]["labels"]
87+
8088
def __str__(self):
8189
return f"<{self.id}@{self.name}>"

src/jobs.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from typing import Dict
12
import metrics
23

34
from datetime import datetime
@@ -16,6 +17,8 @@ def __init__(self, github_job: GithubJob) -> None:
1617
self._update_attributes(github_job)
1718

1819
self.node_id = self.github_job.node_id
20+
self.labels = "-".join(sorted(self.github_job.labels))
21+
self.final_queued_time_updated = False
1922

2023
@property
2124
def seconds_in_queue(self):
@@ -27,7 +30,7 @@ def seconds_in_queue(self):
2730

2831
def _update_attributes(self, github_job: GithubJob):
2932
self.github_job: GithubJob = github_job
30-
self.status = github_job.action
33+
self.status = self.github_job.action
3134

3235
if self.github_job.action == "queued":
3336
self.queued_at = self.github_job.time_start
@@ -42,11 +45,22 @@ def _update_attributes(self, github_job: GithubJob):
4245
def update(self, github_job: GithubJob):
4346
self._update_attributes(github_job)
4447

48+
def send_queued_metric(self):
49+
metrics.send_queued_job(
50+
seconds_in_queue=self.seconds_in_queue,
51+
job_name=self.github_job.job_name,
52+
status=self.status,
53+
repository=self.github_job.repository,
54+
runner_group_name=self.github_job.runner_group_name,
55+
public=self.github_job.runner_public,
56+
buildjet=self.github_job.runner_buildjet,
57+
)
58+
4559

4660
class JobEventsHandler:
4761
def __init__(self) -> None:
48-
self.queued = dict()
49-
self.in_progress = dict()
62+
self.queued: Dict[str, Job] = dict()
63+
self.in_progress: Dict[str, Job] = dict()
5064

5165
def process_event(self, event: dict):
5266
status = event["action"]
@@ -64,7 +78,7 @@ def process_event(self, event: dict):
6478
pass
6579

6680
def _get_event_job_id(self, event: dict):
67-
return event["workflow_job"]["id"]
81+
return event["workflow_job"]["node_id"]
6882

6983
def _create_job(self, githubJob: GithubJob) -> Job:
7084
return Job(github_job=githubJob)
@@ -81,14 +95,10 @@ def _process_in_progress_event(self, event: dict):
8195
job = self._create_job(GithubJob(event))
8296
else:
8397
job.update(GithubJob(event))
84-
metrics.send_queued_job(
85-
seconds_in_queue=job.seconds_in_queue,
86-
job_name=job.github_job.job_name,
87-
repository=job.github_job.repository,
88-
runner=job.github_job.runner_name,
89-
run_id=job.github_job.run_id,
90-
public=job.github_job.runner_public,
91-
)
98+
# This is a fallover in case the job was not processed during the tracking time.
99+
if not job.final_queued_time_updated:
100+
job.final_queued_time_updated = True
101+
job.send_queued_metric()
92102

93103
self.in_progress[job_id] = job
94104

src/metrics.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1+
import logging
2+
import re
3+
14
from datadog import initialize, statsd
5+
from flask import current_app
26

37
options = {
48
"statsd_host": "datadog-agent.datadog.svc.cluster.local",
@@ -7,23 +11,41 @@
711

812
initialize(**options)
913

14+
logger = logging.getLogger(__name__)


# Characters outside this class are replaced with "_" before tags are sent.
# NOTE: the original class was [^\w\d_\-:/\.]; `\d` and `_` are already covered
# by `\w`, and `.` needs no escape inside a character class, so this simplified
# pattern matches exactly the same characters.
TAG_INVALID_CHARS_RE = re.compile(r"[^\w\-:/.]", re.UNICODE)
TAG_INVALID_CHARS_SUBS = "_"


def normalize_tags(tag_list):
    """Return tag_list with every invalid character replaced by '_'."""
    return [TAG_INVALID_CHARS_RE.sub(TAG_INVALID_CHARS_SUBS, tag) for tag in tag_list]
23+
1024

1125
def send_queued_job(
    seconds_in_queue: int,
    job_name: str,
    status: str,
    repository: str,
    public: bool,
    buildjet: bool,
    runner_group_name: str,
):
    """Send the time a job spent in the queue as a Datadog distribution.

    All tag values are run through normalize_tags so they contain only
    characters valid in Datadog tags.
    """
    tags = normalize_tags(
        [
            f"repository:{repository}",
            f"job_name:{job_name}",
            f"status:{status}",
            f"public:{public}",
            f"buildjet:{buildjet}",
            f"runner_group_name:{runner_group_name}",
        ]
    )

    # Use the module-level logger (defined in this file but previously unused)
    # instead of current_app.logger: this helper may be called from scheduler
    # threads, and a plain metrics module should not require a Flask app
    # context. Lazy %-args avoid formatting when the level is disabled.
    logger.info("Sending %s tags %s", seconds_in_queue, tags)

    statsd.distribution(
        "midokura.github_runners.jobs.seconds_in_queue.distribution",
        seconds_in_queue,
        tags=tags,
    )

src/query_graphql.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
import os

from typing import List

from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport

# GitHub GraphQL endpoint, authenticated with a personal access token taken
# from the GH_PAT environment variable.
# NOTE(review): if GH_PAT is unset this silently sends "bearer None" — confirm
# the deployment always provides it.
headers = {"Authorization": f"bearer {os.getenv('GH_PAT')}"}
transport = AIOHTTPTransport(url="https://api.github.com/graphql", headers=headers)

# Create a GraphQL client using the defined transport.
# fetch_schema_from_transport=True makes gql download the schema on first use
# to validate queries client-side (adds startup cost on the first execute).
client = Client(transport=transport, fetch_schema_from_transport=True)
def query_jobs(node_id_list: List[str]):
    """Fetch CheckRun status details for the given GraphQL node ids.

    Returns the raw execution result ({"nodes": [...]}); entries whose id does
    not resolve to a CheckRun come back as null per the GraphQL nodes() query.
    """
    check_runs_query = gql(
        """
    query getCheckRuns($node_id_list: [ID!]!) {
        nodes(ids: $node_id_list) {
            ... on CheckRun {
                id
                name
                status
                startedAt
                completedAt
                repository {
                    owner {
                        login
                    }
                    name
                }
                checkSuite {
                    workflowRun {
                        event
                        runNumber
                    }
                }
            }
        }
    }
    """
    )
    variables = {"node_id_list": node_id_list}

    return client.execute(check_runs_query, variable_values=variables)

0 commit comments

Comments
 (0)