From 0baf267ccc4b93ae476247c4fa14f5f9859873d7 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Tue, 16 Jun 2026 19:42:18 -0400 Subject: [PATCH] Use Flex processing for the OpenAI judge model, which will halve its cost in exchange for slower response times and occasional resource unavailability. This doesn't affect benchmarking an OpenAI model. More details on the flex tier [here](https://developers.openai.com/api/docs/guides/flex-processing). --- src/eva/metrics/accuracy/faithfulness.py | 1 + src/eva/metrics/base.py | 2 +- tests/fixtures/metric_signatures.json | 12 ++++++------ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/eva/metrics/accuracy/faithfulness.py b/src/eva/metrics/accuracy/faithfulness.py index 7b885d21..250eb36b 100644 --- a/src/eva/metrics/accuracy/faithfulness.py +++ b/src/eva/metrics/accuracy/faithfulness.py @@ -60,6 +60,7 @@ class FaithfulnessJudgeMetric(ConversationTextJudgeMetric): ) category = "accuracy" default_model = "us.anthropic.claude-opus-4-6-v1" + default_params = {"max_tokens": 100000} # Drop the OpenAI-only flex tier inherited from TextJudgeMetric. 
rating_scale = (1, 3) def get_prompt_variables(self, context: MetricContext, transcript_text: str) -> dict[str, Any]: diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py index ca15d8b6..1f8d6ce7 100644 --- a/src/eva/metrics/base.py +++ b/src/eva/metrics/base.py @@ -330,7 +330,7 @@ class TextJudgeMetric(BaseMetric): # Subclasses can override these default_model = "gpt-5.2" - default_params: dict[str, Any] = {"max_tokens": 100000} + default_params: dict[str, Any] = {"max_tokens": 100000, "service_tier": "flex"} rating_scale: tuple[int, int] = (1, 3) # (min, max) def __init__(self, config: dict[str, Any] | None = None): diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json index 992ac39b..043e80b3 100644 --- a/tests/fixtures/metric_signatures.json +++ b/tests/fixtures/metric_signatures.json @@ -8,7 +8,7 @@ "ConcisenessJudgeMetric": { "name": "conciseness", "prompt_hash": "57caf9a18a3c", - "source_hash": "e39cacd719a2", + "source_hash": "9f183871e45a", "version": "v0.1" }, "ConversationCorrectlyFinishedMetric": { @@ -20,7 +20,7 @@ "ConversationProgressionJudgeMetric": { "name": "conversation_progression", "prompt_hash": "174cdf93b398", - "source_hash": "8d1a32e8a4fe", + "source_hash": "4e5f59c32946", "version": "v0.1" }, "ConversationTimeLimitExceededMetric": { @@ -38,7 +38,7 @@ "FaithfulnessJudgeMetric": { "name": "faithfulness", "prompt_hash": "060e3f1f9e7a", - "source_hash": "7db40c133d8a", + "source_hash": "93a559d0a028", "version": "v0.2" }, "ResponseSpeedMetric": { @@ -56,7 +56,7 @@ "SpeakabilityJudgeMetric": { "name": "speakability", "prompt_hash": "1e4f78cb051e", - "source_hash": "dd4feece62c1", + "source_hash": "c8a781f5795b", "version": "v0.2" }, "SpeechFidelityMetric": { @@ -86,7 +86,7 @@ "TranscriptionAccuracyKeyEntitiesMetric": { "name": "transcription_accuracy_key_entities", "prompt_hash": "aa1a7fdd6df7", - "source_hash": "80a4d892e1f6", + "source_hash": "96271903ded0", "version": "v0.2" }, 
"TurnTakingMetric": { @@ -98,7 +98,7 @@ "UserBehavioralFidelityMetric": { "name": "user_behavioral_fidelity", "prompt_hash": "21527fa8d79b", - "source_hash": "fded1ad48aa4", + "source_hash": "7f5dcec51568", "version": "v0.1" }, "UserSpeechFidelityMetric": {