From cc970b323192251c1c618b5aa60b4466d1edca63 Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 7 Apr 2026 16:24:08 -0400 Subject: [PATCH 01/17] Adding Eclipse's AI model --- minicheck/inference.py | 40 ++++++++++++++++++++++++++++++++++++++++ minicheck/minicheck.py | 11 +++++++++++ 2 files changed, 51 insertions(+) diff --git a/minicheck/inference.py b/minicheck/inference.py index 074d1b7..6fe75c5 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -291,6 +291,10 @@ def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=Non elif model_id == 'Granite-Guardian-3.3-8B': self.model_id = 'ibm-granite/granite-guardian-3.3-8b' self.operating_mode="gg_hybrid" + elif model_id == 'TBD': + self.model_id = 'TBD' + self.operating_mode="thinking" + self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") else: raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'") @@ -374,6 +378,13 @@ def apply_chat_template(self, doc, claim): messages = [{"role": "assistant", "content": claim}] guardian_config = {"criteria_id": "groundedness"} text = self.tokenizer.apply_chat_template(messages, guardian_config = guardian_config, documents=documents, think=True, tokenize=False, add_generation_prompt=True) + elif self.operating_mode=="thinking": + user_prompt = self.user_prompt.replace("[DOCUMENT]", doc).replace("[CLAIM]", claim) + message = [ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": user_prompt}, + ] + text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, enable_thinking=True) return text @@ -398,6 +409,33 @@ def get_support_prob_hybrid_gg(self, response, marker="score"): print("Error:", e) support_prob = random.random() return support_prob + + def get_support_prob_thinking(self, response): + """probs from vllm inference""" + import math + support_prob = 0 + + try: + thinking_token_index = response.outputs[0].token_ids.index(self.thinking_end_token) + 
1 + + decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token + + while("\n" in decoded_token and thinking_token_index < len(response.outputs[0].token_ids) - 1): + thinking_token_index += 1 + decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token + + if thinking_token_index < len(response.outputs[0].token_ids): + start_response_index = thinking_token_index + except Exception as e: + print("Error:", e) + support_prob = random.random() + + for token_prob in response.outputs[0].logprobs[start_response_index].values(): + decoded_token = token_prob.decoded_token + if decoded_token.lower() == 'yes': + support_prob += math.exp(token_prob.logprob) + + return support_prob def get_all_chunks_per_doc(self, doc, claim): @@ -469,6 +507,8 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo probs_per_chunk_sentence = [self.get_support_prob(responses[idx]) for idx in range(len(responses))] elif self.operating_mode=="gg_hybrid": probs_per_chunk_sentence = [self.get_support_prob_hybrid_gg(responses[idx]) for idx in range(len(responses))] + elif self.operating_mode=="thinking": + probs_per_chunk_sentence = [self.get_support_prob_thinking(responses[idx]) for idx in range(len(responses))] result_dict = {} for index, prob_per_chunk_sentence in zip(doc_claim_indices, probs_per_chunk_sentence): diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py index a163ec6..0945511 100644 --- a/minicheck/minicheck.py +++ b/minicheck/minicheck.py @@ -32,6 +32,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_ Default: 32768 - 'Granite-Guardian-3.3-8B' Default: 32768 + - 'TBD' + Default: 11468 For 'Bespoke-MiniCheck-7B', if you have a GPU with low VRAM and get the following: "ValueError: The model's max seq len (XXXX) is larger than the maximum number of tokens that can be stored in KV cache (YYYY). 
Try increasing `gpu_memory_utilization` @@ -105,6 +107,15 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_ enable_prefix_caching=enable_prefix_caching, max_model_len=max_model_len ) + elif model_name == 'TBD': + self.model = LLMCheck( + model_id=model_name, + tensor_parallel_size=tensor_parallel_size, + max_tokens=max_tokens, + cache_dir=cache_dir, + enable_prefix_caching=enable_prefix_caching, + max_model_len=max_model_len, + ) def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[float]: From 3ce6bc5fe8bfa7f141947bd4a963a27b51d3eefa Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 28 Apr 2026 14:22:28 -0400 Subject: [PATCH 02/17] bypass_model_check --- minicheck/minicheck.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py index 0945511..941ef84 100644 --- a/minicheck/minicheck.py +++ b/minicheck/minicheck.py @@ -6,7 +6,7 @@ class MiniCheck: - def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False) -> None: + def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None: ''' Parameters: @@ -74,8 +74,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_ future grounded fact-checking with much higher throughput and much lower latency. 
''' - assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \ - "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']" + if not bypass_model_check: + assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \ + "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']" if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']: @@ -116,6 +117,15 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_ enable_prefix_caching=enable_prefix_caching, max_model_len=max_model_len, ) + else: + self.model = LLMCheck( + model_id=model_name, + tensor_parallel_size=tensor_parallel_size, + max_tokens=max_tokens, + cache_dir=cache_dir, + enable_prefix_caching=enable_prefix_caching, + max_model_len=max_model_len, + ) def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[float]: From 9bc23f7fc3250a81d11060efeac47a9beb1ee3e7 Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 28 Apr 2026 14:24:11 -0400 Subject: [PATCH 03/17] Passing bypass model check to scorer Added bypass option for model check in scoring method. 
--- minicheck/minicheck.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py index 941ef84..699e031 100644 --- a/minicheck/minicheck.py +++ b/minicheck/minicheck.py @@ -77,6 +77,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_ if not bypass_model_check: assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \ "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']" + else: + self._bypass_model_check = True if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']: @@ -174,4 +176,4 @@ def _score_inferencer(self, docs, claims, chunk_size): return pred_label, max_support_prob, used_chunk, support_prob_per_chunk def _score_llmcheck(self, docs, claims, chunk_size): - return self.model.score(docs, claims, chunk_size) + return self.model.score(docs, claims, chunk_size, self._bypass_model_check) From 8180e6f3969310c6b0fdd754d80efeafa335a745 Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 28 Apr 2026 14:25:52 -0400 Subject: [PATCH 04/17] Revert adding bypass model param to minicheck Removed bypass model check functionality from the code. 
--- minicheck/minicheck.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py index 699e031..eb3c12b 100644 --- a/minicheck/minicheck.py +++ b/minicheck/minicheck.py @@ -77,9 +77,6 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_ if not bypass_model_check: assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \ "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']" - else: - self._bypass_model_check = True - if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']: self.model = Inferencer( @@ -176,4 +173,4 @@ def _score_inferencer(self, docs, claims, chunk_size): return pred_label, max_support_prob, used_chunk, support_prob_per_chunk def _score_llmcheck(self, docs, claims, chunk_size): - return self.model.score(docs, claims, chunk_size, self._bypass_model_check) + return self.model.score(docs, claims, chunk_size) From 8b99f6102df232d82e8135703ca138a74c8ab5be Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 28 Apr 2026 14:30:40 -0400 Subject: [PATCH 05/17] Add operating_mode to LLMCheck --- minicheck/inference.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index 6fe75c5..e184cc6 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -271,7 +271,7 @@ def fact_check(self, doc, claim): class LLMCheck: - def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None): + def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None): from vllm import LLM, SamplingParams import logging @@ -296,7 +296,12 @@ def __init__(self, model_id, 
tensor_parallel_size=1, max_tokens=1, cache_dir=Non self.operating_mode="thinking" self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") else: - raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'") + self.model_id = model_id + self.operating_mode=operating_mode + + if operating_model == "thinking": + self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") + #raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'") self.tensor_parallel_size = tensor_parallel_size self.max_tokens = max_tokens @@ -544,4 +549,4 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo return pred_label, max_support_prob, used_chunk, support_prob_per_chunk def split_into_sentences(self, text: str) -> List[str]: - return nltk.sent_tokenize(text) \ No newline at end of file + return nltk.sent_tokenize(text) From 18c3db9e52a090c536c3bb183a389e483269a0d0 Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 28 Apr 2026 14:31:21 -0400 Subject: [PATCH 06/17] Add operating_mode parameter to MiniCheck initializer --- minicheck/minicheck.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py index eb3c12b..81db9bd 100644 --- a/minicheck/minicheck.py +++ b/minicheck/minicheck.py @@ -6,7 +6,7 @@ class MiniCheck: - def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None: + def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None: ''' Parameters: @@ -119,6 +119,7 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_ else: self.model = LLMCheck( model_id=model_name, + operating_mode=operating_mode, 
tensor_parallel_size=tensor_parallel_size, max_tokens=max_tokens, cache_dir=cache_dir, From df28be042e4bae5e08b2fed5771eda7ffd88d124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Larochelle?= Date: Thu, 7 May 2026 16:04:15 -0400 Subject: [PATCH 07/17] Added LoRA Adapter Support --- minicheck/inference.py | 24 ++++++++++++++++++++---- minicheck/minicheck.py | 10 +++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index e184cc6..ca376ec 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -271,7 +271,7 @@ def fact_check(self, doc, claim): class LLMCheck: - def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None): + def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None): from vllm import LLM, SamplingParams import logging @@ -299,10 +299,12 @@ def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, m self.model_id = model_id self.operating_mode=operating_mode - if operating_model == "thinking": + if operating_mode == "thinking": self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") #raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'") + self.peft_path = peft_path + self.tensor_parallel_size = tensor_parallel_size self.max_tokens = max_tokens self.max_model_len = 32768 if max_model_len is None else max_model_len # max input length (prompt + doc) @@ -338,7 +340,9 @@ def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, m tensor_parallel_size=self.tensor_parallel_size, seed=2024, max_model_len=self.max_model_len, # need to be adjusted based on the GPU memory available - enable_prefix_caching=self.enable_prefix_caching + enable_prefix_caching=self.enable_prefix_caching, + 
max_lora_rank=max_lora_rank, + enable_lora=True if peft_path is not None else False ) self.tokenizer = self.llm.get_tokenizer() @@ -507,7 +511,19 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo all_prompts.extend(prompts) doc_claim_indices.extend([index] * len(prompts)) - responses = self.llm.generate(all_prompts, self.sampling_params) + if self.peft_path is not None: + from vllm.lora.request import LoRARequest + + responses = self.llm.generate( + all_prompts, + self.sampling_params, + lora_request=LoRARequest("lora_adapter", 1, self.peft_path) if self.peft_path else None) + else: + responses = self.llm.generate( + all_prompts, + self.sampling_params) + + if self.operating_mode=="bespoke": probs_per_chunk_sentence = [self.get_support_prob(responses[idx]) for idx in range(len(responses))] elif self.operating_mode=="gg_hybrid": diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py index 81db9bd..65d2251 100644 --- a/minicheck/minicheck.py +++ b/minicheck/minicheck.py @@ -6,7 +6,7 @@ class MiniCheck: - def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None: + def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_rank=16, operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None: ''' Parameters: @@ -19,6 +19,12 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke", - 'Bespoke-MiniCheck-7B' - 'Granite-Guardian-3.3-8B' Note: 'Bespoke-MiniCheck-7B' is the most performant fact-checking model in the MiniCheck series. 
+ + peft_path : str optional (default=None) + Path to the PEFT adapter + + max_lora_rank : int optional (default=16) + Maximum LoRA Adapter Rank to load max_model_len : int or None, optional (default=None) The maximum input length for the model. If None, we use the following default values. @@ -119,6 +125,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke", else: self.model = LLMCheck( model_id=model_name, + peft_path=peft_path, + max_lora_rank=max_lora_rank, operating_mode=operating_mode, tensor_parallel_size=tensor_parallel_size, max_tokens=max_tokens, From 1452b1d92d2509be0dd4af5bc26fe845c8ff1d2f Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Thu, 7 May 2026 18:07:58 -0400 Subject: [PATCH 08/17] Adjust thinking_end_token assignment logic Moved the setting of thinking_end_token to a later point in the code based on the operating_mode. --- minicheck/inference.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index ca376ec..5e6ecb0 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -299,10 +299,6 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b self.model_id = model_id self.operating_mode=operating_mode - if operating_mode == "thinking": - self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") - #raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'") - self.peft_path = peft_path self.tensor_parallel_size = tensor_parallel_size @@ -354,6 +350,9 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b if converted_token is not None: terminators.append(converted_token) + if operating_mode == "thinking": + self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") + self.sampling_params = SamplingParams( temperature=0, max_tokens=self.max_tokens, From 39a1b9ea672e5ceaa16d6c8a854dc09d544fc9a2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marc-Andr=C3=A9=20Larochelle?= Date: Thu, 7 May 2026 22:13:54 -0400 Subject: [PATCH 09/17] Fix bug --- minicheck/inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/minicheck/inference.py b/minicheck/inference.py index 5e6ecb0..7bc8f42 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -422,6 +422,7 @@ def get_support_prob_thinking(self, response): """probs from vllm inference""" import math support_prob = 0 + start_response_index = 0 try: thinking_token_index = response.outputs[0].token_ids.index(self.thinking_end_token) + 1 From a43b9834c06e6c9bcc226587b5953c534c508668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Larochelle?= Date: Thu, 7 May 2026 22:16:04 -0400 Subject: [PATCH 10/17] Fix --- minicheck/inference.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index 7bc8f42..e846ccd 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -435,14 +435,14 @@ def get_support_prob_thinking(self, response): if thinking_token_index < len(response.outputs[0].token_ids): start_response_index = thinking_token_index + + for token_prob in response.outputs[0].logprobs[start_response_index].values(): + decoded_token = token_prob.decoded_token + if decoded_token.lower() == 'yes': + support_prob += math.exp(token_prob.logprob) except Exception as e: print("Error:", e) support_prob = random.random() - - for token_prob in response.outputs[0].logprobs[start_response_index].values(): - decoded_token = token_prob.decoded_token - if decoded_token.lower() == 'yes': - support_prob += math.exp(token_prob.logprob) return support_prob From 58b6158b5b426c9439e919e318c2ecd117f398ba Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Mon, 11 May 2026 14:41:08 -0400 Subject: [PATCH 11/17] Refactor support probability calculation on error Refactor error handling to include support probability calculation. 
--- minicheck/inference.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index e846ccd..7b1183e 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -442,7 +442,11 @@ def get_support_prob_thinking(self, response): support_prob += math.exp(token_prob.logprob) except Exception as e: print("Error:", e) - support_prob = random.random() + + for token_prob in response.outputs[0].logprobs[-1].values(): + decoded_token = token_prob.decoded_token + if decoded_token.lower() == 'yes': + support_prob += math.exp(token_prob.logprob) return support_prob From c1abb8dbeac262498544f14374b38a30dead028a Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 19 May 2026 13:58:52 -0400 Subject: [PATCH 12/17] Adding support for more models and operating modes --- minicheck/inference.py | 57 ++++++++++++++++++++-------------------- minicheck/minicheck.py | 59 ++++++++++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 45 deletions(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index 7b1183e..f60224e 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -271,7 +271,7 @@ def fact_check(self, doc, claim): class LLMCheck: - def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None): + def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="bespoke", think_end_token="", extra_chat_template_kwargs=None, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None): from vllm import LLM, SamplingParams import logging @@ -291,15 +291,19 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b elif model_id == 'Granite-Guardian-3.3-8B': self.model_id = 'ibm-granite/granite-guardian-3.3-8b' self.operating_mode="gg_hybrid" - elif 
model_id == 'TBD': - self.model_id = 'TBD' - self.operating_mode="thinking" - self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") + + self.extra_chat_template_kwargs = { + 'guardian_config': {"criteria_id": "groundedness"}, + 'think': True + } else: self.model_id = model_id self.operating_mode=operating_mode + self.extra_chat_template_kwargs = extra_chat_template_kwargs + self.peft_path = peft_path - self.peft_path = peft_path + if operating_mode == "thinking": + self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token) self.tensor_parallel_size = tensor_parallel_size self.max_tokens = max_tokens @@ -338,7 +342,7 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b max_model_len=self.max_model_len, # need to be adjusted based on the GPU memory available enable_prefix_caching=self.enable_prefix_caching, max_lora_rank=max_lora_rank, - enable_lora=True if peft_path is not None else False + enable_lora=True if self.peft_path is not None else False ) self.tokenizer = self.llm.get_tokenizer() @@ -349,9 +353,6 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b converted_token = self.tokenizer.convert_tokens_to_ids("<|eot_id|>") if converted_token is not None: terminators.append(converted_token) - - if operating_mode == "thinking": - self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("") self.sampling_params = SamplingParams( temperature=0, @@ -380,19 +381,19 @@ def apply_chat_template(self, doc, claim): {"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}, ] - text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False) + text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, **self.extra_chat_template_kwargs) elif self.operating_mode=="gg_hybrid": documents = [{'doc_id':'0', 'text': doc}] messages = [{"role": "assistant", "content": claim}] - 
guardian_config = {"criteria_id": "groundedness"} - text = self.tokenizer.apply_chat_template(messages, guardian_config = guardian_config, documents=documents, think=True, tokenize=False, add_generation_prompt=True) + text = self.tokenizer.apply_chat_template(messages, documents=documents, add_generation_prompt=True, tokenize=False, **self.extra_chat_template_kwargs) elif self.operating_mode=="thinking": user_prompt = self.user_prompt.replace("[DOCUMENT]", doc).replace("[CLAIM]", claim) message = [ {"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}, ] - text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, enable_thinking=True) + text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, **self.extra_chat_template_kwargs) + return text @@ -422,32 +423,31 @@ def get_support_prob_thinking(self, response): """probs from vllm inference""" import math support_prob = 0 - start_response_index = 0 + start_response_index = -1 + + completion = response.outputs[0] try: - thinking_token_index = response.outputs[0].token_ids.index(self.thinking_end_token) + 1 + if self.thinking_end_token in completion.token_ids: + thinking_token_index = completion.token_ids.index(self.thinking_end_token) + 1 - decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token + decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token - while("\n" in decoded_token and thinking_token_index < len(response.outputs[0].token_ids) - 1): - thinking_token_index += 1 - decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token + while("\n" in decoded_token and thinking_token_index < len(completion.token_ids) - 1): + thinking_token_index += 1 + decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token - if thinking_token_index < 
len(response.outputs[0].token_ids): - start_response_index = thinking_token_index + if thinking_token_index < len(completion.token_ids): + start_response_index = thinking_token_index - for token_prob in response.outputs[0].logprobs[start_response_index].values(): + for token_prob in completion.logprobs[start_response_index].values(): decoded_token = token_prob.decoded_token if decoded_token.lower() == 'yes': support_prob += math.exp(token_prob.logprob) except Exception as e: print("Error:", e) + support_prob = random.random() - for token_prob in response.outputs[0].logprobs[-1].values(): - decoded_token = token_prob.decoded_token - if decoded_token.lower() == 'yes': - support_prob += math.exp(token_prob.logprob) - return support_prob @@ -527,7 +527,6 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo all_prompts, self.sampling_params) - if self.operating_mode=="bespoke": probs_per_chunk_sentence = [self.get_support_prob(responses[idx]) for idx in range(len(responses))] elif self.operating_mode=="gg_hybrid": diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py index 65d2251..9140123 100644 --- a/minicheck/minicheck.py +++ b/minicheck/minicheck.py @@ -6,7 +6,7 @@ class MiniCheck: - def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_rank=16, operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None: + def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_rank=16, operating_mode="bespoke", think_end_token=None, extra_chat_template_kwargs=None, max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None: ''' Parameters: @@ -20,11 +20,32 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r - 'Granite-Guardian-3.3-8B' Note: 
'Bespoke-MiniCheck-7B' is the most performant fact-checking model in the MiniCheck series. - peft_path : str optional (default=None) - Path to the PEFT adapter + peft_path : str, optional (default=None) + Path to the LLM PEFT adapter + - 'Bespoke-MiniCheck-7B' + peft_path: None + - 'Granite-Guardian-3.3-8B' + peft_path: None - max_lora_rank : int optional (default=16) + max_lora_rank : int, optional (default=16) Maximum LoRA Adapter Rank to load + + operating_mode : str, optional (default='bespoke') + LLM model support probability operating mode + Preset models use their corresponding operating mode, i.e: + - 'Bespoke-MiniCheck-7B' + Operating Mode: 'bespoke' + - 'Granite-Guardian-3.3-8B' + Operating Mode: 'gg_hybrid' + Extra operating mode: + - 'thinking' uses the first logprobs after the thinking delimiter as support probability + + think_end_token : str, optional (default=None) + Token used to represent the end of the thinking traces of LLM models + + extra_chat_template_kwargs : dict, optional (default=None) + Extra kwargs to forward to the chat template + Preset models use their corresponding chat template kwargs max_model_len : int or None, optional (default=None) The maximum input length for the model. If None, we use the following default values. @@ -38,8 +59,6 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r Default: 32768 - 'Granite-Guardian-3.3-8B' Default: 32768 - - 'TBD' - Default: 11468 For 'Bespoke-MiniCheck-7B', if you have a GPU with low VRAM and get the following: "ValueError: The model's max seq len (XXXX) is larger than the maximum number of tokens that can be stored in KV cache (YYYY). Try increasing `gpu_memory_utilization` @@ -65,6 +84,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r Whether to enable prefix caching for 'Bespoke-MiniCheck-7B'. This can improve performance when using the same document chunk to fact-check different claims. 
+ bypass_model_check: bool, optional (default=False) + Allows to bypass the model check to run the benchmark on different models with various configuration + Note: (1) MiniCheck-Flan-T5-Large (770M) is the best fack-checking model with size < 1B and reaches GPT-4 performance. (2) Bespoke-MiniCheck-7B is the most performant fact-checking model in the MiniCheck series AND @@ -85,6 +107,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']" if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']: + if operating_mode != 'operating_mode' or extra_chat_template_kwargs is not None or peft_path is not None or think_end_token is not None: + print(f"Forcing default preset configuration for model {model_name}") + self.model = Inferencer( model_name=model_name, batch_size=batch_size, @@ -92,6 +117,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r cache_dir=cache_dir ) elif model_name == 'Bespoke-MiniCheck-7B': + if operating_mode != 'bespoke' or extra_chat_template_kwargs is not None or peft_path is not None or think_end_token is not None: + print("Forcing default preset configuration for model Bespoke-MiniCheck-7B") + self.model = LLMCheck( model_id=model_name, tensor_parallel_size=tensor_parallel_size, @@ -101,6 +129,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r max_model_len=max_model_len ) elif model_name == 'Granite-Guardian-3.3-8B': + if operating_mode != 'gg_hybrid' or extra_chat_template_kwargs is not None or peft_path is not None or think_end_token is not None: + print("Forcing default preset configuration for model Granite Guardian 3.3") + if not max_tokens or max_tokens<2048: print("For Granite Guardian 3.3 - fixing the max_tokens to be 2048") max_tokens=2048 @@ -113,26 +144,22 @@ def __init__(self, 
model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r enable_prefix_caching=enable_prefix_caching, max_model_len=max_model_len ) - elif model_name == 'TBD': - self.model = LLMCheck( - model_id=model_name, - tensor_parallel_size=tensor_parallel_size, - max_tokens=max_tokens, - cache_dir=cache_dir, - enable_prefix_caching=enable_prefix_caching, - max_model_len=max_model_len, - ) else: + if operating_mode == "thinking": + assert think_end_token is not None, "'thinking' operating mode requires to specify a 'think_end_token'" + self.model = LLMCheck( model_id=model_name, peft_path=peft_path, max_lora_rank=max_lora_rank, operating_mode=operating_mode, + think_end_token=think_end_token, + extra_chat_template_kwargs=extra_chat_template_kwargs, tensor_parallel_size=tensor_parallel_size, max_tokens=max_tokens, cache_dir=cache_dir, enable_prefix_caching=enable_prefix_caching, - max_model_len=max_model_len, + max_model_len=max_model_len ) From 7df8fc158274693fd37401c55bbba7ed1b8b4a46 Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 19 May 2026 16:55:26 -0400 Subject: [PATCH 13/17] Init peft_path --- minicheck/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index f60224e..69cdb56 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -300,11 +300,12 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b self.model_id = model_id self.operating_mode=operating_mode self.extra_chat_template_kwargs = extra_chat_template_kwargs - self.peft_path = peft_path if operating_mode == "thinking": self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token) + self.peft_path = peft_path + self.tensor_parallel_size = tensor_parallel_size self.max_tokens = max_tokens self.max_model_len = 32768 if max_model_len is None else max_model_len # max input length (prompt + doc) From ecadbfd2be473598c47eaf5c4cc986f6a99d2cce Mon Sep 17 00:00:00 
2001 From: marcandrelarochelle Date: Tue, 19 May 2026 17:00:09 -0400 Subject: [PATCH 14/17] Add default to bespoke --- minicheck/inference.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/minicheck/inference.py b/minicheck/inference.py index 69cdb56..6d11db2 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -288,6 +288,8 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b if model_id == 'Bespoke-MiniCheck-7B': self.model_id = 'bespokelabs/Bespoke-MiniCheck-7B' self.operating_mode="bespoke" + + self.extra_chat_template_kwargs = {} elif model_id == 'Granite-Guardian-3.3-8B': self.model_id = 'ibm-granite/granite-guardian-3.3-8b' self.operating_mode="gg_hybrid" From df3d5545a7a31622543520e5ed5ca1202e3034da Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 19 May 2026 17:09:43 -0400 Subject: [PATCH 15/17] Fix kwargs init --- minicheck/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index 6d11db2..575353c 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -301,7 +301,8 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b else: self.model_id = model_id self.operating_mode=operating_mode - self.extra_chat_template_kwargs = extra_chat_template_kwargs + + self.extra_chat_template_kwargs = extra_chat_template_kwargs if extra_chat_template_kwargs is not None else {} if operating_mode == "thinking": self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token) From 011727a4fee73cde25930102de41e604d4b12490 Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Tue, 19 May 2026 17:46:36 -0400 Subject: [PATCH 16/17] Update thinking end token --- minicheck/inference.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index 575353c..2ff6068 100644 --- a/minicheck/inference.py +++ 
b/minicheck/inference.py @@ -304,9 +304,6 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b self.extra_chat_template_kwargs = extra_chat_template_kwargs if extra_chat_template_kwargs is not None else {} - if operating_mode == "thinking": - self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token) - self.peft_path = peft_path self.tensor_parallel_size = tensor_parallel_size @@ -355,8 +352,12 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b self.tokenizer.eos_token_id, ] converted_token = self.tokenizer.convert_tokens_to_ids("<|eot_id|>") + if converted_token is not None: terminators.append(converted_token) + + if operating_mode == "thinking": + self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token) self.sampling_params = SamplingParams( temperature=0, From aef4a8e717d6cdbd052aec5f94a45cdcbdb85b74 Mon Sep 17 00:00:00 2001 From: marcandrelarochelle Date: Mon, 25 May 2026 12:04:29 -0400 Subject: [PATCH 17/17] Max token index --- minicheck/inference.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/minicheck/inference.py b/minicheck/inference.py index 2ff6068..8c3c046 100644 --- a/minicheck/inference.py +++ b/minicheck/inference.py @@ -434,15 +434,16 @@ def get_support_prob_thinking(self, response): try: if self.thinking_end_token in completion.token_ids: + max_token_index = len(completion.token_ids) - 1 thinking_token_index = completion.token_ids.index(self.thinking_end_token) + 1 decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token - while("\n" in decoded_token and thinking_token_index < len(completion.token_ids) - 1): + while("\n" in decoded_token and max_token_index): thinking_token_index += 1 decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token - if thinking_token_index < len(completion.token_ids): + if thinking_token_index <= max_token_index: 
start_response_index = thinking_token_index for token_prob in completion.logprobs[start_response_index].values():