From cc970b323192251c1c618b5aa60b4466d1edca63 Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 7 Apr 2026 16:24:08 -0400
Subject: [PATCH 01/17] Adding Eclipse's AI model

---
 minicheck/inference.py | 40 ++++++++++++++++++++++++++++++++++++++++
 minicheck/minicheck.py | 11 +++++++++++
 2 files changed, 51 insertions(+)
diff --git a/minicheck/inference.py b/minicheck/inference.py
index 074d1b7..6fe75c5 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -291,6 +291,10 @@ def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=Non
         elif model_id == 'Granite-Guardian-3.3-8B':
             self.model_id = 'ibm-granite/granite-guardian-3.3-8b'
             self.operating_mode="gg_hybrid"
+        elif model_id == 'TBD':
+            self.model_id = 'TBD'
+            self.operating_mode="thinking"
+            self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
         else:
             raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'")
 
@@ -374,6 +378,13 @@ def apply_chat_template(self, doc, claim):
             messages = [{"role": "assistant", "content": claim}]
             guardian_config = {"criteria_id": "groundedness"}
             text = self.tokenizer.apply_chat_template(messages, guardian_config = guardian_config, documents=documents, think=True, tokenize=False, add_generation_prompt=True)
+        elif self.operating_mode=="thinking":
+            user_prompt = self.user_prompt.replace("[DOCUMENT]", doc).replace("[CLAIM]", claim)
+            message = [
+                {"role": "system", "content": self.system_prompt},
+                {"role": "user", "content": user_prompt},
+            ]
+            text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, enable_thinking=True)
         return text
 
     
@@ -398,6 +409,33 @@ def get_support_prob_hybrid_gg(self, response, marker="score"):
             print("Error:", e)
             support_prob = random.random()
         return support_prob
+    
+    def get_support_prob_thinking(self, response):
+        """probs from vllm inference"""
+        import math
+        support_prob = 0
+
+        try:
+            thinking_token_index = response.outputs[0].token_ids.index(self.thinking_end_token) + 1
+
+            decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token
+
+            while("\n" in decoded_token and thinking_token_index < len(response.outputs[0].token_ids) - 1):
+                thinking_token_index += 1
+                decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token
+
+            if thinking_token_index < len(response.outputs[0].token_ids):
+                start_response_index = thinking_token_index
+        except Exception as e:
+            print("Error:", e)
+            support_prob = random.random()
+
+        for token_prob in response.outputs[0].logprobs[start_response_index].values():
+            decoded_token = token_prob.decoded_token
+            if decoded_token.lower() == 'yes': 
+                support_prob += math.exp(token_prob.logprob)
+        
+        return support_prob
 
 
     def get_all_chunks_per_doc(self, doc, claim):
@@ -469,6 +507,8 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo
             probs_per_chunk_sentence = [self.get_support_prob(responses[idx]) for idx in range(len(responses))]
         elif self.operating_mode=="gg_hybrid":
             probs_per_chunk_sentence = [self.get_support_prob_hybrid_gg(responses[idx]) for idx in range(len(responses))]
+        elif self.operating_mode=="thinking":
+            probs_per_chunk_sentence = [self.get_support_prob_thinking(responses[idx]) for idx in range(len(responses))]
 
         result_dict = {}
         for index, prob_per_chunk_sentence in zip(doc_claim_indices, probs_per_chunk_sentence):
diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index a163ec6..0945511 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -32,6 +32,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
                     Default: 32768
                 - 'Granite-Guardian-3.3-8B'
                     Default: 32768
+                - 'TBD'
+                    Default: 11468
             For 'Bespoke-MiniCheck-7B', if you have a GPU with low VRAM and get the following:
                 "ValueError: The model's max seq len (XXXX) is larger than the maximum number of 
                 tokens that can be stored in KV cache (YYYY). Try increasing `gpu_memory_utilization` 
@@ -105,6 +107,15 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
                 enable_prefix_caching=enable_prefix_caching,
                 max_model_len=max_model_len
             )
+        elif model_name == 'TBD':
+            self.model = LLMCheck(
+                model_id=model_name,
+                tensor_parallel_size=tensor_parallel_size,
+                max_tokens=max_tokens,
+                cache_dir=cache_dir,
+                enable_prefix_caching=enable_prefix_caching,
+                max_model_len=max_model_len,
+            )
         
 
     def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[float]:

From 3ce6bc5fe8bfa7f141947bd4a963a27b51d3eefa Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 28 Apr 2026 14:22:28 -0400
Subject: [PATCH 02/17] bypass_model_check

---
 minicheck/minicheck.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index 0945511..941ef84 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -6,7 +6,7 @@
 
 
 class MiniCheck:
-    def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False) -> None:
+    def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None:
 
         '''
         Parameters:
@@ -74,8 +74,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
         future grounded fact-checking with much higher throughput and much lower latency.
         '''
 
-        assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \
-            "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']"
+        if not bypass_model_check:
+            assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \
+                "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']"
 
         
         if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']:
@@ -116,6 +117,15 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
                 enable_prefix_caching=enable_prefix_caching,
                 max_model_len=max_model_len,
             )
+        else:
+            self.model = LLMCheck(
+                model_id=model_name,
+                tensor_parallel_size=tensor_parallel_size,
+                max_tokens=max_tokens,
+                cache_dir=cache_dir,
+                enable_prefix_caching=enable_prefix_caching,
+                max_model_len=max_model_len,
+            )
         
 
     def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[float]:

From 9bc23f7fc3250a81d11060efeac47a9beb1ee3e7 Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 28 Apr 2026 14:24:11 -0400
Subject: [PATCH 03/17] Passing bypass model check to scorer

Added bypass option for model check in scoring method.
---
 minicheck/minicheck.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index 941ef84..699e031 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -77,6 +77,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
         if not bypass_model_check:
             assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \
                 "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']"
+        else:
+            self._bypass_model_check = True
 
         
         if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']:
@@ -174,4 +176,4 @@ def _score_inferencer(self, docs, claims, chunk_size):
         return pred_label, max_support_prob, used_chunk, support_prob_per_chunk
     
     def _score_llmcheck(self, docs, claims, chunk_size):
-        return self.model.score(docs, claims, chunk_size)
+        return self.model.score(docs, claims, chunk_size, self._bypass_model_check)

From 8180e6f3969310c6b0fdd754d80efeafa335a745 Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 28 Apr 2026 14:25:52 -0400
Subject: [PATCH 04/17] Revert adding bypass model param to minicheck

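Stopped storing the bypass flag on the instance and passing it to the model's score method; the bypass_model_check constructor parameter and its check are kept.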
---
 minicheck/minicheck.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index 699e031..eb3c12b 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -77,9 +77,6 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
         if not bypass_model_check:
             assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B'], \
                 "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']"
-        else:
-            self._bypass_model_check = True
-
         
         if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']:
             self.model = Inferencer(
@@ -176,4 +173,4 @@ def _score_inferencer(self, docs, claims, chunk_size):
         return pred_label, max_support_prob, used_chunk, support_prob_per_chunk
     
     def _score_llmcheck(self, docs, claims, chunk_size):
-        return self.model.score(docs, claims, chunk_size, self._bypass_model_check)
+        return self.model.score(docs, claims, chunk_size)

From 8b99f6102df232d82e8135703ca138a74c8ab5be Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 28 Apr 2026 14:30:40 -0400
Subject: [PATCH 05/17] Add operating_mode to LLMCheck

---
 minicheck/inference.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 6fe75c5..e184cc6 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -271,7 +271,7 @@ def fact_check(self, doc, claim):
 
 class LLMCheck:
 
-    def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
+    def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
         from vllm import LLM, SamplingParams
 
         import logging
@@ -296,7 +296,12 @@ def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=Non
             self.operating_mode="thinking"
             self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
         else:
-            raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'")
+            self.model_id = model_id
+            self.operating_mode=operating_mode
+
+            if operating_model == "thinking":
+                self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
+            #raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'")
 
         self.tensor_parallel_size = tensor_parallel_size
         self.max_tokens = max_tokens
@@ -544,4 +549,4 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo
         return pred_label, max_support_prob, used_chunk, support_prob_per_chunk
 
     def split_into_sentences(self, text: str) -> List[str]:
-        return nltk.sent_tokenize(text)
\ No newline at end of file
+        return nltk.sent_tokenize(text)

From 18c3db9e52a090c536c3bb183a389e483269a0d0 Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 28 Apr 2026 14:31:21 -0400
Subject: [PATCH 06/17] Add operating_mode parameter to MiniCheck initializer

---
 minicheck/minicheck.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index eb3c12b..81db9bd 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -6,7 +6,7 @@
 
 
 class MiniCheck:
-    def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None:
+    def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None:
 
         '''
         Parameters:
@@ -119,6 +119,7 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
         else:
             self.model = LLMCheck(
                 model_id=model_name,
+                operating_mode=operating_mode,
                 tensor_parallel_size=tensor_parallel_size,
                 max_tokens=max_tokens,
                 cache_dir=cache_dir,

From df28be042e4bae5e08b2fed5771eda7ffd88d124 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Larochelle?=
 <marcandrelarochelle1820@gmail.com>
Date: Thu, 7 May 2026 16:04:15 -0400
Subject: [PATCH 07/17] Added LoRA Adapter Support

---
 minicheck/inference.py | 24 ++++++++++++++++++++----
 minicheck/minicheck.py | 10 +++++++++-
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index e184cc6..ca376ec 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -271,7 +271,7 @@ def fact_check(self, doc, claim):
 
 class LLMCheck:
 
-    def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
+    def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
         from vllm import LLM, SamplingParams
 
         import logging
@@ -299,10 +299,12 @@ def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, m
             self.model_id = model_id
             self.operating_mode=operating_mode
 
-            if operating_model == "thinking":
+            if operating_mode == "thinking":
                 self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
             #raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'")
 
+        self.peft_path = peft_path
+
         self.tensor_parallel_size = tensor_parallel_size
         self.max_tokens = max_tokens
         self.max_model_len = 32768 if max_model_len is None else max_model_len # max input length (prompt + doc)
@@ -338,7 +340,9 @@ def __init__(self, model_id, operating_mode="bespoke", tensor_parallel_size=1, m
             tensor_parallel_size=self.tensor_parallel_size,
             seed=2024,
             max_model_len=self.max_model_len,   # need to be adjusted based on the GPU memory available
-            enable_prefix_caching=self.enable_prefix_caching
+            enable_prefix_caching=self.enable_prefix_caching,
+            max_lora_rank=max_lora_rank,
+            enable_lora=True if peft_path is not None else False
         )
 
         self.tokenizer = self.llm.get_tokenizer()
@@ -507,7 +511,19 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo
             all_prompts.extend(prompts)
             doc_claim_indices.extend([index] * len(prompts))
 
-        responses = self.llm.generate(all_prompts, self.sampling_params) 
+        if self.peft_path is not None:
+            from vllm.lora.request import LoRARequest
+
+            responses = self.llm.generate(
+                all_prompts, 
+                self.sampling_params,
+                lora_request=LoRARequest("lora_adapter", 1, self.peft_path) if self.peft_path else None)
+        else:
+             responses = self.llm.generate(
+                all_prompts, 
+                self.sampling_params)
+
+
         if self.operating_mode=="bespoke":
             probs_per_chunk_sentence = [self.get_support_prob(responses[idx]) for idx in range(len(responses))]
         elif self.operating_mode=="gg_hybrid":
diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index 81db9bd..65d2251 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -6,7 +6,7 @@
 
 
 class MiniCheck:
-    def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None:
+    def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_rank=16, operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None:
 
         '''
         Parameters:
@@ -19,6 +19,12 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke",
             - 'Bespoke-MiniCheck-7B'
             - 'Granite-Guardian-3.3-8B'
             Note: 'Bespoke-MiniCheck-7B' is the most performant fact-checking model in the MiniCheck series.
+
+        peft_path : str optional (default=None)
+            Path to the PEFT adapter
+
+        max_lora_rank : int optional (default=16)
+            Maximum LoRA Adapter Rank to load
         
         max_model_len : int or None, optional (default=None)
             The maximum input length for the model. If None, we use the following default values. 
@@ -119,6 +125,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', operating_mode="bespoke",
         else:
             self.model = LLMCheck(
                 model_id=model_name,
+                peft_path=peft_path,
+                max_lora_rank=max_lora_rank,
                 operating_mode=operating_mode,
                 tensor_parallel_size=tensor_parallel_size,
                 max_tokens=max_tokens,

From 1452b1d92d2509be0dd4af5bc26fe845c8ff1d2f Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Thu, 7 May 2026 18:07:58 -0400
Subject: [PATCH 08/17] Adjust thinking_end_token assignment logic

Moved the thinking_end_token assignment to after the tokenizer-based terminator setup, still conditional on operating_mode being "thinking".
---
 minicheck/inference.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index ca376ec..5e6ecb0 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -299,10 +299,6 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
             self.model_id = model_id
             self.operating_mode=operating_mode
 
-            if operating_mode == "thinking":
-                self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
-            #raise ValueError("model_id must be 'Bespoke-MiniCheck-7B'")
-
         self.peft_path = peft_path
 
         self.tensor_parallel_size = tensor_parallel_size
@@ -354,6 +350,9 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
         if converted_token is not None:
             terminators.append(converted_token)
 
+        if operating_mode == "thinking":
+            self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
+        
         self.sampling_params = SamplingParams(
             temperature=0,
             max_tokens=self.max_tokens,

From 39a1b9ea672e5ceaa16d6c8a854dc09d544fc9a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Larochelle?=
 <marcandrelarochelle1820@gmail.com>
Date: Thu, 7 May 2026 22:13:54 -0400
Subject: [PATCH 09/17] Fix bug: initialize start_response_index before the try block

---
 minicheck/inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 5e6ecb0..7bc8f42 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -422,6 +422,7 @@ def get_support_prob_thinking(self, response):
         """probs from vllm inference"""
         import math
         support_prob = 0
+        start_response_index = 0
 
         try:
             thinking_token_index = response.outputs[0].token_ids.index(self.thinking_end_token) + 1

From a43b9834c06e6c9bcc226587b5953c534c508668 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Larochelle?=
 <marcandrelarochelle1820@gmail.com>
Date: Thu, 7 May 2026 22:16:04 -0400
Subject: [PATCH 10/17] Fix: move 'yes' probability loop inside the try block

---
 minicheck/inference.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 7bc8f42..e846ccd 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -435,14 +435,14 @@ def get_support_prob_thinking(self, response):
 
             if thinking_token_index < len(response.outputs[0].token_ids):
                 start_response_index = thinking_token_index
+
+            for token_prob in response.outputs[0].logprobs[start_response_index].values():
+                decoded_token = token_prob.decoded_token
+                if decoded_token.lower() == 'yes': 
+                    support_prob += math.exp(token_prob.logprob)
         except Exception as e:
             print("Error:", e)
             support_prob = random.random()
-
-        for token_prob in response.outputs[0].logprobs[start_response_index].values():
-            decoded_token = token_prob.decoded_token
-            if decoded_token.lower() == 'yes': 
-                support_prob += math.exp(token_prob.logprob)
         
         return support_prob
 

From 58b6158b5b426c9439e919e318c2ecd117f398ba Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Mon, 11 May 2026 14:41:08 -0400
Subject: [PATCH 11/17] Refactor support probability calculation on error

Refactor error handling to include support probability calculation.
---
 minicheck/inference.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index e846ccd..7b1183e 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -442,7 +442,11 @@ def get_support_prob_thinking(self, response):
                     support_prob += math.exp(token_prob.logprob)
         except Exception as e:
             print("Error:", e)
-            support_prob = random.random()
+            
+            for token_prob in response.outputs[0].logprobs[-1].values():
+                decoded_token = token_prob.decoded_token
+                if decoded_token.lower() == 'yes': 
+                    support_prob += math.exp(token_prob.logprob)
         
         return support_prob
 

From c1abb8dbeac262498544f14374b38a30dead028a Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 19 May 2026 13:58:52 -0400
Subject: [PATCH 12/17] Adding support for more models and operating modes

---
 minicheck/inference.py | 57 ++++++++++++++++++++--------------------
 minicheck/minicheck.py | 59 ++++++++++++++++++++++++++++++------------
 2 files changed, 71 insertions(+), 45 deletions(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 7b1183e..f60224e 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -271,7 +271,7 @@ def fact_check(self, doc, claim):
 
 class LLMCheck:
 
-    def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="bespoke", tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
+    def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="bespoke", think_end_token="</think>", extra_chat_template_kwargs=None, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
         from vllm import LLM, SamplingParams
 
         import logging
@@ -291,15 +291,19 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
         elif model_id == 'Granite-Guardian-3.3-8B':
             self.model_id = 'ibm-granite/granite-guardian-3.3-8b'
             self.operating_mode="gg_hybrid"
-        elif model_id == 'TBD':
-            self.model_id = 'TBD'
-            self.operating_mode="thinking"
-            self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
+
+            self.extra_chat_template_kwargs = {
+                'guardian_config': {"criteria_id": "groundedness"},
+                'think': True
+            }
         else:
             self.model_id = model_id
             self.operating_mode=operating_mode
+            self.extra_chat_template_kwargs = extra_chat_template_kwargs
+            self.peft_path = peft_path
 
-        self.peft_path = peft_path
+            if operating_mode == "thinking":
+                self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token)
 
         self.tensor_parallel_size = tensor_parallel_size
         self.max_tokens = max_tokens
@@ -338,7 +342,7 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
             max_model_len=self.max_model_len,   # need to be adjusted based on the GPU memory available
             enable_prefix_caching=self.enable_prefix_caching,
             max_lora_rank=max_lora_rank,
-            enable_lora=True if peft_path is not None else False
+            enable_lora=True if self.peft_path is not None else False
         )
 
         self.tokenizer = self.llm.get_tokenizer()
@@ -349,9 +353,6 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
         converted_token = self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
         if converted_token is not None:
             terminators.append(converted_token)
-
-        if operating_mode == "thinking":
-            self.thinking_end_token=self.tokenizer.convert_tokens_to_ids("</think>")
         
         self.sampling_params = SamplingParams(
             temperature=0,
@@ -380,19 +381,19 @@ def apply_chat_template(self, doc, claim):
                 {"role": "system", "content": self.system_prompt},
                 {"role": "user", "content": user_prompt},
             ]
-            text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)
+            text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, **self.extra_chat_template_kwargs)
         elif self.operating_mode=="gg_hybrid":
             documents = [{'doc_id':'0', 'text': doc}]
             messages = [{"role": "assistant", "content": claim}]
-            guardian_config = {"criteria_id": "groundedness"}
-            text = self.tokenizer.apply_chat_template(messages, guardian_config = guardian_config, documents=documents, think=True, tokenize=False, add_generation_prompt=True)
+            text = self.tokenizer.apply_chat_template(messages, documents=documents, add_generation_prompt=True, tokenize=False, **self.extra_chat_template_kwargs)
         elif self.operating_mode=="thinking":
             user_prompt = self.user_prompt.replace("[DOCUMENT]", doc).replace("[CLAIM]", claim)
             message = [
                 {"role": "system", "content": self.system_prompt},
                 {"role": "user", "content": user_prompt},
             ]
-            text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, enable_thinking=True)
+            text = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False, **self.extra_chat_template_kwargs)
+
         return text
 
     
@@ -422,32 +423,31 @@ def get_support_prob_thinking(self, response):
         """probs from vllm inference"""
         import math
         support_prob = 0
-        start_response_index = 0
+        start_response_index = -1
+
+        completion = response.outputs[0]
 
         try:
-            thinking_token_index = response.outputs[0].token_ids.index(self.thinking_end_token) + 1
+            if self.thinking_end_token in completion.token_ids:
+                thinking_token_index = completion.token_ids.index(self.thinking_end_token) + 1
 
-            decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token
+                decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token
 
-            while("\n" in decoded_token and thinking_token_index < len(response.outputs[0].token_ids) - 1):
-                thinking_token_index += 1
-                decoded_token = next(iter(response.outputs[0].logprobs[thinking_token_index].values())).decoded_token
+                while("\n" in decoded_token and thinking_token_index < len(completion.token_ids) - 1):
+                    thinking_token_index += 1
+                    decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token
 
-            if thinking_token_index < len(response.outputs[0].token_ids):
-                start_response_index = thinking_token_index
+                if thinking_token_index < len(completion.token_ids):
+                    start_response_index = thinking_token_index
 
-            for token_prob in response.outputs[0].logprobs[start_response_index].values():
+            for token_prob in completion.logprobs[start_response_index].values():
                 decoded_token = token_prob.decoded_token
                 if decoded_token.lower() == 'yes': 
                     support_prob += math.exp(token_prob.logprob)
         except Exception as e:
             print("Error:", e)
+            support_prob = random.random()
             
-            for token_prob in response.outputs[0].logprobs[-1].values():
-                decoded_token = token_prob.decoded_token
-                if decoded_token.lower() == 'yes': 
-                    support_prob += math.exp(token_prob.logprob)
-        
         return support_prob
 
 
@@ -527,7 +527,6 @@ def score(self, docs: List[str], claims: List[str], chunk_size=None) -> List[flo
                 all_prompts, 
                 self.sampling_params)
 
-
         if self.operating_mode=="bespoke":
             probs_per_chunk_sentence = [self.get_support_prob(responses[idx]) for idx in range(len(responses))]
         elif self.operating_mode=="gg_hybrid":
diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index 65d2251..9140123 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -6,7 +6,7 @@
 
 
 class MiniCheck:
-    def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_rank=16, operating_mode="bespoke", max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None:
+    def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_rank=16, operating_mode="bespoke", think_end_token=None, extra_chat_template_kwargs=None, max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False, bypass_model_check=False) -> None:
 
         '''
         Parameters:
@@ -20,11 +20,32 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r
             - 'Granite-Guardian-3.3-8B'
             Note: 'Bespoke-MiniCheck-7B' is the most performant fact-checking model in the MiniCheck series.
 
-        peft_path : str optional (default=None)
-            Path to the PEFT adapter
+        peft_path : str, optional (default=None)
+            Path to the LLM PEFT adapter
+                - 'Bespoke-MiniCheck-7B'
+                    peft_path: None
+                - 'Granite-Guardian-3.3-8B'
+                    peft_path: None
 
-        max_lora_rank : int optional (default=16)
+        max_lora_rank : int, optional (default=16)
             Maximum LoRA Adapter Rank to load
+
+        operating_mode : str, optional (default='bespoke')
+            LLM model support probability operating mode
+            Preset models use their corresponding operating mode, i.e:
+                - 'Bespoke-MiniCheck-7B'
+                    Operating Mode: 'bespoke'
+                - 'Granite-Guardian-3.3-8B'
+                    Operating Mode: 'gg_hybrid'
+            Extra operating mode:
+                - 'thinking' uses the first logprobs after the thinking delimiter as support probability
+
+        think_end_token : str, optional (default=None)
+            Token used to represent the end of the thinking traces of LLM models
+
+        extra_chat_template_kwargs : dict, optional (default=None)
+            Extra kwargs to forward to the chat template
+            Preset models use their corresponding chat template kwargs
         
         max_model_len : int or None, optional (default=None)
             The maximum input length for the model. If None, we use the following default values. 
@@ -38,8 +59,6 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r
                     Default: 32768
                 - 'Granite-Guardian-3.3-8B'
                     Default: 32768
-                - 'TBD'
-                    Default: 11468
             For 'Bespoke-MiniCheck-7B', if you have a GPU with low VRAM and get the following:
                 "ValueError: The model's max seq len (XXXX) is larger than the maximum number of 
                 tokens that can be stored in KV cache (YYYY). Try increasing `gpu_memory_utilization` 
@@ -65,6 +84,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r
             Whether to enable prefix caching for 'Bespoke-MiniCheck-7B'. This can improve performance
             when using the same document chunk to fact-check different claims.
 
+        bypass_model_check : bool, optional (default=False)
+            Bypass the model name check so the benchmark can be run on other models with custom configurations
+
         Note:
         (1) MiniCheck-Flan-T5-Large (770M) is the best fack-checking model with size < 1B and reaches GPT-4 performance.
         (2) Bespoke-MiniCheck-7B is the most performant fact-checking model in the MiniCheck series AND
@@ -85,6 +107,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r
                 "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B', 'Granite-Guardian-3.3-8B']"
         
         if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']:
+            if operating_mode != 'bespoke' or extra_chat_template_kwargs is not None or peft_path is not None or think_end_token is not None:  # 'bespoke' is the signature default
+                print(f"Forcing default preset configuration for model {model_name}")
+
             self.model = Inferencer(
                 model_name=model_name, 
                 batch_size=batch_size, 
@@ -92,6 +117,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r
                 cache_dir=cache_dir
             )
         elif model_name == 'Bespoke-MiniCheck-7B':
+            if operating_mode != 'bespoke' or extra_chat_template_kwargs is not None or peft_path is not None or think_end_token is not None:
+                print("Forcing default preset configuration for model Bespoke-MiniCheck-7B")
+
             self.model = LLMCheck(
                 model_id=model_name,
                 tensor_parallel_size=tensor_parallel_size,
@@ -101,6 +129,9 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r
                 max_model_len=max_model_len
             )
         elif model_name == 'Granite-Guardian-3.3-8B':
+            if operating_mode != 'gg_hybrid' or extra_chat_template_kwargs is not None or peft_path is not None or think_end_token is not None:
+                print("Forcing default preset configuration for model Granite Guardian 3.3")
+
             if not max_tokens or max_tokens<2048:
                 print("For Granite Guardian 3.3 - fixing the max_tokens to be 2048")
                 max_tokens=2048
@@ -113,26 +144,22 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', peft_path=None, max_lora_r
                 enable_prefix_caching=enable_prefix_caching,
                 max_model_len=max_model_len
             )
-        elif model_name == 'TBD':
-            self.model = LLMCheck(
-                model_id=model_name,
-                tensor_parallel_size=tensor_parallel_size,
-                max_tokens=max_tokens,
-                cache_dir=cache_dir,
-                enable_prefix_caching=enable_prefix_caching,
-                max_model_len=max_model_len,
-            )
         else:
+            if operating_mode == "thinking":
+                assert think_end_token is not None, "'thinking' operating mode requires to specify a 'think_end_token'"
+
             self.model = LLMCheck(
                 model_id=model_name,
                 peft_path=peft_path,
                 max_lora_rank=max_lora_rank,
                 operating_mode=operating_mode,
+                think_end_token=think_end_token,
+                extra_chat_template_kwargs=extra_chat_template_kwargs,
                 tensor_parallel_size=tensor_parallel_size,
                 max_tokens=max_tokens,
                 cache_dir=cache_dir,
                 enable_prefix_caching=enable_prefix_caching,
-                max_model_len=max_model_len,
+                max_model_len=max_model_len
             )
         
 

From 7df8fc158274693fd37401c55bbba7ed1b8b4a46 Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 19 May 2026 16:55:26 -0400
Subject: [PATCH 13/17] Init peft_path

---
 minicheck/inference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index f60224e..69cdb56 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -300,11 +300,12 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
             self.model_id = model_id
             self.operating_mode=operating_mode
             self.extra_chat_template_kwargs = extra_chat_template_kwargs
-            self.peft_path = peft_path
 
             if operating_mode == "thinking":
                 self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token)
 
+        self.peft_path = peft_path
+
         self.tensor_parallel_size = tensor_parallel_size
         self.max_tokens = max_tokens
         self.max_model_len = 32768 if max_model_len is None else max_model_len # max input length (prompt + doc)

From ecadbfd2be473598c47eaf5c4cc986f6a99d2cce Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 19 May 2026 17:00:09 -0400
Subject: [PATCH 14/17] Add default to bespoke

---
 minicheck/inference.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 69cdb56..6d11db2 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -288,6 +288,8 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
         if model_id == 'Bespoke-MiniCheck-7B':
             self.model_id = 'bespokelabs/Bespoke-MiniCheck-7B'
             self.operating_mode="bespoke"
+
+            self.extra_chat_template_kwargs = {}
         elif model_id == 'Granite-Guardian-3.3-8B':
             self.model_id = 'ibm-granite/granite-guardian-3.3-8b'
             self.operating_mode="gg_hybrid"

From df3d5545a7a31622543520e5ed5ca1202e3034da Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 19 May 2026 17:09:43 -0400
Subject: [PATCH 15/17] Fix kwargs init

---
 minicheck/inference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 6d11db2..575353c 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -301,7 +301,8 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
         else:
             self.model_id = model_id
             self.operating_mode=operating_mode
-            self.extra_chat_template_kwargs = extra_chat_template_kwargs
+
+            self.extra_chat_template_kwargs = extra_chat_template_kwargs if extra_chat_template_kwargs is not None else {}
 
             if operating_mode == "thinking":
                 self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token)

From 011727a4fee73cde25930102de41e604d4b12490 Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Tue, 19 May 2026 17:46:36 -0400
Subject: [PATCH 16/17] Update thinking end token

---
 minicheck/inference.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 575353c..2ff6068 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -304,9 +304,6 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
 
             self.extra_chat_template_kwargs = extra_chat_template_kwargs if extra_chat_template_kwargs is not None else {}
 
-            if operating_mode == "thinking":
-                self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token)
-
         self.peft_path = peft_path
 
         self.tensor_parallel_size = tensor_parallel_size
@@ -355,8 +352,12 @@ def __init__(self, model_id, peft_path=None, max_lora_rank=16, operating_mode="b
             self.tokenizer.eos_token_id,
         ]
         converted_token = self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+
         if converted_token is not None:
             terminators.append(converted_token)
+
+        if operating_mode == "thinking":
+            self.thinking_end_token=self.tokenizer.convert_tokens_to_ids(think_end_token)
         
         self.sampling_params = SamplingParams(
             temperature=0,

From aef4a8e717d6cdbd052aec5f94a45cdcbdb85b74 Mon Sep 17 00:00:00 2001
From: marcandrelarochelle <marcandrelarochelle1820@gmail.com>
Date: Mon, 25 May 2026 12:04:29 -0400
Subject: [PATCH 17/17] Max token index

---
 minicheck/inference.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/minicheck/inference.py b/minicheck/inference.py
index 2ff6068..8c3c046 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -434,15 +434,16 @@ def get_support_prob_thinking(self, response):
 
         try:
             if self.thinking_end_token in completion.token_ids:
+                max_token_index =  len(completion.token_ids) - 1
                 thinking_token_index = completion.token_ids.index(self.thinking_end_token) + 1
 
                 decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token
 
-                while("\n" in decoded_token and thinking_token_index < len(completion.token_ids) - 1):
+                while("\n" in decoded_token and thinking_token_index < max_token_index):  # stop at the last generated token
+                    thinking_token_index += 1
+                    decoded_token = next(iter(completion.logprobs[thinking_token_index].values())).decoded_token
 
-                if thinking_token_index < len(completion.token_ids):
+                if thinking_token_index <= max_token_index:
                     start_response_index = thinking_token_index
 
             for token_prob in completion.logprobs[start_response_index].values():