Skip to content

Commit e7bc690

Browse files
perf: cache plugin_names.json with lru_cache for O(1) lookups
- Add _load_plugin_names() with @lru_cache to read the file once at first access
- Replace the O(n) linear scan with a frozenset O(1) membership check
- Extract a shared _tokenize_plugin_name() helper (removes the duplicated tokenize())
- Add 15 unit tests covering caching, tokenization, and validation

Fixes #352
1 parent 0c7a866 commit e7bc690

2 files changed

Lines changed: 228 additions & 19 deletions

File tree

chatbot-core/api/tools/utils.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Utilities for the tools package.
33
"""
44

5+
import functools
56
import json
67
import os
78
import re
@@ -193,29 +194,45 @@ def extract_chunks_content(chunks: List[Dict], logger) -> str:
193194
else retrieval_config["empty_context_message"]
194195
)
195196

197+
def _tokenize_plugin_name(name: str) -> str:
198+
"""Normalize a plugin name for case/separator-insensitive comparison."""
199+
return name.replace('-', '').replace(' ', '').lower()
200+
201+
202+
@functools.lru_cache(maxsize=1)
def _load_plugin_names() -> frozenset:
    """
    Load and cache the tokenized set of known plugin names.

    The backing JSON file is static data that never changes at runtime,
    so it is parsed on the first call only; lru_cache serves every later
    call from memory, giving O(1) membership checks.

    Returns:
        frozenset: Tokenized plugin names.
    """
    data_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..", "data", "raw", "plugin_names.json",
    )
    with open(data_path, "r", encoding="utf-8") as handle:
        raw_names = json.load(handle)
    return frozenset(map(_tokenize_plugin_name, raw_names))
220+
221+
196222
def is_valid_plugin(plugin_name: str) -> bool:
    """
    Check whether the given plugin name exists in the list of known plugins.

    Matching ignores case, hyphens and spaces. Uses the cached frozenset
    from _load_plugin_names() for an O(1) membership check instead of
    re-reading the JSON file and scanning linearly on every call.

    Args:
        plugin_name (str): The name of the plugin to validate.

    Returns:
        bool: True if the plugin exists in the list, False otherwise.
    """
    known_names = _load_plugin_names()
    return _tokenize_plugin_name(plugin_name) in known_names
219236

220237
def filter_retrieved_data(
221238
semantic_data: List[Dict],
@@ -234,14 +251,12 @@ def filter_retrieved_data(
234251
Returns:
235252
Tuple[List[Dict], List[Dict]]: Filtered semantic and keyword data.
236253
"""
237-
def tokenize(item: str) -> str:
238-
item = item.replace('-', '')
239-
return item.replace(' ', '').lower()
240-
241254
semantic_filtered_data = [item for item in semantic_data
242-
if tokenize(item["metadata"]["title"]) == tokenize(plugin_name)]
255+
if _tokenize_plugin_name(item["metadata"]["title"])
256+
== _tokenize_plugin_name(plugin_name)]
243257
keyword_filtered_data = [item for item in keyword_data
244-
if tokenize(item["metadata"]["title"]) == tokenize(plugin_name)]
258+
if _tokenize_plugin_name(item["metadata"]["title"])
259+
== _tokenize_plugin_name(plugin_name)]
245260

246261
return semantic_filtered_data, keyword_filtered_data
247262

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
"""Unit tests for plugin name caching and validation in tools/utils.py."""
2+
import json
3+
import unittest
4+
from unittest.mock import patch, mock_open
5+
6+
from api.tools.utils import (
7+
_tokenize_plugin_name,
8+
_load_plugin_names,
9+
is_valid_plugin,
10+
filter_retrieved_data,
11+
)
12+
13+
14+
class TestTokenizePluginName(unittest.TestCase):
    """Behavioral tests for the _tokenize_plugin_name helper."""

    def test_converts_to_lowercase(self):
        """Comparison is case-insensitive, so tokens are lowercased."""
        self.assertEqual(_tokenize_plugin_name("Git"), "git")

    def test_removes_hyphens(self):
        """Hyphens are dropped before comparison."""
        self.assertEqual(_tokenize_plugin_name("blue-ocean"), "blueocean")

    def test_removes_spaces(self):
        """Spaces are dropped before comparison."""
        self.assertEqual(_tokenize_plugin_name("Blue Ocean"), "blueocean")

    def test_removes_hyphens_and_spaces(self):
        """Hyphens and spaces may appear together; both are dropped."""
        self.assertEqual(_tokenize_plugin_name("my-cool plugin"), "mycoolplugin")

    def test_empty_string(self):
        """An empty name tokenizes to an empty string."""
        self.assertEqual(_tokenize_plugin_name(""), "")
42+
43+
44+
class TestLoadPluginNames(unittest.TestCase):
    """Tests for the cached _load_plugin_names function."""

    # Fake catalog shared by the read-path tests below (deduplicated from
    # the identical payload previously copy-pasted onto every test).
    _NAMES_JSON = json.dumps(["git", "Blue-Ocean", "credentials"])

    def setUp(self):
        """Clear lru_cache before each test to ensure isolation."""
        _load_plugin_names.cache_clear()

    def tearDown(self):
        """Clear lru_cache after each test."""
        _load_plugin_names.cache_clear()

    @patch("os.path.dirname", return_value="/fake/dir")
    def test_returns_frozenset(self, _mock_dir):
        """Result should be a frozenset for O(1) lookups."""
        with patch("builtins.open", mock_open(read_data=self._NAMES_JSON)):
            result = _load_plugin_names()
        self.assertIsInstance(result, frozenset)

    @patch("os.path.dirname", return_value="/fake/dir")
    def test_tokenizes_all_names(self, _mock_dir):
        """All names should be tokenized (lowercased, no hyphens/spaces)."""
        with patch("builtins.open", mock_open(read_data=self._NAMES_JSON)):
            result = _load_plugin_names()
        self.assertIn("git", result)
        self.assertIn("blueocean", result)
        self.assertIn("credentials", result)

    @patch("os.path.dirname", return_value="/fake/dir")
    def test_caching_reads_file_once(self, _mock_dir):
        """Calling _load_plugin_names twice should only open the file once."""
        # NOTE: the previous version also patched builtins.open via a
        # decorator here, which the inner `with patch` immediately shadowed;
        # the dead outer patch has been removed.
        with patch(
            "builtins.open", mock_open(read_data=json.dumps(["git"]))
        ) as mocked_file:
            _load_plugin_names()
            _load_plugin_names()
        mocked_file.assert_called_once()
90+
91+
92+
class TestIsValidPlugin(unittest.TestCase):
    """Tests for the is_valid_plugin function.

    The fake plugin catalog is installed once in setUp rather than via
    the identical four-line decorator stack previously copy-pasted onto
    every test method.
    """

    # Catalog used by every test in this class.
    _CATALOG = json.dumps(
        ["git", "blue-ocean", "credentials", "github-branch-source"]
    )

    def setUp(self):
        """Reset the cache and install the fake catalog for each test."""
        _load_plugin_names.cache_clear()
        self.addCleanup(_load_plugin_names.cache_clear)
        for patcher in (
            patch("builtins.open", mock_open(read_data=self._CATALOG)),
            patch("os.path.dirname", return_value="/fake/dir"),
        ):
            patcher.start()
            self.addCleanup(patcher.stop)

    def test_exact_match(self):
        """Exact plugin name should be valid."""
        self.assertTrue(is_valid_plugin("git"))

    def test_case_insensitive_match(self):
        """Plugin names should match case-insensitively."""
        self.assertTrue(is_valid_plugin("Git"))
        self.assertTrue(is_valid_plugin("GIT"))

    def test_hyphen_insensitive_match(self):
        """Hyphens should be ignored during matching."""
        self.assertTrue(is_valid_plugin("blue ocean"))
        self.assertTrue(is_valid_plugin("blueocean"))
        self.assertTrue(is_valid_plugin("Blue-Ocean"))

    def test_invalid_plugin_returns_false(self):
        """Non-existent plugin name should return False."""
        self.assertFalse(is_valid_plugin("nonexistent-plugin"))
        self.assertFalse(is_valid_plugin(""))
156+
157+
158+
class TestFilterRetrievedData(unittest.TestCase):
    """Tests for filter_retrieved_data using the shared tokenizer."""

    def test_filters_matching_entries(self):
        """Only entries whose title matches the plugin name should remain."""
        semantic_rows = [
            {"metadata": {"title": "blue-ocean"}, "chunk_text": "a"},
            {"metadata": {"title": "credentials"}, "chunk_text": "b"},
        ]
        keyword_rows = [
            {"metadata": {"title": "Blue Ocean"}, "chunk_text": "c"},
            {"metadata": {"title": "git"}, "chunk_text": "d"},
        ]
        sem, kw = filter_retrieved_data(semantic_rows, keyword_rows, "blue-ocean")
        self.assertEqual([row["chunk_text"] for row in sem], ["a"])
        self.assertEqual([row["chunk_text"] for row in kw], ["c"])

    def test_returns_empty_when_no_match(self):
        """No results should be returned when nothing matches."""
        rows = [{"metadata": {"title": "git"}, "chunk_text": "x"}]
        sem, kw = filter_retrieved_data(rows, rows, "nonexistent")
        self.assertEqual(sem, [])
        self.assertEqual(kw, [])

    def test_empty_input(self):
        """Empty input lists should return empty lists."""
        sem, kw = filter_retrieved_data([], [], "git")
        self.assertEqual((sem, kw), ([], []))
191+
192+
193+
# Allow running this test module directly: `python test_utils.py`.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)