Skip to content

Commit 47c19ed

Browse files
perf: cache plugin_names.json with lru_cache for O(1) lookups
- Add load_plugin_names() with @lru_cache to read the file once at first access.
- Replace the O(n) linear scan with a frozenset O(1) membership check.
- Extract a shared tokenize_plugin_name() helper (removes the duplicated tokenize()).
- Add 10 tests: public API validation plus integration tests for cache behavior.

Fixes #352
1 parent 0c7a866 commit 47c19ed

2 files changed

Lines changed: 190 additions & 19 deletions

File tree

chatbot-core/api/tools/utils.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Utilities for the tools package.
33
"""
44

5+
import functools
56
import json
67
import os
78
import re
@@ -193,29 +194,45 @@ def extract_chunks_content(chunks: List[Dict], logger) -> str:
193194
else retrieval_config["empty_context_message"]
194195
)
195196

197+
# Translation table that deletes both separator characters ('-' and ' ').
# Hoisted to module level so it is built once, not on every call.
_SEPARATOR_TABLE = str.maketrans("", "", "- ")


def tokenize_plugin_name(name: str) -> str:
    """Normalize a plugin name for case/separator-insensitive comparison.

    Hyphens and spaces are stripped and the result is lower-cased, so
    "Blue-Ocean", "blue ocean" and "blueocean" all map to the same token.

    Args:
        name (str): The raw plugin name.

    Returns:
        str: The normalized (tokenized) plugin name.
    """
    # str.translate removes both separators in a single C-level pass,
    # instead of two chained str.replace() calls.
    return name.translate(_SEPARATOR_TABLE).lower()
200+
201+
202+
@functools.lru_cache(maxsize=1)
def load_plugin_names() -> frozenset:
    """
    Load and cache the set of known plugin names (tokenized) from disk.

    The JSON file is static data that never changes at runtime, so it is
    read once on first access and kept in memory for O(1) lookups.

    Returns:
        frozenset: A set of tokenized plugin names.
    """
    # plugin_names.json lives in ../data/raw relative to this module.
    tools_dir = os.path.dirname(os.path.abspath(__file__))
    names_path = os.path.join(tools_dir, "..", "data", "raw", "plugin_names.json")
    with open(names_path, "r", encoding="utf-8") as fh:
        raw_names = json.load(fh)
    return frozenset(tokenize_plugin_name(raw) for raw in raw_names)
220+
221+
196222
def is_valid_plugin(plugin_name: str) -> bool:
    """
    Checks whether the given plugin name exists in the list of known plugin names.

    Uses a cached frozenset for O(1) membership checks instead of
    re-reading from disk and performing a linear scan on every call.

    Args:
        plugin_name (str): The name of the plugin to validate.

    Returns:
        bool: True if the plugin exists in the list, False otherwise.
    """
    known_names = load_plugin_names()
    return tokenize_plugin_name(plugin_name) in known_names
219236

220237
def filter_retrieved_data(
221238
semantic_data: List[Dict],
@@ -234,14 +251,12 @@ def filter_retrieved_data(
234251
Returns:
235252
Tuple[List[Dict], List[Dict]]: Filtered semantic and keyword data.
236253
"""
237-
def tokenize(item: str) -> str:
238-
item = item.replace('-', '')
239-
return item.replace(' ', '').lower()
240-
241254
semantic_filtered_data = [item for item in semantic_data
242-
if tokenize(item["metadata"]["title"]) == tokenize(plugin_name)]
255+
if tokenize_plugin_name(item["metadata"]["title"])
256+
== tokenize_plugin_name(plugin_name)]
243257
keyword_filtered_data = [item for item in keyword_data
244-
if tokenize(item["metadata"]["title"]) == tokenize(plugin_name)]
258+
if tokenize_plugin_name(item["metadata"]["title"])
259+
== tokenize_plugin_name(plugin_name)]
245260

246261
return semantic_filtered_data, keyword_filtered_data
247262

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Unit tests for plugin name caching and validation in tools/utils.py."""
import json
import unittest
from unittest.mock import mock_open, patch

from api.tools.utils import (
    filter_retrieved_data,
    is_valid_plugin,
    load_plugin_names,
)

# A small, representative plugin list serialized exactly as the real
# plugin_names.json stores it.
SAMPLE_PLUGINS = json.dumps(
    ["git", "blue-ocean", "credentials", "github-branch-source"]
)
15+
16+
17+
class TestIsValidPlugin(unittest.TestCase):
    """Tests for the is_valid_plugin function."""

    def setUp(self):
        # Start every test from a cold cache so the mocked file is read.
        load_plugin_names.cache_clear()

    def tearDown(self):
        # Drop the mocked data so later tests never see it.
        load_plugin_names.cache_clear()

    def _patches(self):
        """Return (open_patch, dirname_patch) that serve the sample data."""
        return (
            patch("builtins.open", mock_open(read_data=SAMPLE_PLUGINS)),
            patch("os.path.dirname", return_value="/fake/dir"),
        )

    def test_exact_match(self):
        """Exact plugin name should be valid."""
        open_patch, dirname_patch = self._patches()
        with open_patch, dirname_patch:
            self.assertTrue(is_valid_plugin("git"))

    def test_case_insensitive_match(self):
        """Plugin names should match case-insensitively."""
        open_patch, dirname_patch = self._patches()
        with open_patch, dirname_patch:
            self.assertTrue(is_valid_plugin("Git"))
            self.assertTrue(is_valid_plugin("GIT"))

    def test_hyphen_insensitive_match(self):
        """Hyphens should be ignored during matching."""
        open_patch, dirname_patch = self._patches()
        with open_patch, dirname_patch:
            self.assertTrue(is_valid_plugin("blue ocean"))
            self.assertTrue(is_valid_plugin("blueocean"))
            self.assertTrue(is_valid_plugin("Blue-Ocean"))

    def test_invalid_plugin_returns_false(self):
        """Non-existent plugin name should return False."""
        open_patch, dirname_patch = self._patches()
        with open_patch, dirname_patch:
            self.assertFalse(is_valid_plugin("nonexistent-plugin"))
            self.assertFalse(is_valid_plugin(""))
65+
66+
67+
class TestFilterRetrievedData(unittest.TestCase):
    """Tests for filter_retrieved_data using the shared tokenizer."""

    def test_filters_matching_entries(self):
        """Only entries whose title matches the plugin name should remain."""
        semantic_rows = [
            {"metadata": {"title": "blue-ocean"}, "chunk_text": "a"},
            {"metadata": {"title": "credentials"}, "chunk_text": "b"},
        ]
        keyword_rows = [
            {"metadata": {"title": "Blue Ocean"}, "chunk_text": "c"},
            {"metadata": {"title": "git"}, "chunk_text": "d"},
        ]
        sem, kw = filter_retrieved_data(semantic_rows, keyword_rows, "blue-ocean")
        # Exactly one row survives in each list, and it is the matching one.
        self.assertEqual([row["chunk_text"] for row in sem], ["a"])
        self.assertEqual([row["chunk_text"] for row in kw], ["c"])

    def test_returns_empty_when_no_match(self):
        """No results should be returned when nothing matches."""
        rows = [{"metadata": {"title": "git"}, "chunk_text": "x"}]
        sem, kw = filter_retrieved_data(rows, rows, "nonexistent")
        self.assertEqual(len(sem), 0)
        self.assertEqual(len(kw), 0)

    def test_empty_input(self):
        """Empty input lists should return empty lists."""
        sem, kw = filter_retrieved_data([], [], "git")
        self.assertEqual(sem, [])
        self.assertEqual(kw, [])
100+
101+
102+
class TestPluginNameCacheIntegration(unittest.TestCase):
    """Integration tests verifying lru_cache behaviour through the public API.

    These tests confirm that plugin_names.json is read from disk exactly
    once, regardless of how many times public functions that depend on the
    cached data are called.
    """

    def setUp(self):
        load_plugin_names.cache_clear()

    def tearDown(self):
        load_plugin_names.cache_clear()

    @patch("os.path.dirname", return_value="/fake/dir")
    def test_multiple_is_valid_plugin_calls_read_file_once(self, _dirname):
        """Repeated is_valid_plugin() calls should hit the cache, not disk."""
        with patch("builtins.open", mock_open(read_data=SAMPLE_PLUGINS)) as opened:
            for candidate in ("git", "blue-ocean", "nonexistent"):
                is_valid_plugin(candidate)
            opened.assert_called_once()

    @patch("os.path.dirname", return_value="/fake/dir")
    def test_cache_shared_across_public_functions(self, _dirname):
        """is_valid_plugin and load_plugin_names should share the same cache."""
        with patch("builtins.open", mock_open(read_data=SAMPLE_PLUGINS)) as opened:
            # First access via is_valid_plugin populates the cache.
            is_valid_plugin("git")
            # A direct call must reuse the cached frozenset, not reopen the file.
            names = load_plugin_names()
            self.assertIsInstance(names, frozenset)
            opened.assert_called_once()

    @patch("os.path.dirname", return_value="/fake/dir")
    def test_cache_clear_forces_reload(self, _dirname):
        """After cache_clear(), the next call should re-read the file."""
        with patch("builtins.open", mock_open(read_data=SAMPLE_PLUGINS)) as opened:
            is_valid_plugin("git")
            self.assertEqual(opened.call_count, 1)

            load_plugin_names.cache_clear()

            is_valid_plugin("git")
            self.assertEqual(opened.call_count, 2)
153+
154+
155+
if __name__ == "__main__":
    # Allow running this test module directly with `python <file>`.
    unittest.main()

0 commit comments

Comments
 (0)