feat: retrival add openalex client#427
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces a new OpenAlex backend ('OpenAlexClient') for retrieval evaluation, along with corresponding documentation and unit tests. The review feedback highlights several key improvements: securing the API key transmission by passing it in request headers instead of query parameters (and updating tests accordingly), safely casting 'rate_limit' to a float to prevent type errors, adding robust type checks for API response parsing, and guarding against a potential division-by-zero error when parsing relevance scores.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| if self.search_type == "semantic" and rate_limit <= 0: | ||
| rate_limit = 1.0 | ||
| self.rate_limit = max(0.0, float(rate_limit)) |
There was a problem hiding this comment.
If rate_limit is passed as a string (e.g., from command-line arguments or configuration files), the comparison rate_limit <= 0 will raise a TypeError in Python 3. It is safer to convert rate_limit to a float first before performing any comparisons.
| if self.search_type == "semantic" and rate_limit <= 0: | |
| rate_limit = 1.0 | |
| self.rate_limit = max(0.0, float(rate_limit)) | |
| try: | |
| rate_limit_val = float(rate_limit) | |
| except (TypeError, ValueError): | |
| rate_limit_val = 0.0 | |
| if self.search_type == "semantic" and rate_limit_val <= 0: | |
| rate_limit_val = 1.0 | |
| self.rate_limit = max(0.0, rate_limit_val) |
| self._last_request_time = 0.0 | ||
| self._lock = threading.Lock() | ||
| self._session = self._init_session(max_retries, retry_backoff) |
There was a problem hiding this comment.
Passing sensitive API keys in query parameters is a security risk because query parameters are often logged by web servers, reverse proxies, and browser history. It is much more secure to pass the API key in the request headers. OpenAlex supports passing the API key via the api_key header.
self._last_request_time = 0.0
self._lock = threading.Lock()
self._session = self._init_session(max_retries, retry_backoff)
if self.api_key:
self._session.headers.update({"api_key": self.api_key})| params: dict[str, Any] = { | ||
| search_param: query, | ||
| "per_page": target, | ||
| "select": _DEFAULT_SELECT, | ||
| } | ||
| if self.api_key: | ||
| params["api_key"] = self.api_key | ||
| return params |
There was a problem hiding this comment.
Remove the api_key from the query parameters since it is now passed securely via request headers.
| params: dict[str, Any] = { | |
| search_param: query, | |
| "per_page": target, | |
| "select": _DEFAULT_SELECT, | |
| } | |
| if self.api_key: | |
| params["api_key"] = self.api_key | |
| return params | |
| return { | |
| search_param: query, | |
| "per_page": target, | |
| "select": _DEFAULT_SELECT, | |
| } |
| params = client._build_params("test query", 100) | ||
| assert params["search"] == "test query" | ||
| assert "search.semantic" not in params | ||
| assert params["per_page"] == 100 | ||
| assert params["api_key"] == "test-token" |
There was a problem hiding this comment.
Update the test assertions to verify that the API key is passed securely in the session headers instead of the query parameters.
| params = client._build_params("test query", 100) | |
| assert params["search"] == "test query" | |
| assert "search.semantic" not in params | |
| assert params["per_page"] == 100 | |
| assert params["api_key"] == "test-token" | |
| params = client._build_params("test query", 100) | |
| assert params["search"] == "test query" | |
| assert "search.semantic" not in params | |
| assert params["per_page"] == 100 | |
| assert client._session.headers.get("api_key") == "test-token" |
| params = client._build_params("test query", 100) | ||
| assert params["search.semantic"] == "test query" | ||
| assert "search" not in params | ||
| assert params["per_page"] == 50 | ||
| assert client.rate_limit == 1.0 |
There was a problem hiding this comment.
Update the test assertions to verify that the API key is passed securely in the session headers for semantic search.
| params = client._build_params("test query", 100) | |
| assert params["search.semantic"] == "test query" | |
| assert "search" not in params | |
| assert params["per_page"] == 50 | |
| assert client.rate_limit == 1.0 | |
| params = client._build_params("test query", 100) | |
| assert params["search.semantic"] == "test query" | |
| assert "search" not in params | |
| assert params["per_page"] == 50 | |
| assert client.rate_limit == 1.0 | |
| assert client._session.headers.get("api_key") == "test-token" |
| data = resp.json() | ||
| results = [ | ||
| self._parse_result(item, rank) | ||
| for rank, item in enumerate(data.get("results") or [], start=1) | ||
| if isinstance(item, dict) | ||
| ] |
There was a problem hiding this comment.
If the API returns an unexpected response structure (e.g., a list or a string instead of a dictionary, or if results is missing or not a list), direct access to data.get or enumerate will raise an exception. It is safer to perform type checks before processing the response.
data = resp.json()
results_list = data.get("results") if isinstance(data, dict) else None
if not isinstance(results_list, list):
results_list = []
results = [
self._parse_result(item, rank)
for rank, item in enumerate(results_list, start=1)
if isinstance(item, dict)
]| def _parse_score(item: dict[str, Any], rank: int) -> float: | ||
| value = item.get("relevance_score") | ||
| try: | ||
| return float(value) | ||
| except (TypeError, ValueError): | ||
| return 1.0 / rank |
There was a problem hiding this comment.
If rank is 0, 1.0 / rank will raise a ZeroDivisionError. Although the current search implementation starts ranking at 1, guarding against rank <= 0 is a good defensive programming practice to prevent potential runtime crashes if this method is called from other contexts.
| def _parse_score(item: dict[str, Any], rank: int) -> float: | |
| value = item.get("relevance_score") | |
| try: | |
| return float(value) | |
| except (TypeError, ValueError): | |
| return 1.0 / rank | |
| @staticmethod | |
| def _parse_score(item: dict[str, Any], rank: int) -> float: | |
| value = item.get("relevance_score") | |
| try: | |
| return float(value) | |
| except (TypeError, ValueError): | |
| return 1.0 / rank if rank > 0 else 0.0 |
No description provided.