Skip to content

Commit a69ae3e

Browse files
authored
Refactor JSON struct for robust metadata handling (#140)
* Create a Manifest Pydantic model to hold the respective JSON structure (=manifest.py) => This refactoring of mainly the store_productdocset_json function leverages Pydantic models to create a more robust, clear, and maintainable data pipeline. * Catch Pydantic validation errors & JSON decoding errors * Introduce `*_segment` methods in Doctype model * Adjust LifecycleFlag and lifecycle.py: * Fix type warnings * Use `__missing__` to make it possible to delegate a string to LifecycleFlag * Introduce `upython` in `devel/activate-aliases.sh` * Simplify DocumentFormat * Correct datemodified field, add tests * Remove field_validator for lifecycle This is already covered in #113
1 parent 4f27d39 commit a69ae3e

8 files changed

Lines changed: 460 additions & 104 deletions

File tree

changelog.d/140.refactor.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Refactor JSON structure for robust metadata handling in ``docbuild.cli.cmd_metadata.metaprocess:store_productdocset_json``. Introduce Pydantic :class:`~docbuild.models.manifest.Manifest` model to encapsulate document metadata, enhancing validation and serialization.

devel/activate-aliases.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
# For testing:
55
alias upytest="uv run --frozen pytest"
66

7+
# General Python command
8+
alias upython="uv run --frozen python"
9+
710
# For the interactive Python shell with the project's environment:
811
alias uipython="uv run --frozen ipython --ipython-dir=.ipython"
912

src/docbuild/cli/cmd_metadata/metaprocess.py

Lines changed: 34 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@
99
from typing import Any
1010

1111
from lxml import etree
12+
from pydantic import ValidationError
1213
from rich.console import Console
1314

1415
from ...config.xml.stitch import create_stitchfile
1516
from ...constants import DEFAULT_DELIVERABLES
1617
from ...models.deliverable import Deliverable
1718
from ...models.doctype import Doctype
19+
from ...models.manifest import Document, Manifest
1820
from ...utils.contextmgr import PersistentOnErrorTemporaryDirectory, edit_json
1921
from ...utils.git import ManagedGitRepo
2022
from ..context import DocBuildContext
@@ -338,76 +340,51 @@ def store_productdocset_json(
338340
# files: list[Path]
339341
product = doctype.product.value
340342
stdout.print(f" > Processed group: {doctype} / {docset}")
341-
# TODO: This XPath should be done in the Doctype model
342-
# For the time being, it doesn't add to the coverage
343-
productxpath = "./product"
344-
if product != "*": # pragma: no cover
345-
productxpath += f'[@productid="{product}"]'
346-
343+
# The XPath logic is encapsulated within the Doctype model
344+
productxpath = f"./{doctype.product_xpath_segment()}"
347345
productnode = stitchnode.find(productxpath)
348-
docsetxpath = "./docset"
349-
if docset != "*": # pragma: no cover
350-
docsetxpath += f'[@setid="{docset}"]'
351-
346+
docsetxpath = f"./{doctype.docset_xpath_segment(docset)}"
352347
docsetnode = productnode.find(docsetxpath)
353-
# TODO: end
354-
355-
# Create a new structure for each group of product/docset
356-
structure = {
357-
"productname": productnode.find("name").text,
358-
"acronym": product,
359-
"version": docset,
360-
"lifecycle": docsetnode.attrib.get("lifecycle"),
361-
"hide-productname": False,
362-
"descriptions": [
363-
# TODO
364-
# { "lang", "...",
365-
# "default": True|False,
366-
# "description": "..."
367-
# }
368-
],
369-
"categories": [
370-
# TODO
371-
# {
372-
# "categoryId": "...",
373-
# "rank": INT,
374-
# "translations": [
375-
# {
376-
# "lang", "...",
377-
# "default": True|False,
378-
# "title": "..."
379-
# }
380-
# ]
381-
],
382-
"documents": [], # Will be filled below
383-
"archives": [
384-
# TODO
385-
# {
386-
# "lang": "...",
387-
# "default": True|False,
388-
# "zip": "LANG/PRODUCT/DOCSET/PRODUCT-DOCSET-LANG.zip",
389-
# }
390-
],
391-
}
348+
349+
manifest = Manifest(
350+
productname=productnode.find("name").text,
351+
acronym=product,
352+
version=docset,
353+
lifecycle=docsetnode.attrib.get("lifecycle") or "",
354+
# * hide-productname is False by default in the Manifest model
355+
# * descriptions, categories, archives are empty lists by default
356+
)
357+
392358
for f in files:
393359
stdout.print(f" {f}")
394360
try:
395-
with (meta_cache_dir / f).open() as fh:
396-
doc = json.load(fh)
397-
if not doc:
398-
console_err.print(f"Warning: Empty metadata file {f}")
361+
with (meta_cache_dir / f).open(encoding="utf-8") as fh:
362+
loaded_doc_data = json.load(fh)
363+
if not loaded_doc_data:
364+
log.warning("Empty metadata file %s", f)
399365
continue
366+
doc_model = Document.model_validate(loaded_doc_data)
367+
manifest.documents.append(doc_model)
368+
369+
except json.JSONDecodeError as e:
370+
log.error("Error decoding metadata file %s: %s", f, e)
371+
continue
400372

401-
structure["documents"].extend(doc.get("docs", []))
373+
except ValidationError as e:
374+
log.error("Error validating metadata file %s: %s", f, e)
375+
continue
402376

403377
except Exception as e:
404-
console_err.print(f"Error reading metadata file {f}: {e}")
378+
log.error("Error reading metadata file %s: %s", f, e)
379+
continue
405380

406381
# stdout.print(json.dumps(structure, indent=2), markup=True)
407382
jsondir = meta_cache_dir / product
408383
jsondir.mkdir(parents=True, exist_ok=True)
409-
jsonfile = jsondir / f"{docset}.json"
410-
jsonfile.write_text(json.dumps(structure, indent=2))
384+
jsonfile = (
385+
jsondir / f"{docset}.json"
386+
) # e.g., /path/to/cache/product_id/docset_id.json
387+
jsonfile.write_text(manifest.model_dump_json(indent=2, by_alias=True))
411388
log.info(
412389
"Wrote merged metadata JSON for %s/%s => %s", product, docset, jsonfile
413390
)

src/docbuild/models/doctype.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ class Doctype(BaseModel):
100100
)
101101

102102
# dunder methods
103-
def __eq__(self, other: object) -> bool:
103+
def __eq__(self: Self, other: object) -> bool:
104104
"""Check equality with another Doctype, ignoring order in docset/langs."""
105105
if not isinstance(other, Doctype):
106106
return NotImplemented
@@ -112,7 +112,7 @@ def __eq__(self, other: object) -> bool:
112112
and set(self.langs) == set(other.langs)
113113
)
114114

115-
def __lt__(self, other: object) -> bool:
115+
def __lt__(self: Self, other: object) -> bool:
116116
"""Check if this Doctype is less than another Doctype."""
117117
if not isinstance(other, Doctype):
118118
return NotImplemented
@@ -125,13 +125,13 @@ def __lt__(self, other: object) -> bool:
125125
self.langs, # we rely on sorted languages
126126
) < (other.product, other.lifecycle, other.docset, other.langs)
127127

128-
def __str__(self) -> str:
128+
def __str__(self: Self) -> str:
129129
"""Implement str(self)."""
130130
langs_str = ",".join(lang.language for lang in self.langs)
131131
docset_str = ",".join(self.docset)
132132
return f"{self.product.value}/{docset_str}@{self.lifecycle.name}/{langs_str}"
133133

134-
def __repr__(self) -> str:
134+
def __repr__(self: Self) -> str:
135135
"""Implement repr(self)."""
136136
langs_str = ",".join(lang.language for lang in self.langs)
137137
docset_str = ",".join(self.docset)
@@ -143,7 +143,7 @@ def __repr__(self) -> str:
143143
f")"
144144
)
145145

146-
def __contains__(self, other: "Doctype") -> bool:
146+
def __contains__(self: Self, other: "Doctype") -> bool:
147147
"""Return if bool(other in self).
148148
149149
Every part of a Doctype is compared element-wise.
@@ -160,7 +160,7 @@ def __contains__(self, other: "Doctype") -> bool:
160160
],
161161
)
162162

163-
def __hash__(self) -> int:
163+
def __hash__(self: Self) -> int:
164164
"""Implement hash(self)."""
165165
return hash(
166166
(
@@ -183,21 +183,9 @@ def coerce_docset(cls, value: str | list[str]) -> list[str]:
183183
"""Convert a string into a list."""
184184
return sorted(value.split(",")) if isinstance(value, str) else sorted(value)
185185

186-
@field_validator("lifecycle", mode="before")
187-
@classmethod
188-
def coerce_lifecycle(cls, value: str | LifecycleFlag) -> LifecycleFlag:
189-
"""Convert a string into a LifecycleFlag."""
190-
# value = "" if value is None else value
191-
if isinstance(value, str):
192-
# Delegate it to the LifecycleFlag to deal with
193-
# the correct parsing and validation
194-
lifecycles = LifecycleFlag.from_str(value)
195-
return lifecycles
196-
return LifecycleFlag(value)
197-
198186
@field_validator("langs", mode="before")
199187
@classmethod
200-
def coerce_langs(cls, value: str | list[str | LanguageCode]) -> list[LanguageCode]:
188+
def coerce_langs(cls: type["Doctype"], value: str | list[str | LanguageCode]) -> list[LanguageCode]:
201189
"""Convert a comma-separated string or a list of strings into LanguageCode."""
202190
# Allow list of strings or Language enums
203191
if isinstance(value, str):
@@ -210,7 +198,7 @@ def coerce_langs(cls, value: str | list[str | LanguageCode]) -> list[LanguageCod
210198
)
211199

212200
@classmethod
213-
def from_str(cls, doctype_str: str) -> Self:
201+
def from_str(cls: type["Doctype"], doctype_str: str) -> Self:
214202
"""Parse a string that adheres to the doctype format."""
215203
match = cls._DOCTYPE_REGEX.match(doctype_str)
216204

@@ -229,7 +217,7 @@ def from_str(cls, doctype_str: str) -> Self:
229217
langs=langs,
230218
)
231219

232-
def xpath(self) -> str:
220+
def xpath(self: Self) -> str:
233221
"""Return an XPath expression for this Doctype to find all deliverables.
234222
235223
>>> result = Doctype.from_str("sles/15-SP6@supported/en-us,de-de").xpath()
@@ -274,3 +262,21 @@ def xpath(self) -> str:
274262
language = f"language[{language}]"
275263

276264
return f"{product}/{docset}/builddocs/{language}"
265+
266+
def product_xpath_segment(self: Self) -> str:
267+
"""Return the XPath segment for the product node.
268+
269+
Example: "product[@productid='sles']" or "product"
270+
"""
271+
if self.product != Product.ALL:
272+
return f"product[@productid={self.product.value!r}]"
273+
return "product"
274+
275+
def docset_xpath_segment(self: Self, docset: str) -> str:
276+
"""Return the XPath segment for the docset node.
277+
278+
Example: "docset[@setid='15-SP6']" or "docset"
279+
"""
280+
if docset != "*":
281+
return f"docset[@setid={docset!r}]"
282+
return "docset"

src/docbuild/models/lifecycle.py

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,19 @@ class LifecycleFlag(Flag):
2525
unsupported = auto()
2626
"""Unsupported lifecycle state."""
2727

28-
# NOTE: Putting a compiled regex (or other helper) as a class
29-
# variable on an Enum/Flag is error-prone:
30-
# the Enum metaclass treats class attributes specially and may
31-
# convert them into members or otherwise interfere.
32-
#
33-
# Solution: This class variable will be attached after class creation.
34-
# _SEPARATOR = re.compile(r'[|,]') # Static class variable
28+
@classmethod
29+
def _missing_(cls: type[Self], value: object) -> Self | None:
30+
"""Handle missing values by attempting to parse strings."""
31+
if isinstance(value, str):
32+
try:
33+
return cls.from_str(value)
34+
except ValueError:
35+
# Let the default error handling take over for invalid strings
36+
return None
37+
return super()._missing_(value)
3538

3639
@classmethod
37-
def from_str(cls: "LifecycleFlag", value: str) -> "LifecycleFlag":
40+
def from_str(cls: type[Self], value: str) -> Self:
3841
"""Convert a string to a LifecycleFlag object.
3942
4043
The string accepts the values 'supported', 'beta', 'hidden',
@@ -53,7 +56,7 @@ def from_str(cls: "LifecycleFlag", value: str) -> "LifecycleFlag":
5356
<LifecycleFlag.unknown: 0>
5457
5558
"""
56-
separator = cls._SEPARATOR # will exist after we attach it
59+
separator = cls._SEPARATOR # type: ignore[attr-defined]
5760
try:
5861
flag = cls(0) # Start with an empty flag
5962
parts = [v.strip() for v in separator.split(value) if v.strip()]
@@ -92,4 +95,40 @@ def __contains__(self: Self, other: str | Flag) -> bool:
9295

9396

9497
# attach after class creation so EnumMeta doesn't touch it
95-
LifecycleFlag._SEPARATOR: ClassVar[re.Pattern] = re.compile(r"[|,]")
98+
# NOTE: Putting a compiled regex (or other helper) as a class
99+
# variable on an Enum/Flag is error-prone:
100+
# the Enum metaclass treats class attributes specially and may convert them into members.
101+
LifecycleFlag._SEPARATOR: ClassVar[re.Pattern] = re.compile(r"[|,]") # type: ignore[attr-defined]
102+
103+
if __name__ == "__main__": # pragma: nocover
104+
from rich import print # noqa: A004
105+
106+
# Example usage
107+
lc = LifecycleFlag("supported,beta")
108+
print(f"{lc=}")
109+
print(f"{lc.name=}")
110+
print(f"{lc.value=}")
111+
print(f"{lc.supported=}")
112+
print(f"{lc.beta=}")
113+
114+
print("=" * 20)
115+
116+
lc = LifecycleFlag.from_str("supported,beta")
117+
print(f"{lc=}")
118+
print(f"{lc.name=}")
119+
print(f"{lc.value=}")
120+
print(f"{lc.supported=}")
121+
print(f"{lc.beta=}")
122+
123+
print("=" * 20)
124+
125+
lc = LifecycleFlag.from_str("supported|beta")
126+
print(f"{lc=}")
127+
print(f"{lc.name=}")
128+
print(f"{lc.value=}")
129+
print(f"{lc.supported=}")
130+
print(f"{lc.beta=}")
131+
132+
print("=" * 20)
133+
print(f"{dir(lc)=}")
134+
# print(f"{lc.foo=}")

0 commit comments

Comments
 (0)