Skip to content

Commit ce463f7

Browse files
authored
Fix #26: Missing checking <ref/> in XML config (#41)
In former times, the reference checking was done with the `global-check-ref-list.xsl` stylesheet. Although this works, it may be easier and can be customized further to implement it in pure Python code. This change allows to check the stitchfile against any <ref/> which point to a non-existing target. TODO: Make it more async?
1 parent 2cd1421 commit ce463f7

8 files changed

Lines changed: 659 additions & 115 deletions

File tree

changelog.d/41.bugfix.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix #26 and add missing checks for references in stitchfile

src/docbuild/cli/cmd_validate/process.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import asyncio
44
from collections.abc import Iterator
5+
from datetime import date
56
import logging
67
from pathlib import Path
78
import tempfile
@@ -292,7 +293,15 @@ async def process(
292293
if successful_files_paths:
293294
try:
294295
log.info('Performing stitch-file validation...')
295-
await create_stitchfile(successful_files_paths)
296+
cachedir = paths.get('base_server_cache_dir', None)
297+
tree = await create_stitchfile(successful_files_paths)
298+
if cachedir is not None:
299+
stitchfile = (
300+
Path(cachedir) / f'stitchfile-{date.today().isoformat()}.xml'
301+
)
302+
stitchfile.write_text(etree.tostring(tree, encoding='unicode'))
303+
log.debug("Wrote stitchfile to %s", str(stitchfile))
304+
296305
log.info('Stitch-file validation successful.')
297306

298307
except ValueError as e:
@@ -309,7 +318,7 @@ async def process(
309318
failed_part = f'[red]{failed_files} file(s)[/red]'
310319
summary_msg = f'{successful_part} successfully validated, {failed_part} failed.'
311320

312-
if context.verbose > 0:
321+
if context.verbose > 0: # pragma: no cover
313322
console_out.print(f'Result: {summary_msg}')
314323

315324
final_success = (failed_files == 0) and stitch_success

src/docbuild/config/xml/data/global-check-ref-list.xsl

Lines changed: 0 additions & 86 deletions
This file was deleted.
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
"""Check <ref/> links on a stitched XML configuration file."""
2+
3+
from lxml import etree
4+
5+
6+
def check_ref_to_subdeliverable(
7+
ref: etree._Element,
8+
attrs: dict[str, str],
9+
) -> str | None:
10+
"""Check reference to a subdeliverable."""
11+
product = attrs.get('product')
12+
docset = attrs.get('docset')
13+
dc = attrs.get('dc')
14+
subdeliverable = attrs.get('subdeliverable')
15+
# Build the XPath
16+
xpath = ( # Use // to be more robust against nesting
17+
f'//product[@productid = {product!r}]/docset[@setid = {docset!r}]'
18+
f"/builddocs/language[@default = 'true' or @default = 1]"
19+
f"/deliverable[dc = '{dc}'][subdeliverable = '{subdeliverable}']"
20+
)
21+
22+
if not ref.getroottree().xpath(xpath):
23+
origin = ref.xpath(
24+
'concat(ancestor::product/@productid, "/", ancestor::docset/@setid)'
25+
)
26+
return (
27+
f'Failed reference from {origin!r} to '
28+
f'{product}/{docset}:{dc}#{subdeliverable}: '
29+
'Referenced subdeliverable does not exist.'
30+
)
31+
32+
33+
def check_ref_to_deliverable(ref: etree._Element, attrs: dict[str, str]) -> str | None:
34+
"""Check reference to a deliverable."""
35+
product = attrs.get('product')
36+
docset = attrs.get('docset')
37+
dc = attrs.get('dc')
38+
39+
# Build the XPath
40+
base_xpath = ( # Use // to be more robust against nesting
41+
f'//product[@productid = {product!r}]/docset[@setid = {docset!r}]'
42+
f"/builddocs/language[@default = 'true' or @default = 1]"
43+
f"/deliverable[dc = '{dc}']"
44+
)
45+
xpath_has_sub = f'{base_xpath}[subdeliverable]'
46+
xpath_hasnot_sub = f'{base_xpath}[not(subdeliverable)]'
47+
tree = ref.getroottree()
48+
49+
origin = ref.xpath(
50+
'concat(ancestor::product/@productid, "/", ancestor::docset/@setid)'
51+
)
52+
# A reference to a deliverable is only valid if it exists and
53+
# has no subdeliverables.
54+
if tree.xpath(xpath_hasnot_sub):
55+
return None # Valid reference found
56+
57+
# If we are here, it's an invalid reference. We need to find out why.
58+
if tree.xpath(xpath_has_sub):
59+
return (
60+
f'Failed reference from {origin!r} to {product}/{docset}:{dc}: '
61+
'Referenced deliverable has subdeliverables, '
62+
'you must choose a subdeliverable in your reference.'
63+
)
64+
else:
65+
return (
66+
f'Failed reference from {origin!r} to {product}/{docset}:{dc}: '
67+
'Referenced deliverable does not exist.'
68+
)
69+
70+
71+
def check_ref_to_link(ref: etree._Element, attrs: dict[str, str]) -> str | None:
72+
"""Check ref to an external link."""
73+
product = attrs.get('product')
74+
docset = attrs.get('docset')
75+
link = attrs.get('link')
76+
77+
xpath = ( # Use // to be more robust against nesting
78+
f'//product[@productid = {product!r}]/docset[@setid = {docset!r}]'
79+
f"/builddocs/language[@default = 'true' or @default = 1]"
80+
f"/external/link[@linkid = '{link}']"
81+
)
82+
83+
tree = ref.getroottree()
84+
if not tree.xpath(xpath):
85+
origin = ref.xpath(
86+
'concat(ancestor::product/@productid, "/", ancestor::docset/@setid)'
87+
)
88+
return (
89+
f'Failed reference from {origin!r} to {product}/{docset}@{link}: '
90+
'Referenced external link does not exist.'
91+
)
92+
93+
94+
def check_ref_to_docset(ref: etree._Element, attrs: dict[str, str]) -> str | None:
95+
"""Check reference to a docset."""
96+
product = attrs.get('product')
97+
docset = attrs.get('docset')
98+
xpath = ( # Use // to be more robust against nesting
99+
f'//product[@productid = {product!r}]/docset[@setid = {docset!r}]'
100+
)
101+
102+
tree = ref.getroottree()
103+
if not tree.xpath(xpath):
104+
origin = ref.xpath(
105+
'concat(ancestor::product/@productid, "/", ancestor::docset/@setid)'
106+
)
107+
return (
108+
f'Failed reference from {origin!r} to {product}/{docset}: '
109+
'Referenced docset does not exist.'
110+
)
111+
112+
113+
def check_ref_to_product(ref: etree._Element, attrs: dict[str, str]) -> str | None:
114+
"""Check reference to a product."""
115+
product = attrs.get('product')
116+
117+
xpath = f'//product[@productid = {product!r}]' # Use // to be more robust
118+
tree = ref.getroottree()
119+
if not tree.xpath(xpath):
120+
origin = ref.xpath(
121+
'concat(ancestor::product/@productid, "/", ancestor::docset/@setid)'
122+
)
123+
return (
124+
f'Failed reference from {origin!r} to {product}: '
125+
'Referenced product does not exist.'
126+
)
127+
128+
129+
def check_stitched_references(tree: etree._ElementTree) -> list[str]:
130+
"""Check <ref> elements for broken references.
131+
132+
This function is a Python equivalent of the
133+
``global-check-ref-list.xsl`` stylesheet.
134+
135+
:param tree: The tree of the stitched XML file.
136+
:returns: A list of error messages for any broken references found.
137+
"""
138+
errors: list[str] = []
139+
140+
# TODO: Make it concurrent using Python's `concurrent.futures` module.
141+
# Consider using ThreadPoolExecutor or ProcessPoolExecutor to parallelize
142+
# the processing of <ref> elements. Ensure thread safety when appending
143+
# to the `errors` list, and handle exceptions raised during concurrent
144+
# execution to avoid losing error messages.
145+
for ref in tree.iter('ref'):
146+
attrs = ref.attrib
147+
product = attrs.get('product')
148+
docset = attrs.get('docset')
149+
dc = attrs.get('dc')
150+
subdeliverable = attrs.get('subdeliverable')
151+
link = attrs.get('link')
152+
153+
result = None
154+
# Case 1: Reference to a subdeliverable
155+
if subdeliverable and dc and docset and product:
156+
result = check_ref_to_subdeliverable(ref, attrs)
157+
158+
# Case 2: Reference to a deliverable
159+
elif dc and docset and product:
160+
result = check_ref_to_deliverable(ref, attrs)
161+
162+
# Case 3: Reference to an external link
163+
elif link and docset and product:
164+
result = check_ref_to_link(ref, attrs)
165+
166+
# Case 4: Reference to a docset
167+
elif docset and product:
168+
result = check_ref_to_docset(ref, attrs)
169+
170+
# Case 5: Reference to a product
171+
elif product:
172+
result = check_ref_to_product(ref, attrs)
173+
174+
# Case 6: Invalid <ref> element
175+
else:
176+
origin = ref.xpath(
177+
'concat(ancestor::product/@productid, "/", ancestor::docset/@setid)'
178+
)
179+
result = (
180+
f'Reference failed in {origin!r}. '
181+
'This issue should have been caught by the RNC validation: '
182+
'This is a docserv-stitch bug.'
183+
)
184+
185+
if result:
186+
errors.append(result)
187+
188+
return errors

src/docbuild/config/xml/stitch.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
from copy import deepcopy
77
import importlib
88
import inspect
9+
import logging
910
from pathlib import Path
1011

1112
from lxml import etree
1213

1314
from ...constants import XMLDATADIR
15+
from .references import check_stitched_references
16+
17+
log = logging.getLogger(__name__)
1418

1519

1620
def load_check_functions() -> list[Callable]:
@@ -30,21 +34,36 @@ def load_check_functions() -> list[Callable]:
3034
# print(f'Memory usage: {process.memory_info().rss / 1024**2:.2f} MB')
3135

3236

37+
def check_stitchfile(tree: etree._Element | etree._ElementTree) -> bool:
38+
"""Check the stitchfile for unresolved references.
39+
40+
:param tree: The tree of the stitched XML file.
41+
:returns: True if no unresolved references are found, False otherwise.
42+
"""
43+
result = check_stitched_references(tree)
44+
45+
for err in result:
46+
log.error(err)
47+
48+
return not bool(result)
49+
50+
3351
async def create_stitchfile(
3452
xmlfiles: Sequence[str | Path],
3553
*,
3654
xmlparser: etree.XMLParser | None = None,
55+
with_ref_check: bool = True,
3756
) -> etree._ElementTree:
3857
"""Asynchronously create a stitch file from a list of XML files.
3958
4059
:param xmlfiles: A sequence of XML file paths to stitch together.
4160
:param xmlparser: An optional XML parser to use.
61+
:param with_ref_check: Whether to perform a reference check (=True) or not (=False).
4262
:return: all XML file stitched together into a
4363
:class:`lxml.etree.ElementTree` object.
4464
"""
4565
# TODO: Should we retrieve them from the config file?
46-
simplify_xslt = XMLDATADIR / 'simplify.xsl'
47-
reference_xslt = XMLDATADIR / 'global-check-ref-list.xsl'
66+
# simplify_xslt = XMLDATADIR / 'simplify.xsl'
4867

4968
# TODO: Do we need the MD5 hashes? If so maybe store them
5069

@@ -74,5 +93,9 @@ async def parse_and_xinclude(file_path: Path) -> etree._ElementTree:
7493
raise ValueError(f'Duplicate product IDs found: {", ".join(duplicates)}')
7594

7695
# Step 3: Check references
96+
if with_ref_check:
97+
result = check_stitchfile(docservconfig)
98+
if not result:
99+
raise ValueError('Unresolved references found in stitch file')
77100

78101
return etree.ElementTree(docservconfig)

0 commit comments

Comments
 (0)