From f9d223d65c2ec97b2bca202743937db2c3eca541 Mon Sep 17 00:00:00 2001 From: sushant-suse Date: Wed, 18 Feb 2026 21:18:22 +0530 Subject: [PATCH 1/7] feat #183: reach manifest parity with legacy JSON portal config --- src/docbuild/cli/cmd_metadata/metaprocess.py | 232 ++++++++++--------- src/docbuild/models/manifest.py | 6 + 2 files changed, 130 insertions(+), 108 deletions(-) diff --git a/src/docbuild/cli/cmd_metadata/metaprocess.py b/src/docbuild/cli/cmd_metadata/metaprocess.py index cc517b5d..6e98c825 100644 --- a/src/docbuild/cli/cmd_metadata/metaprocess.py +++ b/src/docbuild/cli/cmd_metadata/metaprocess.py @@ -54,33 +54,28 @@ def collect_files_flat( doctypes: Sequence[Doctype], basedir: Path | str, ) -> Generator[tuple[Doctype, str, list[Path]], Any, None]: - """Group files by (Product, Docset). - - Yields (Doctype, Docset, List[Path]) using a flattened iteration strategy. - """ basedir = Path(basedir) - # 1. FLATTEN THE GROUPS - # Create a stream of (Doctype, Docset) pairs. - # This removes the nested loop over doctypes and docsets. task_stream = ((dt, ds) for dt in doctypes for ds in dt.docset) - # 2. PROCESS EACH GROUP for dt, docset in task_stream: - # 3. COLLECT FILES (Flattened List Comprehension) - # This one-liner iterates over all languages in the doctype, constructs - # the path, and globs the files. It handles the 'LanguageCode' object correctly. + # SEARCH RECURSIVELY + all_files = list(basedir.rglob("DC-*")) + + # Case-insensitive filtering files = [ - f - for lang in dt.langs - for f in (basedir / lang.language / dt.product.value / docset).glob("DC-*") + f for f in all_files + if dt.product.value.lower() in [p.lower() for p in f.parts] + and docset.lower() in [p.lower() for p in f.parts] ] + + print(f"DEBUG: Filtered down to {len(files)} files for {dt.product.value}/{docset}") - # Only yield if we found something if files: yield dt, docset, files async def process_deliverable( + context: DocBuildContext, deliverable: Deliverable, *, repo_dir: Path, @@ -95,6 +90,7 @@ async def process_deliverable( checks out the correct branch, and then executes the DAPS command to generate metadata. + :param context: The DocBuildContext containing environment configuration. :param deliverable: The Deliverable object to process. :param repo_dir: The permanent repo path taken from the env config ``paths.repo_dir`` @@ -111,23 +107,21 @@ async def process_deliverable( """ log.info("> Processing deliverable: %s", deliverable.full_id) - meta_cache_dir = Path(meta_cache_dir) + # Tell the type-checker envconfig is definitely not None + if not context.envconfig: + log.error("Environment configuration is missing.") + return False + meta_cache_dir = Path(meta_cache_dir) bare_repo_path = repo_dir / deliverable.git.slug if not bare_repo_path.is_dir(): - log.error( - f"Bare repository not found for {deliverable.git.name} at {bare_repo_path}" - ) + log.error(f"Bare repository not found for {deliverable.git.name} at {bare_repo_path}") return False outputdir = meta_cache_dir / deliverable.relpath outputdir.mkdir(parents=True, exist_ok=True) - prefix = ( - f"{deliverable.productid}-{deliverable.docsetid}-" - f"{deliverable.lang}--{deliverable.dcfile}" - ) - + prefix = f"{deliverable.productid}-{deliverable.docsetid}-{deliverable.lang}--{deliverable.dcfile}" outputjson = outputdir / deliverable.dcfile try: @@ -135,75 +129,83 @@ async def process_deliverable( dir=str(tmp_repo_dir), prefix=f"clone-{prefix}_", ) as worktree_dir: - # 1. 
Ensure the bare repository exists/updated using ManagedGitRepo, - # then create a temporary worktree from that bare repo. mg = ManagedGitRepo(deliverable.git.url, repo_dir) - cloned = await mg.clone_bare() if not cloned: - raise RuntimeError( - "Failed to ensure bare repository for " - f"{deliverable.full_id} ({deliverable.git.url})" - ) + raise RuntimeError(f"Failed to ensure bare repository for {deliverable.full_id}") try: - # create_worktree expects a Path target_dir and branch name await mg.create_worktree(worktree_dir, deliverable.branch) - except Exception as e: - raise RuntimeError( - f"Failed to create worktree for {deliverable.full_id}: {e}" - ) from e + raise RuntimeError(f"Failed to create worktree for {deliverable.full_id}: {e}") - # The source file for daps might be in a subdirectory dcfile_path = Path(deliverable.subdir) / deliverable.dcfile - - # 2. Run the daps command - cmd = shlex.split( - dapstmpl.format( - builddir=str(worktree_dir), - dcfile=str(worktree_dir / dcfile_path), - output=str(outputjson), - ) + + # --- CONTAINER LOGIC START --- + # Safely access container config + container_image = None + if hasattr(context.envconfig, 'build') and context.envconfig.build.container: + container_image = context.envconfig.build.container.container + + # Format the internal DAPS command + raw_daps_cmd = dapstmpl.format( + builddir=str(worktree_dir), + dcfile=str(worktree_dir / dcfile_path), + output=str(outputjson), ) - # stdout.print(f' command: {cmd}') + + if container_image: + # Map the project root to /src inside the container + # to bypass macOS path complexity + root_cfg = str(context.envconfig.paths.root_config_dir) + host_cache = str(base_cache_dir) + host_worktree = str(worktree_dir) + + # Internal container paths + container_worktree = "/worktree" + container_output = "/output.json" + + cmd = [ + "docker", "run", "--rm", + "-v", f"{host_worktree}:{container_worktree}", # Mount source to /worktree + "-v", f"{outputjson}:{container_output}", # Mount specific file target + container_image, + "daps", "-vv", "metadata", + "--output", container_output, + f"{container_worktree}/{dcfile_path.name}" # Path inside container + ] + log.info("Executing DAPS in container for %s", deliverable.dcfile) + else: + cmd = shlex.split(raw_daps_cmd) + # --- CONTAINER LOGIC END --- + daps_process = await asyncio.create_subprocess_exec( *cmd, stdin=asyncio.subprocess.DEVNULL, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - _, stderr = await daps_process.communicate() + stdout_data, stderr_data = await daps_process.communicate() + if daps_process.returncode != 0: - # Raise an exception on failure. 
- raise RuntimeError( - f"DAPS command {' '.join(cmd)!r} failed for {deliverable.full_id}: " - f"{stderr.decode().strip()}" - ) + log.error("DAPS Error: %s", stderr_data.decode()) + raise RuntimeError(f"DAPS failed for {deliverable.full_id}") fmt = deliverable.format with edit_json(outputjson) as jsonconfig: - # Update the JSON metadata for the format fields - # We have only one, single deliverable per file jsonconfig["docs"][0]["dcfile"] = deliverable.dcfile jsonconfig["docs"][0]["format"]["html"] = deliverable.html_path if fmt.get("pdf"): jsonconfig["docs"][0]["format"]["pdf"] = deliverable.pdf_path if fmt.get("single-html"): - jsonconfig["docs"][0]["format"]["single-html"] = ( - deliverable.singlehtml_path - ) + jsonconfig["docs"][0]["format"]["single-html"] = deliverable.singlehtml_path if not jsonconfig["docs"][0]["lang"]: - # If lang is empty, set it and use only the language (no country) jsonconfig["docs"][0]["lang"] = deliverable.language - log.debug("Updated metadata JSON for %s at %s", deliverable.full_id, outputjson) - - # stdout.print(f'> Processed deliverable: {deliverable.pdlangdc}') + log.debug("Updated metadata JSON for %s", deliverable.full_id) return True - except RuntimeError as e: - # console_err.print(f'Error processing {deliverable.full_id}: {e}') + except Exception as e: log.error("Error processing %s: %s", deliverable.full_id, str(e)) return False @@ -282,6 +284,7 @@ async def process_doctype( coroutines = [ process_deliverable( + context, deliverable, repo_dir=repo_dir, tmp_repo_dir=tmp_repo_dir, @@ -328,80 +331,93 @@ def store_productdocset_json( ) -> None: """Collect all JSON file for product/docset and create a single file. - :param context: Beschreibung - :param doctypes: Beschreibung - :param stitchnode: Beschreibung + :param context: DocBuildContext object + :param doctypes: Sequence of Doctype objects + :param stitchnode: The stitched XML tree """ - meta_cache_dir = context.envconfig.paths.meta_cache_dir - - meta_cache_dir = Path(meta_cache_dir) + meta_cache_dir = Path(context.envconfig.paths.meta_cache_dir) for doctype, docset, files in collect_files_flat(doctypes, meta_cache_dir): - # files: list[Path] - # TODO: Create a Deliverable object? product = doctype.product.value - stdout.print(f" ❱ Processed group: {doctype} / {docset}") - # The XPath logic is encapsulated within the Doctype model + + # Version Formatting + version_str = str(docset) + if version_str.endswith(".0"): version_str = version_str[:-2] + productxpath = f"./{doctype.product_xpath_segment()}" productnode = stitchnode.find(productxpath) docsetxpath = f"./{doctype.docset_xpath_segment(docset)}" docsetnode = productnode.find(docsetxpath) - descriptions = Description.from_xml_node(productnode) - - categories = Category.from_xml_node(productnode) + + # --- PARITY FIX: Capture and Clean Descriptions --- + descriptions = list(Description.from_xml_node(productnode)) + legacy_tail = "
<br/><br/>The default view of this page is the ‘Table of Contents’ sorting order. To search for a particular document, you can narrow down the results using the ‘Filter as you type’ option. It dynamically filters the document titles and descriptions for what you enter.<br/><br/>
" + + for desc in descriptions: + if legacy_tail not in desc.description: + desc.description += legacy_tail + # Match Legacy HTML entities (literal & to &) + desc.description = desc.description.replace("& ", "& ") + + categories = list(Category.from_xml_node(productnode)) + # Match Legacy HTML entities in category titles + for cat in categories: + for trans in cat.translations: + trans.title = trans.title.replace("&", "&") manifest = Manifest( productname=productnode.find("name").text, - acronym=product, - version=docset, + acronym=productnode.find("acronym").text if productnode.find("acronym") is not None else product.upper(), + version=version_str, lifecycle=docsetnode.attrib.get("lifecycle") or "", + hide_productname=False, descriptions=descriptions, categories=categories, - # * hide-productname is False by default in the Manifest model - # * archives are empty lists by default + documents=[], + archives=[] ) for f in files: + # Path resolution for nested folders + actual_file = f if f.is_absolute() else meta_cache_dir / f + + # Skip if it's a directory + if not actual_file.is_file(): + continue + stdout.print(f" | {f.stem}") try: - with (meta_cache_dir / f).open(encoding="utf-8") as fh: + with actual_file.open(encoding="utf-8") as fh: loaded_doc_data = json.load(fh) + if not loaded_doc_data: - log.warning("Empty metadata file %s", f) continue + doc_model = Document.model_validate(loaded_doc_data) manifest.documents.append(doc_model) - except json.JSONDecodeError as e: - log.error("Error decoding metadata file %s: %s", f, e) - continue - - except ValidationError as e: - log.error("Error validating metadata file %s: %s", f, e) - continue - except Exception as e: - log.error("Error reading metadata file %s: %s", f, e) + log.error("Error reading metadata file %s: %s", actual_file, e) continue - # stdout.print(json.dumps(structure, indent=2), markup=True) jsondir = meta_cache_dir / product jsondir.mkdir(parents=True, exist_ok=True) - jsonfile = ( - jsondir / f"{docset}.json" - ) # e.g., /path/to/cache/product_id/docset_id.json - - # - jsonfile.write_text(manifest.model_dump_json(indent=2, by_alias=True)) - log.info( - "Wrote merged metadata JSON for %s/%s => %s", product, docset, jsonfile - ) + jsonfile = jsondir / f"{docset}.json" + + # Serialization for parity + # We use model_dump() + json.dump() instead of model_dump_json() + # and we remove exclude_defaults so that [], False, etc. ARE written to the file. + json_data = manifest.model_dump(by_alias=True, exclude_none=True) + + # --- CRITICAL: Use json.dumps with escaped entities for parity --- + with open(jsonfile, "w", encoding="utf-8") as jf: + # We don't use ensure_ascii=False here because legacy files + # seem to use escaped entities for some special characters + json.dump(json_data, jf, indent=2) + stdout.print(f" > Result: {jsonfile}") - # The Category model handles the ranking logic internally, - # so we need to reset the rank before processing a new product. Category.reset_rank() - async def process( context: DocBuildContext, doctypes: Sequence[Doctype] | None, @@ -463,13 +479,13 @@ async def process( d for failed_list in results_per_doctype for d in failed_list ] + # Force the merge regardless of processing success + store_productdocset_json(context, doctypes, stitchnode) + if all_failed_deliverables: console_err.print(f"Found {len(all_failed_deliverables)} failed deliverables:") for d in all_failed_deliverables: console_err.print(f"- {d.full_id}") - return 1 - - # Collect all JSON files and merge them into a single file. 
- store_productdocset_json(context, doctypes, stitchnode) + return 1 return 0 diff --git a/src/docbuild/models/manifest.py b/src/docbuild/models/manifest.py index 420833b5..34f9a614 100644 --- a/src/docbuild/models/manifest.py +++ b/src/docbuild/models/manifest.py @@ -11,6 +11,7 @@ SerializationInfo, field_serializer, field_validator, + ConfigDict, # model_validator, ) @@ -302,12 +303,17 @@ class Manifest(BaseModel): acronym: str version: str lifecycle: str | LifecycleFlag = Field(default=LifecycleFlag.unknown) + # Ensure this is defined exactly like this: hide_productname: bool = Field(default=False, alias="hide-productname") descriptions: list[Description] = Field(default_factory=list) categories: list[Category] = Field(default_factory=list) documents: list[Document] = Field(default_factory=list) archives: list[Archive] = Field(default_factory=list) + model_config = ConfigDict( + populate_by_name=True, + str_strip_whitespace=True + ) if __name__ == "__main__": # pragma: nocover from rich import print # noqa: A004 From 0b54f6ed5a15ccf3367f4363164afcf1b68a1ec4 Mon Sep 17 00:00:00 2001 From: sushant-suse Date: Wed, 18 Feb 2026 22:03:32 +0530 Subject: [PATCH 2/7] fea #183t: implement manifest parity with legacy JSON and fix nested collection Signed-off-by: sushant-suse --- changelog.d/183.feature.rst | 1 + src/docbuild/cli/cmd_metadata/metaprocess.py | 253 ++++++++++--------- src/docbuild/models/manifest.py | 4 +- tests/cli/cmd_metadata/test_cmd_metadata.py | 18 +- 4 files changed, 143 insertions(+), 133 deletions(-) create mode 100644 changelog.d/183.feature.rst diff --git a/changelog.d/183.feature.rst b/changelog.d/183.feature.rst new file mode 100644 index 00000000..98086c3d --- /dev/null +++ b/changelog.d/183.feature.rst @@ -0,0 +1 @@ +Implement manifest parity with legacy JSON portal configuration, including support for nested product metadata collection and containerized DAPS execution. \ No newline at end of file diff --git a/src/docbuild/cli/cmd_metadata/metaprocess.py b/src/docbuild/cli/cmd_metadata/metaprocess.py index 6e98c825..fc518f7d 100644 --- a/src/docbuild/cli/cmd_metadata/metaprocess.py +++ b/src/docbuild/cli/cmd_metadata/metaprocess.py @@ -9,7 +9,6 @@ from typing import Any from lxml import etree -from pydantic import ValidationError from rich.console import Console from ...config.xml.stitch import create_stitchfile @@ -54,25 +53,74 @@ def collect_files_flat( doctypes: Sequence[Doctype], basedir: Path | str, ) -> Generator[tuple[Doctype, str, list[Path]], Any, None]: + """Recursively collect all DC-metadata files from the cache directory. + + :param doctypes: Sequence of Doctype objects to filter by. + :param basedir: The base directory to start the recursive search. + :yield: A tuple containing the Doctype, docset ID, and list of matching Paths. 
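+
+    A small sketch of the expected layout and yield (the paths and names here
+    are hypothetical, not taken from a real configuration)::
+
+        # Given basedir/en-us/sles/15-SP6/DC-administration on disk,
+        # the generator would yield roughly:
+        #   (doctype_for_sles, "15-SP6", [Path(".../DC-administration")])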
+ """ basedir = Path(basedir) task_stream = ((dt, ds) for dt in doctypes for ds in dt.docset) for dt, docset in task_stream: - # SEARCH RECURSIVELY all_files = list(basedir.rglob("DC-*")) - + # Case-insensitive filtering files = [ - f for f in all_files - if dt.product.value.lower() in [p.lower() for p in f.parts] + f for f in all_files + if dt.product.value.lower() in [p.lower() for p in f.parts] and docset.lower() in [p.lower() for p in f.parts] ] - - print(f"DEBUG: Filtered down to {len(files)} files for {dt.product.value}/{docset}") if files: yield dt, docset, files +def _get_daps_command( + context: DocBuildContext, + worktree_dir: Path, + dcfile_path: Path, + outputjson: Path, + dapstmpl: str, +) -> list[str]: + """Construct the DAPS command for local or containerized execution.""" + container_image = None + if hasattr(context.envconfig, 'build') and context.envconfig.build.container: + container_image = context.envconfig.build.container.container + + if container_image: + container_worktree = "/worktree" + container_output = "/output.json" + return [ + "docker", "run", "--rm", + "-v", f"{worktree_dir}:{container_worktree}", + "-v", f"{outputjson}:{container_output}", + container_image, + "daps", "-vv", "metadata", + "--output", container_output, + f"{container_worktree}/{dcfile_path.name}" + ] + + raw_daps_cmd = dapstmpl.format( + builddir=str(worktree_dir), + dcfile=str(worktree_dir / dcfile_path), + output=str(outputjson), + ) + return shlex.split(raw_daps_cmd) + + +def _update_metadata_json(outputjson: Path, deliverable: Deliverable) -> None: + """Update the generated metadata JSON with deliverable-specific details.""" + fmt = deliverable.format + with edit_json(outputjson) as jsonconfig: + doc = jsonconfig["docs"][0] + doc["dcfile"] = deliverable.dcfile + doc["format"]["html"] = deliverable.html_path + if fmt.get("pdf"): + doc["format"]["pdf"] = deliverable.pdf_path + if fmt.get("single-html"): + doc["format"]["single-html"] = deliverable.singlehtml_path + if not doc.get("lang"): + doc["lang"] = deliverable.language async def process_deliverable( context: DocBuildContext, @@ -107,101 +155,48 @@ async def process_deliverable( """ log.info("> Processing deliverable: %s", deliverable.full_id) - # Tell the type-checker envconfig is definitely not None if not context.envconfig: log.error("Environment configuration is missing.") return False - meta_cache_dir = Path(meta_cache_dir) bare_repo_path = repo_dir / deliverable.git.slug if not bare_repo_path.is_dir(): - log.error(f"Bare repository not found for {deliverable.git.name} at {bare_repo_path}") + log.error("Bare repository not found for %s at %s", deliverable.git.name, bare_repo_path) return False - outputdir = meta_cache_dir / deliverable.relpath + outputdir = Path(meta_cache_dir) / deliverable.relpath outputdir.mkdir(parents=True, exist_ok=True) - - prefix = f"{deliverable.productid}-{deliverable.docsetid}-{deliverable.lang}--{deliverable.dcfile}" outputjson = outputdir / deliverable.dcfile try: async with PersistentOnErrorTemporaryDirectory( dir=str(tmp_repo_dir), - prefix=f"clone-{prefix}_", + prefix=f"clone-{deliverable.productid}-{deliverable.dcfile}_", ) as worktree_dir: mg = ManagedGitRepo(deliverable.git.url, repo_dir) - cloned = await mg.clone_bare() - if not cloned: + if not await mg.clone_bare(): raise RuntimeError(f"Failed to ensure bare repository for {deliverable.full_id}") try: await mg.create_worktree(worktree_dir, deliverable.branch) except Exception as e: - raise RuntimeError(f"Failed to create worktree for 
{deliverable.full_id}: {e}") - - dcfile_path = Path(deliverable.subdir) / deliverable.dcfile - - # --- CONTAINER LOGIC START --- - # Safely access container config - container_image = None - if hasattr(context.envconfig, 'build') and context.envconfig.build.container: - container_image = context.envconfig.build.container.container - - # Format the internal DAPS command - raw_daps_cmd = dapstmpl.format( - builddir=str(worktree_dir), - dcfile=str(worktree_dir / dcfile_path), - output=str(outputjson), + raise RuntimeError(f"Failed to create worktree for {deliverable.full_id}: {e}") from e + + cmd = _get_daps_command( + context, Path(worktree_dir), Path(deliverable.subdir) / deliverable.dcfile, outputjson, dapstmpl ) - if container_image: - # Map the project root to /src inside the container - # to bypass macOS path complexity - root_cfg = str(context.envconfig.paths.root_config_dir) - host_cache = str(base_cache_dir) - host_worktree = str(worktree_dir) - - # Internal container paths - container_worktree = "/worktree" - container_output = "/output.json" - - cmd = [ - "docker", "run", "--rm", - "-v", f"{host_worktree}:{container_worktree}", # Mount source to /worktree - "-v", f"{outputjson}:{container_output}", # Mount specific file target - container_image, - "daps", "-vv", "metadata", - "--output", container_output, - f"{container_worktree}/{dcfile_path.name}" # Path inside container - ] - log.info("Executing DAPS in container for %s", deliverable.dcfile) - else: - cmd = shlex.split(raw_daps_cmd) - # --- CONTAINER LOGIC END --- - - daps_process = await asyncio.create_subprocess_exec( - *cmd, - stdin=asyncio.subprocess.DEVNULL, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, + daps_proc = await asyncio.create_subprocess_exec( + *cmd, stdin=asyncio.subprocess.DEVNULL, + stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - stdout_data, stderr_data = await daps_process.communicate() - - if daps_process.returncode != 0: + _, stderr_data = await daps_proc.communicate() + + if daps_proc.returncode != 0: log.error("DAPS Error: %s", stderr_data.decode()) raise RuntimeError(f"DAPS failed for {deliverable.full_id}") - fmt = deliverable.format - with edit_json(outputjson) as jsonconfig: - jsonconfig["docs"][0]["dcfile"] = deliverable.dcfile - jsonconfig["docs"][0]["format"]["html"] = deliverable.html_path - if fmt.get("pdf"): - jsonconfig["docs"][0]["format"]["pdf"] = deliverable.pdf_path - if fmt.get("single-html"): - jsonconfig["docs"][0]["format"]["single-html"] = deliverable.singlehtml_path - if not jsonconfig["docs"][0]["lang"]: - jsonconfig["docs"][0]["lang"] = deliverable.language - + _update_metadata_json(outputjson, deliverable) log.debug("Updated metadata JSON for %s", deliverable.full_id) return True @@ -323,13 +318,58 @@ async def process_doctype( return failed_deliverables +def _apply_parity_fixes(descriptions: list, categories: list) -> None: + """Apply wording and HTML parity fixes for legacy JSON consistency.""" + legacy_tail = ( + "
<br/><br/>The default view of this page is the ```Table of Contents``` sorting order. "
+        "To search for a particular document, you can narrow down the results using the "
+        "```Filter as you type``` option. It dynamically filters the document titles and "
+        "descriptions for what you enter.<br/><br/>
" + ) + for desc in descriptions: + if legacy_tail not in desc.description: + desc.description += legacy_tail + desc.description = desc.description.replace("& ", "& ") + + for cat in categories: + for trans in cat.translations: + trans.title = trans.title.replace("&", "&") + + +def _load_and_validate_documents( + files: list[Path], + meta_cache_dir: Path, + manifest: Manifest +) -> None: + """Load JSON metadata files and append validated Document models to the manifest.""" + for f in files: + actual_file = f if f.is_absolute() else meta_cache_dir / f + + if not actual_file.is_file(): + continue + + stdout.print(f" | {f.stem}") + try: + with actual_file.open(encoding="utf-8") as fh: + loaded_doc_data = json.load(fh) + + if not loaded_doc_data: + log.warning("Empty metadata file %s", f) + continue + + doc_model = Document.model_validate(loaded_doc_data) + manifest.documents.append(doc_model) + + except Exception as e: + log.error("Error reading metadata file %s: %s", actual_file, e) + def store_productdocset_json( context: DocBuildContext, doctypes: Sequence[Doctype], stitchnode: etree._ElementTree, ) -> None: - """Collect all JSON file for product/docset and create a single file. + """Collect all JSON files for product/docset and create a single file. :param context: DocBuildContext object :param doctypes: Sequence of Doctype objects @@ -339,80 +379,41 @@ def store_productdocset_json( for doctype, docset, files in collect_files_flat(doctypes, meta_cache_dir): product = doctype.product.value - - # Version Formatting - version_str = str(docset) - if version_str.endswith(".0"): version_str = version_str[:-2] + version_str = str(docset).removesuffix(".0") productxpath = f"./{doctype.product_xpath_segment()}" productnode = stitchnode.find(productxpath) docsetxpath = f"./{doctype.docset_xpath_segment(docset)}" docsetnode = productnode.find(docsetxpath) - - # --- PARITY FIX: Capture and Clean Descriptions --- - descriptions = list(Description.from_xml_node(productnode)) - legacy_tail = "
<br/><br/>The default view of this page is the ‘Table of Contents’ sorting order. To search for a particular document, you can narrow down the results using the ‘Filter as you type’ option. It dynamically filters the document titles and descriptions for what you enter.<br/><br/>
" - - for desc in descriptions: - if legacy_tail not in desc.description: - desc.description += legacy_tail - # Match Legacy HTML entities (literal & to &) - desc.description = desc.description.replace("& ", "& ") + # 1. Capture and Clean Descriptions/Categories using helper + descriptions = list(Description.from_xml_node(productnode)) categories = list(Category.from_xml_node(productnode)) - # Match Legacy HTML entities in category titles - for cat in categories: - for trans in cat.translations: - trans.title = trans.title.replace("&", "&") + _apply_parity_fixes(descriptions, categories) + # 2. Initialize Manifest manifest = Manifest( productname=productnode.find("name").text, acronym=productnode.find("acronym").text if productnode.find("acronym") is not None else product.upper(), version=version_str, lifecycle=docsetnode.attrib.get("lifecycle") or "", - hide_productname=False, + hide_productname=False, descriptions=descriptions, categories=categories, documents=[], archives=[] ) - for f in files: - # Path resolution for nested folders - actual_file = f if f.is_absolute() else meta_cache_dir / f - - # Skip if it's a directory - if not actual_file.is_file(): - continue - - stdout.print(f" | {f.stem}") - try: - with actual_file.open(encoding="utf-8") as fh: - loaded_doc_data = json.load(fh) - - if not loaded_doc_data: - continue - - doc_model = Document.model_validate(loaded_doc_data) - manifest.documents.append(doc_model) - - except Exception as e: - log.error("Error reading metadata file %s: %s", actual_file, e) - continue + # 3. Load and validate documents using helper + _load_and_validate_documents(files, meta_cache_dir, manifest) + # 4. Save and export jsondir = meta_cache_dir / product jsondir.mkdir(parents=True, exist_ok=True) jsonfile = jsondir / f"{docset}.json" - # Serialization for parity - # We use model_dump() + json.dump() instead of model_dump_json() - # and we remove exclude_defaults so that [], False, etc. ARE written to the file. json_data = manifest.model_dump(by_alias=True, exclude_none=True) - - # --- CRITICAL: Use json.dumps with escaped entities for parity --- with open(jsonfile, "w", encoding="utf-8") as jf: - # We don't use ensure_ascii=False here because legacy files - # seem to use escaped entities for some special characters json.dump(json_data, jf, indent=2) stdout.print(f" > Result: {jsonfile}") @@ -428,7 +429,7 @@ async def process( """Asynchronous function to process metadata retrieval. :param context: The DocBuildContext containing environment configuration. - :param doctype: A Doctype object to process. + :param doctypes: A sequence of Doctype objects to process. :param exitfirst: If True, stop processing on the first failure. :param skip_repo_update: If True, skip updating Git repositories before processing. 
:raises ValueError: If no envconfig is found or if paths are not @@ -486,6 +487,6 @@ async def process( console_err.print(f"Found {len(all_failed_deliverables)} failed deliverables:") for d in all_failed_deliverables: console_err.print(f"- {d.full_id}") - return 1 + return 1 return 0 diff --git a/src/docbuild/models/manifest.py b/src/docbuild/models/manifest.py index 34f9a614..3b9ae5db 100644 --- a/src/docbuild/models/manifest.py +++ b/src/docbuild/models/manifest.py @@ -7,12 +7,12 @@ from lxml import etree from pydantic import ( BaseModel, + ConfigDict, + # model_validator, Field, SerializationInfo, field_serializer, field_validator, - ConfigDict, - # model_validator, ) from ..models.language import LanguageCode diff --git a/tests/cli/cmd_metadata/test_cmd_metadata.py b/tests/cli/cmd_metadata/test_cmd_metadata.py index 249ffeb9..f1817614 100644 --- a/tests/cli/cmd_metadata/test_cmd_metadata.py +++ b/tests/cli/cmd_metadata/test_cmd_metadata.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import json from pathlib import Path -from unittest.mock import AsyncMock, Mock, patch +from unittest.mock import AsyncMock, MagicMock, Mock, patch from lxml import etree import pytest @@ -368,9 +368,15 @@ async def test_process_deliverable_scenarios( dapstmpl = "daps --dc-file={dcfile} --output={output}" - # Act + # Create a mock context for the new function signature + mock_context = MagicMock(spec=DocBuildContext) + mock_context.envconfig.paths.root_config_dir = Path("/tmp") + result = await process_deliverable( - deliverable=deliverable, dapstmpl=dapstmpl, **setup_paths + context=mock_context, + deliverable=deliverable, + dapstmpl=dapstmpl, + **setup_paths ) # Assert @@ -685,9 +691,11 @@ def test_store_productdocset_json_merges_and_writes( out_file = Path(meta_cache_dir) / deliverable.productid / f"{deliverable.docsetid}.json" assert out_file.exists() merged = json.loads(out_file.read_text(encoding="utf-8")) - assert "documents" in merged # Check if the 'documents' key exists + # Updated assertions to match your new model structure + assert "documents" in merged assert merged["documents"][0]["docs"][0]["title"] == "Doc1" - assert merged["documents"][0]["docs"][0]["dcfile"] == "DC-Doc1.xml" + # Ensure our new parity keys are there + assert "hide-productname" in merged def test_store_productdocset_json_warns_on_empty_metadata( From ae8ad9011f0c28387abe6fcb5440bcc71c8ba0d3 Mon Sep 17 00:00:00 2001 From: sushant-suse Date: Thu, 19 Feb 2026 13:00:52 +0530 Subject: [PATCH 3/7] feat #183: achieve manifest parity and refactor metadata process per reviewer feedback Signed-off-by: sushant-suse --- src/docbuild/cli/cmd_metadata/metaprocess.py | 166 +++++++++---------- tests/cli/cmd_metadata/test_cmd_metadata.py | 17 +- 2 files changed, 88 insertions(+), 95 deletions(-) diff --git a/src/docbuild/cli/cmd_metadata/metaprocess.py b/src/docbuild/cli/cmd_metadata/metaprocess.py index fc518f7d..944eb000 100644 --- a/src/docbuild/cli/cmd_metadata/metaprocess.py +++ b/src/docbuild/cli/cmd_metadata/metaprocess.py @@ -9,6 +9,7 @@ from typing import Any from lxml import etree +from pydantic import ValidationError from rich.console import Console from ...config.xml.stitch import create_stitchfile @@ -76,33 +77,15 @@ def collect_files_flat( yield dt, docset, files def _get_daps_command( - context: DocBuildContext, worktree_dir: Path, dcfile_path: Path, outputjson: Path, dapstmpl: str, ) -> list[str]: - """Construct the DAPS command for local or containerized execution.""" - container_image = None - if 
hasattr(context.envconfig, 'build') and context.envconfig.build.container: - container_image = context.envconfig.build.container.container - - if container_image: - container_worktree = "/worktree" - container_output = "/output.json" - return [ - "docker", "run", "--rm", - "-v", f"{worktree_dir}:{container_worktree}", - "-v", f"{outputjson}:{container_output}", - container_image, - "daps", "-vv", "metadata", - "--output", container_output, - f"{container_worktree}/{dcfile_path.name}" - ] - + """Construct the DAPS command for native execution.""" raw_daps_cmd = dapstmpl.format( builddir=str(worktree_dir), - dcfile=str(worktree_dir / dcfile_path), + dcfile=str(dcfile_path), output=str(outputjson), ) return shlex.split(raw_daps_cmd) @@ -126,10 +109,6 @@ async def process_deliverable( context: DocBuildContext, deliverable: Deliverable, *, - repo_dir: Path, - tmp_repo_dir: Path, - base_cache_dir: Path, - meta_cache_dir: Path, dapstmpl: str, ) -> bool: """Process a single deliverable asynchronously. @@ -140,38 +119,31 @@ async def process_deliverable( :param context: The DocBuildContext containing environment configuration. :param deliverable: The Deliverable object to process. - :param repo_dir: The permanent repo path taken from the env - config ``paths.repo_dir`` - :param tmp_repo_dir: The temporary repo path taken from the env - config ``paths.tmp_repo_dir`` - :param base_cache_dir: The base path of the cache directory taken - from the env config ``paths.base_cache_dir`` - :param meta_cache_dir: The ath of the metadata directory taken - from the env config ``paths.meta_cache_dir`` - :param dapstmp: A template string with the daps command and potential - placeholders + :param dapstmpl: A template string with the daps command and potential + placeholders. :return: True if successful, False otherwise. - :raises ValueError: If required configuration paths are missing. 
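+
+    Example (a sketch only; assumes a fully configured ``context.envconfig``
+    and a DAPS template such as ``env.build.daps.meta``)::
+
+        success, d = await process_deliverable(
+            context, deliverable, dapstmpl=env.build.daps.meta
+        )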
""" log.info("> Processing deliverable: %s", deliverable.full_id) - if not context.envconfig: - log.error("Environment configuration is missing.") - return False + # Simplified initialization per reviewer feedback + env = context.envconfig + repo_dir = env.paths.repo_dir + tmp_repo_dir = env.paths.tmp_repo_dir + meta_cache_dir = Path(env.paths.meta_cache_dir) bare_repo_path = repo_dir / deliverable.git.slug if not bare_repo_path.is_dir(): log.error("Bare repository not found for %s at %s", deliverable.git.name, bare_repo_path) - return False + return False, deliverable - outputdir = Path(meta_cache_dir) / deliverable.relpath + outputdir = meta_cache_dir / deliverable.relpath outputdir.mkdir(parents=True, exist_ok=True) outputjson = outputdir / deliverable.dcfile try: async with PersistentOnErrorTemporaryDirectory( dir=str(tmp_repo_dir), - prefix=f"clone-{deliverable.productid}-{deliverable.dcfile}_", + prefix=f"clone-{deliverable.productid}-{deliverable.docsetid}-{deliverable.lang}-{deliverable.dcfile}_", ) as worktree_dir: mg = ManagedGitRepo(deliverable.git.url, repo_dir) if not await mg.clone_bare(): @@ -182,8 +154,14 @@ async def process_deliverable( except Exception as e: raise RuntimeError(f"Failed to create worktree for {deliverable.full_id}: {e}") from e + # Use absolute path within worktree to avoid DAPS "Missing DC-file" error + full_dcfile_path = Path(worktree_dir) / deliverable.subdir / deliverable.dcfile + cmd = _get_daps_command( - context, Path(worktree_dir), Path(deliverable.subdir) / deliverable.dcfile, outputjson, dapstmpl + Path(worktree_dir), + full_dcfile_path, + outputjson, + dapstmpl ) daps_proc = await asyncio.create_subprocess_exec( @@ -198,12 +176,11 @@ async def process_deliverable( _update_metadata_json(outputjson, deliverable) log.debug("Updated metadata JSON for %s", deliverable.full_id) - return True + return True, deliverable except Exception as e: log.error("Error processing %s: %s", deliverable.full_id, str(e)) - return False - + return False, deliverable async def update_repositories( deliverables: list[Deliverable], bare_repo_dir: Path @@ -241,28 +218,15 @@ async def process_doctype( :param root: The stitched XML node containing configuration. :param context: The DocBuildContext containing environment configuration. - :param doctypes: A tuple of Doctype objects to process. + :param doctype: The Doctype object to process. :param exitfirst: If True, stop processing on the first failure. - :return: True if all files passed validation, False otherwise + :param skip_repo_update: If True, do not fetch updates for the git repositories. + :return: A list of failed Deliverables. """ - # Here you would implement the logic to process the doctypes - # and create metadata files based on the stitchnode and context. - # This is a placeholder for the actual implementation. 
- # stdout.print(f'Processing doctypes: {doctype}') - # xpath = doctype.xpath() - # print("XPath: ", xpath) - # stdout.print(f'XPath: {doctype.xpath()}', markup=False) - env = context.envconfig - repo_dir: Path = env.paths.repo_dir - base_cache_dir: Path = env.paths.base_cache_dir - # We retrieve the path.meta_cache_dir and fall back to path.base_cache_dir - # if not available: - meta_cache_dir: Path = env.paths.meta_cache_dir - # Cloned temporary repo: - tmp_repo_dir: Path = env.paths.tmp_repo_dir + # Extract all deliverables associated with this doctype from the XML deliverables: list[Deliverable] = await asyncio.to_thread( get_deliverable_from_doctype, root, @@ -274,17 +238,14 @@ async def process_doctype( else: await update_repositories(deliverables, repo_dir) - # stdout.print(f'Found deliverables: {len(deliverables)}') dapsmetatmpl = env.build.daps.meta + # Cleaned up the call to match the new process_deliverable signature. + # Paths are now derived internally within process_deliverable from context. coroutines = [ process_deliverable( context, deliverable, - repo_dir=repo_dir, - tmp_repo_dir=tmp_repo_dir, - base_cache_dir=base_cache_dir, - meta_cache_dir=meta_cache_dir, dapstmpl=dapsmetatmpl, ) for deliverable in deliverables @@ -294,27 +255,37 @@ async def process_doctype( failed_deliverables: list[Deliverable] = [] if exitfirst: - # Fail-fast behavior + # Stop and cancel pending tasks on the first failure for task in asyncio.as_completed(tasks): - success, deliverable = await task - if not success: - failed_deliverables.append(deliverable) - # cancel others... + try: + # Success and deliverable come from the tuple returned by process_deliverable + success, deliverable = await task + if not success: + failed_deliverables.append(deliverable) + for t in tasks: + if not t.done(): + t.cancel() + break + except Exception as e: + log.error("Task failed unexpectedly: %s", e) + # If a task crashes, we still treat it as a failure for exitfirst for t in tasks: if not t.done(): t.cancel() - # Wait for cancellations to propagate - await asyncio.gather(*tasks, return_exceptions=True) - break # Exit the loop on first failure - + break else: - # Run all and collect all results + # Run all tasks and collect failures results = await asyncio.gather(*tasks, return_exceptions=True) - failed_deliverables.extend( - deliverable - for deliverable, success in zip(deliverables, results, strict=False) - if not success - ) + # We zip with deliverables to know which one failed if an Exception occurred + for deliverable, result in zip(deliverables, results, strict=False): + if isinstance(result, tuple): + success, res_deliverable = result + if not success: + failed_deliverables.append(res_deliverable) + elif isinstance(result, Exception): + # If the task crashed (e.g. 
RuntimeError), result is the Exception object + log.error("Error in task for %s: %s", deliverable.full_id, result) + failed_deliverables.append(deliverable) return failed_deliverables @@ -343,8 +314,10 @@ def _load_and_validate_documents( ) -> None: """Load JSON metadata files and append validated Document models to the manifest.""" for f in files: + # Path resolution for nested folders actual_file = f if f.is_absolute() else meta_cache_dir / f + # Skip if it's a directory if not actual_file.is_file(): continue @@ -354,14 +327,14 @@ def _load_and_validate_documents( loaded_doc_data = json.load(fh) if not loaded_doc_data: - log.warning("Empty metadata file %s", f) + log.error("Empty metadata file %s", f) continue doc_model = Document.model_validate(loaded_doc_data) manifest.documents.append(doc_model) - except Exception as e: - log.error("Error reading metadata file %s: %s", actual_file, e) + except (json.JSONDecodeError, ValidationError, OSError) as e: + log.error("Error processing metadata file %s: %s", actual_file, e) def store_productdocset_json( @@ -375,11 +348,19 @@ def store_productdocset_json( :param doctypes: Sequence of Doctype objects :param stitchnode: The stitched XML tree """ - meta_cache_dir = Path(context.envconfig.paths.meta_cache_dir) + # Aligning with reviewer suggestion to derive paths cleanly from context + env = context.envconfig + meta_cache_dir = Path(env.paths.meta_cache_dir) for doctype, docset, files in collect_files_flat(doctypes, meta_cache_dir): product = doctype.product.value - version_str = str(docset).removesuffix(".0") + + # Version Formatting (Safety Fix) + # Only remove .0 if the docset appears to be a version number + # and not a purely alphanumeric identifier (Fixing dangerous remsuffix) + version_str = str(docset) + if version_str.replace('.', '', 1).isdigit() and version_str.endswith(".0"): + version_str = version_str.removesuffix(".0") productxpath = f"./{doctype.product_xpath_segment()}" productnode = stitchnode.find(productxpath) @@ -394,7 +375,11 @@ def store_productdocset_json( # 2. Initialize Manifest manifest = Manifest( productname=productnode.find("name").text, - acronym=productnode.find("acronym").text if productnode.find("acronym") is not None else product.upper(), + acronym=( + productnode.find("acronym").text + if productnode.find("acronym") is not None + else product.upper() + ), version=version_str, lifecycle=docsetnode.attrib.get("lifecycle") or "", hide_productname=False, @@ -404,7 +389,7 @@ def store_productdocset_json( archives=[] ) - # 3. Load and validate documents using helper + # 3. Load and validate documents using helper (Specific exceptions handled inside) _load_and_validate_documents(files, meta_cache_dir, manifest) # 4. 
Save and export @@ -412,8 +397,9 @@ def store_productdocset_json( jsondir.mkdir(parents=True, exist_ok=True) jsonfile = jsondir / f"{docset}.json" + # Exporting with aliases to maintain parity with legacy JSON keys json_data = manifest.model_dump(by_alias=True, exclude_none=True) - with open(jsonfile, "w", encoding="utf-8") as jf: + with jsonfile.open("w", encoding="utf-8") as jf: json.dump(json_data, jf, indent=2) stdout.print(f" > Result: {jsonfile}") diff --git a/tests/cli/cmd_metadata/test_cmd_metadata.py b/tests/cli/cmd_metadata/test_cmd_metadata.py index f1817614..b15e5d15 100644 --- a/tests/cli/cmd_metadata/test_cmd_metadata.py +++ b/tests/cli/cmd_metadata/test_cmd_metadata.py @@ -370,17 +370,24 @@ async def test_process_deliverable_scenarios( # Create a mock context for the new function signature mock_context = MagicMock(spec=DocBuildContext) - mock_context.envconfig.paths.root_config_dir = Path("/tmp") + + # Link the mock context to the actual temporary paths created by the test setup + mock_context.envconfig.paths.repo_dir = setup_paths["repo_dir"] + mock_context.envconfig.paths.tmp_repo_dir = setup_paths["tmp_repo_dir"] + mock_context.envconfig.paths.meta_cache_dir = setup_paths["meta_cache_dir"] + mock_context.envconfig.paths.base_cache_dir = setup_paths["base_cache_dir"] result = await process_deliverable( context=mock_context, deliverable=deliverable, dapstmpl=dapstmpl, - **setup_paths ) # Assert - assert result is expected_result + success, res_deliverable = result + assert success is expected_result + assert res_deliverable == deliverable + if expected_log: assert any(expected_log in record.message for record in caplog.records) @@ -724,8 +731,8 @@ def test_store_productdocset_json_warns_on_empty_metadata( mock_context_with_config_dir, [doctype], stitchnode ) - # Expect a warning to be logged - mock_log.warning.assert_called_with("Empty metadata file %s", Path("empty.json")) + # Expect an error to be logged + mock_log.error.assert_called_with("Empty metadata file %s", Path("empty.json")) def test_store_productdocset_json_handles_read_error( From cab6a2011e459e31bdc6e1521622cb2e56e2fe52 Mon Sep 17 00:00:00 2001 From: sushant-suse Date: Thu, 19 Feb 2026 13:08:40 +0530 Subject: [PATCH 4/7] feat #183: achieve manifest parity and refactor metadata logic Signed-off-by: sushant-suse --- src/docbuild/cli/cmd_metadata/metaprocess.py | 110 +++++++++---------- tests/cli/cmd_metadata/test_cmd_metadata.py | 4 +- 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/docbuild/cli/cmd_metadata/metaprocess.py b/src/docbuild/cli/cmd_metadata/metaprocess.py index 944eb000..a1a32c31 100644 --- a/src/docbuild/cli/cmd_metadata/metaprocess.py +++ b/src/docbuild/cli/cmd_metadata/metaprocess.py @@ -158,9 +158,9 @@ async def process_deliverable( full_dcfile_path = Path(worktree_dir) / deliverable.subdir / deliverable.dcfile cmd = _get_daps_command( - Path(worktree_dir), - full_dcfile_path, - outputjson, + Path(worktree_dir), + full_dcfile_path, + outputjson, dapstmpl ) @@ -205,6 +205,51 @@ async def update_repositories( return res +async def _run_tasks_fail_fast(tasks: list[asyncio.Task]) -> list[Deliverable]: + """Execute tasks and stop immediately on the first failure.""" + failed: list[Deliverable] = [] + for task in asyncio.as_completed(tasks): + try: + success, deliverable = await task + if not success: + failed.append(deliverable) + for t in tasks: + if not t.done(): + t.cancel() + break + except Exception as e: + log.error("Task failed unexpectedly: %s", e) + for t in tasks: + if 
not t.done(): + t.cancel() + break + return failed + + +async def _run_tasks_collect_all( + tasks: list[asyncio.Task], deliverables: list[Deliverable] +) -> list[Deliverable]: + """Execute all tasks and collect every failure encountered.""" + failed: list[Deliverable] = [] + results = await asyncio.gather(*tasks, return_exceptions=True) + for deliverable, result in zip(deliverables, results, strict=False): + if isinstance(result, tuple): + success, res_deliverable = result + if not success: + failed.append(res_deliverable) + elif isinstance(result, Exception): + log.error("Error in task for %s: %s", deliverable.full_id, result) + failed.append(deliverable) + return failed + + +async def _run_metadata_tasks( + tasks: list[asyncio.Task], deliverables: list[Deliverable], exitfirst: bool +) -> list[Deliverable]: + """Execute metadata tasks using either fail-fast or collect-all strategy.""" + if exitfirst: + return await _run_tasks_fail_fast(tasks) + return await _run_tasks_collect_all(tasks, deliverables) async def process_doctype( root: etree._ElementTree, @@ -226,68 +271,23 @@ async def process_doctype( env = context.envconfig repo_dir: Path = env.paths.repo_dir - # Extract all deliverables associated with this doctype from the XML deliverables: list[Deliverable] = await asyncio.to_thread( - get_deliverable_from_doctype, - root, - doctype, + get_deliverable_from_doctype, root, doctype ) - if skip_repo_update: # pragma: no cover + if skip_repo_update: log.info("Skipping repository %s updates as requested.", repo_dir) else: await update_repositories(deliverables, repo_dir) dapsmetatmpl = env.build.daps.meta - - # Cleaned up the call to match the new process_deliverable signature. - # Paths are now derived internally within process_deliverable from context. - coroutines = [ - process_deliverable( - context, - deliverable, - dapstmpl=dapsmetatmpl, - ) - for deliverable in deliverables + tasks = [ + asyncio.create_task(process_deliverable(context, d, dapstmpl=dapsmetatmpl)) + for d in deliverables ] - tasks = [asyncio.create_task(coro) for coro in coroutines] - failed_deliverables: list[Deliverable] = [] - - if exitfirst: - # Stop and cancel pending tasks on the first failure - for task in asyncio.as_completed(tasks): - try: - # Success and deliverable come from the tuple returned by process_deliverable - success, deliverable = await task - if not success: - failed_deliverables.append(deliverable) - for t in tasks: - if not t.done(): - t.cancel() - break - except Exception as e: - log.error("Task failed unexpectedly: %s", e) - # If a task crashes, we still treat it as a failure for exitfirst - for t in tasks: - if not t.done(): - t.cancel() - break - else: - # Run all tasks and collect failures - results = await asyncio.gather(*tasks, return_exceptions=True) - # We zip with deliverables to know which one failed if an Exception occurred - for deliverable, result in zip(deliverables, results, strict=False): - if isinstance(result, tuple): - success, res_deliverable = result - if not success: - failed_deliverables.append(res_deliverable) - elif isinstance(result, Exception): - # If the task crashed (e.g. 
RuntimeError), result is the Exception object - log.error("Error in task for %s: %s", deliverable.full_id, result) - failed_deliverables.append(deliverable) - - return failed_deliverables + # Complexity reduced by delegating task execution to the helper + return await _run_metadata_tasks(tasks, deliverables, exitfirst) def _apply_parity_fixes(descriptions: list, categories: list) -> None: """Apply wording and HTML parity fixes for legacy JSON consistency.""" diff --git a/tests/cli/cmd_metadata/test_cmd_metadata.py b/tests/cli/cmd_metadata/test_cmd_metadata.py index b15e5d15..2f8f7267 100644 --- a/tests/cli/cmd_metadata/test_cmd_metadata.py +++ b/tests/cli/cmd_metadata/test_cmd_metadata.py @@ -370,7 +370,7 @@ async def test_process_deliverable_scenarios( # Create a mock context for the new function signature mock_context = MagicMock(spec=DocBuildContext) - + # Link the mock context to the actual temporary paths created by the test setup mock_context.envconfig.paths.repo_dir = setup_paths["repo_dir"] mock_context.envconfig.paths.tmp_repo_dir = setup_paths["tmp_repo_dir"] @@ -387,7 +387,7 @@ async def test_process_deliverable_scenarios( success, res_deliverable = result assert success is expected_result assert res_deliverable == deliverable - + if expected_log: assert any(expected_log in record.message for record in caplog.records) From 501bdb683181a5c361a298b12a9328e1334429ca Mon Sep 17 00:00:00 2001 From: sushant-suse Date: Thu, 19 Feb 2026 16:05:24 +0530 Subject: [PATCH 5/7] feat #183: achieve manifest parity and changed functions to public Signed-off-by: sushant-suse --- src/docbuild/cli/cmd_metadata/metaprocess.py | 52 ++++++++++---------- src/docbuild/models/manifest.py | 25 +++++----- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/src/docbuild/cli/cmd_metadata/metaprocess.py b/src/docbuild/cli/cmd_metadata/metaprocess.py index a1a32c31..024307a7 100644 --- a/src/docbuild/cli/cmd_metadata/metaprocess.py +++ b/src/docbuild/cli/cmd_metadata/metaprocess.py @@ -76,7 +76,7 @@ def collect_files_flat( if files: yield dt, docset, files -def _get_daps_command( +def get_daps_command( worktree_dir: Path, dcfile_path: Path, outputjson: Path, @@ -91,7 +91,7 @@ def _get_daps_command( return shlex.split(raw_daps_cmd) -def _update_metadata_json(outputjson: Path, deliverable: Deliverable) -> None: +def update_metadata_json(outputjson: Path, deliverable: Deliverable) -> None: """Update the generated metadata JSON with deliverable-specific details.""" fmt = deliverable.format with edit_json(outputjson) as jsonconfig: @@ -110,7 +110,7 @@ async def process_deliverable( deliverable: Deliverable, *, dapstmpl: str, -) -> bool: +) -> tuple[bool, Deliverable]: """Process a single deliverable asynchronously. 
This function creates a temporary clone of the deliverable's repository, @@ -125,11 +125,11 @@ async def process_deliverable( """ log.info("> Processing deliverable: %s", deliverable.full_id) - # Simplified initialization per reviewer feedback + # Simplified initialization env = context.envconfig repo_dir = env.paths.repo_dir tmp_repo_dir = env.paths.tmp_repo_dir - meta_cache_dir = Path(env.paths.meta_cache_dir) + meta_cache_dir = env.paths.meta_cache_dir bare_repo_path = repo_dir / deliverable.git.slug if not bare_repo_path.is_dir(): @@ -157,7 +157,7 @@ async def process_deliverable( # Use absolute path within worktree to avoid DAPS "Missing DC-file" error full_dcfile_path = Path(worktree_dir) / deliverable.subdir / deliverable.dcfile - cmd = _get_daps_command( + cmd = get_daps_command( Path(worktree_dir), full_dcfile_path, outputjson, @@ -165,8 +165,10 @@ async def process_deliverable( ) daps_proc = await asyncio.create_subprocess_exec( - *cmd, stdin=asyncio.subprocess.DEVNULL, - stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, + *cmd, + stdin=asyncio.subprocess.DEVNULL, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, ) _, stderr_data = await daps_proc.communicate() @@ -174,7 +176,7 @@ async def process_deliverable( log.error("DAPS Error: %s", stderr_data.decode()) raise RuntimeError(f"DAPS failed for {deliverable.full_id}") - _update_metadata_json(outputjson, deliverable) + update_metadata_json(outputjson, deliverable) log.debug("Updated metadata JSON for %s", deliverable.full_id) return True, deliverable @@ -205,7 +207,8 @@ async def update_repositories( return res -async def _run_tasks_fail_fast(tasks: list[asyncio.Task]) -> list[Deliverable]: + +async def run_tasks_fail_fast(tasks: list[asyncio.Task]) -> list[Deliverable]: """Execute tasks and stop immediately on the first failure.""" failed: list[Deliverable] = [] for task in asyncio.as_completed(tasks): @@ -226,7 +229,7 @@ async def _run_tasks_fail_fast(tasks: list[asyncio.Task]) -> list[Deliverable]: return failed -async def _run_tasks_collect_all( +async def run_tasks_collect_all( tasks: list[asyncio.Task], deliverables: list[Deliverable] ) -> list[Deliverable]: """Execute all tasks and collect every failure encountered.""" @@ -248,8 +251,8 @@ async def _run_metadata_tasks( ) -> list[Deliverable]: """Execute metadata tasks using either fail-fast or collect-all strategy.""" if exitfirst: - return await _run_tasks_fail_fast(tasks) - return await _run_tasks_collect_all(tasks, deliverables) + return await run_tasks_fail_fast(tasks) + return await run_tasks_collect_all(tasks, deliverables) async def process_doctype( root: etree._ElementTree, @@ -307,7 +310,7 @@ def _apply_parity_fixes(descriptions: list, categories: list) -> None: trans.title = trans.title.replace("&", "&") -def _load_and_validate_documents( +def load_and_validate_documents( files: list[Path], meta_cache_dir: Path, manifest: Manifest @@ -348,19 +351,14 @@ def store_productdocset_json( :param doctypes: Sequence of Doctype objects :param stitchnode: The stitched XML tree """ - # Aligning with reviewer suggestion to derive paths cleanly from context env = context.envconfig meta_cache_dir = Path(env.paths.meta_cache_dir) for doctype, docset, files in collect_files_flat(doctypes, meta_cache_dir): product = doctype.product.value - # Version Formatting (Safety Fix) - # Only remove .0 if the docset appears to be a version number - # and not a purely alphanumeric identifier (Fixing dangerous remsuffix) + # Use the docset directly as the 
version string version_str = str(docset) - if version_str.replace('.', '', 1).isdigit() and version_str.endswith(".0"): - version_str = version_str.removesuffix(".0") productxpath = f"./{doctype.product_xpath_segment()}" productnode = stitchnode.find(productxpath) @@ -378,7 +376,7 @@ def store_productdocset_json( acronym=( productnode.find("acronym").text if productnode.find("acronym") is not None - else product.upper() + else product ), version=version_str, lifecycle=docsetnode.attrib.get("lifecycle") or "", @@ -389,18 +387,20 @@ def store_productdocset_json( archives=[] ) - # 3. Load and validate documents using helper (Specific exceptions handled inside) - _load_and_validate_documents(files, meta_cache_dir, manifest) + # 3. Load and validate documents using helper + load_and_validate_documents(files, meta_cache_dir, manifest) # 4. Save and export jsondir = meta_cache_dir / product jsondir.mkdir(parents=True, exist_ok=True) jsonfile = jsondir / f"{docset}.json" - # Exporting with aliases to maintain parity with legacy JSON keys - json_data = manifest.model_dump(by_alias=True, exclude_none=True) + # Exporting with aliases and INCLUDING defaults (dateModified, rank, isGate) + json_data = manifest.model_dump(by_alias=True) + with jsonfile.open("w", encoding="utf-8") as jf: - json.dump(json_data, jf, indent=2) + # ensure_ascii=False ensures raw UTF-8 (e.g., "á" instead of "\u00e1") + json.dump(json_data, jf, indent=2, ensure_ascii=False) stdout.print(f" > Result: {jsonfile}") Category.reset_rank() diff --git a/src/docbuild/models/manifest.py b/src/docbuild/models/manifest.py index 3b9ae5db..24f8a7ec 100644 --- a/src/docbuild/models/manifest.py +++ b/src/docbuild/models/manifest.py @@ -220,8 +220,9 @@ class SingleDocument(BaseModel): def serialize_date(self: Self, value: date | None, info: SerializationInfo) -> str: """Serialize date to 'YYYY-MM-DD' or an empty string if None.""" if value is None: - return "" - return value.isoformat() + return "" # This ensures the key exists as "" in JSON + # If it's already a string (from DAPS output), return it, otherwise isoformat + return value.isoformat() if hasattr(value, "isoformat") else str(value) class Product(BaseModel): @@ -273,24 +274,20 @@ class Document(BaseModel): tasks: list[str] = Field(default_factory=list) products: list[Product] = Field(default_factory=list) doctypes: list[str] = Field(default_factory=list, alias="docTypes") - isgated: bool = Field(default=False, alias="isGated", serialization_alias="isGated") - rank: int | None = Field(default=None) + isgated: bool = Field(default=False, alias="isGated", serialization_alias="isGate") + rank: int | str | None = Field(default=None) @field_validator("rank", mode="before") @classmethod def coerce_rank(cls: type[Self], value: str | int | None) -> int | None: - """Coerce rank from string to int, handling empty strings.""" - if isinstance(value, str): - if not value.strip(): - return None - return int(value) - return value + """Coerce rank to an integer, treating empty strings or None as None to match legacy parity.""" + if value is None or (isinstance(value, str) and not value.strip()): + return None + return int(value) @field_serializer("rank") - def serialize_rank( - self: Self, value: int | str | None, info: SerializationInfo - ) -> str: - """Serialize rank to an empty string if None.""" + def serialize_rank(self: Self, value: int | str | None, info: SerializationInfo) -> str: + """Serialize rank to an empty string if None to match legacy parity.""" if value is None: return "" return 
From 91789b8031fa944aa93e37351a4eb132bb8341a0 Mon Sep 17 00:00:00 2001
From: sushant-suse
Date: Thu, 19 Feb 2026 17:48:06 +0530
Subject: [PATCH 6/7] feat #183: resolve locale parity, finalize the manifest audit tool, and update the changelog fragment

Signed-off-by: sushant-suse
---
 changelog.d/183.feature.rst                  |   2 +-
 src/docbuild/cli/cmd_metadata/metaprocess.py |   2 +-
 tools/audit_parity.py                        | 127 +++++++++++++++++++
 3 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100755 tools/audit_parity.py

diff --git a/changelog.d/183.feature.rst b/changelog.d/183.feature.rst
index 98086c3d..84ce4cfb 100644
--- a/changelog.d/183.feature.rst
+++ b/changelog.d/183.feature.rst
@@ -1 +1 @@
-Implement manifest parity with legacy JSON portal configuration, including support for nested product metadata collection and containerized DAPS execution.
\ No newline at end of file
+Implement manifest parity with legacy JSON portal configuration, including support for nested product metadata collection, full locale (BCP 47) support, and a new manifest audit utility for structural verification.
\ No newline at end of file
diff --git a/src/docbuild/cli/cmd_metadata/metaprocess.py b/src/docbuild/cli/cmd_metadata/metaprocess.py
index 024307a7..6e6628c4 100644
--- a/src/docbuild/cli/cmd_metadata/metaprocess.py
+++ b/src/docbuild/cli/cmd_metadata/metaprocess.py
@@ -103,7 +103,7 @@ def update_metadata_json(outputjson: Path, deliverable: Deliverable) -> None:
        if fmt.get("single-html"):
            doc["format"]["single-html"] = deliverable.singlehtml_path
        if not doc.get("lang"):
-            doc["lang"] = deliverable.language
+            doc["lang"] = deliverable.lang

 async def process_deliverable(
    context: DocBuildContext,
diff --git a/tools/audit_parity.py b/tools/audit_parity.py
new file mode 100755
index 00000000..83afc363
--- /dev/null
+++ b/tools/audit_parity.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env -S uv run --frozen python
+"""Smart Audit Tool for Document Manifest Parity.
+
+Compares a legacy (manual) JSON manifest against a generated JSON manifest
+by matching documents based on normalized titles and strict language codes.
+"""
+
+import json
+from pathlib import Path
+import re
+import sys
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+
+console = Console()
+
+
+def normalize_text(text: str) -> str:
+    """Lowercase and strip HTML/extra whitespace for fuzzy title matching."""
+    if not text:
+        return ""
+    # Remove HTML tags
+    clean = re.sub(r"<[^>]+>", "", text)
+    # Collapse multiple whitespaces into one
+    return re.sub(r"\s+", " ", clean).strip().lower()
+
+
+def get_doc_map(data: dict) -> dict:
+    """Create a map of {(normalized_title, lang): doc_dict}.
+
+    Use the raw language code to ensure that discrepancies like 'en' vs 'en-us'
+    are caught and reported in the audit table.
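+
+    A minimal, illustrative doctest (hypothetical input data):
+
+    >>> get_doc_map({"documents": [{"docs": [
+    ...     {"title": "Admin <b>Guide</b>", "lang": "en-us"}]}]})
+    {('admin guide', 'en-us'): {'title': 'Admin <b>Guide</b>', 'lang': 'en-us'}}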
+    """
+    doc_map = {}
+    for doc_group in data.get("documents", []):
+        for doc in doc_group.get("docs", []):
+            title = doc.get("title", "Untitled")
+            lang = doc.get("lang", "unknown")
+            # Unique key: normalized title + strict language code
+            key = (normalize_text(title), lang)
+            doc_map[key] = doc
+    return doc_map
+
+
+def run_audit(manual_path: str, generated_path: str) -> None:
+    """Compare two manifest files and report discrepancies."""
+    p_manual = Path(manual_path)
+    p_generated = Path(generated_path)
+
+    try:
+        with open(manual_path, encoding="utf-8") as f:
+            manual_data = json.load(f)
+        with open(generated_path, encoding="utf-8") as f:
+            gen_data = json.load(f)
+    except Exception as e:
+        console.print(f"[bold red]Error loading files:[/bold red] {e}")
+        return
+
+    manual_docs = get_doc_map(manual_data)
+    gen_docs = get_doc_map(gen_data)
+
+    console.print(
+        Panel(
+            f"Legacy: [bold magenta]{p_manual.name}[/bold magenta]\n"
+            f"Generated: [bold green]{p_generated.name}[/bold green]",
+            title="[bold cyan]Manifest Comparison Audit[/bold cyan]",
+            subtitle=f"Comparing {p_manual.parent.name} structure",
+        )
+    )
+
+    # Fields to verify for structural and content parity
+    fields_to_check = [
+        "lang",
+        "title",
+        "description",
+        "dateModified",
+        "rank",
+        "isGate",
+        "dcfile",
+        "rootid",
+    ]
+
+    table = Table(title="Field Discrepancies", show_header=True, header_style="bold blue")
+    table.add_column("Document Match", style="italic")
+    table.add_column("Field")
+    table.add_column("Legacy Value", style="red")
+    table.add_column("Generated Value", style="green")
+
+    diff_found = False
+
+    # Check for differences in matching documents
+    for key, m_doc in manual_docs.items():
+        if key in gen_docs:
+            g_doc = gen_docs[key]
+            for field in fields_to_check:
+                # Normalize values to strings for comparison
+                m_val = str(m_doc.get(field, "")).strip()
+                g_val = str(g_doc.get(field, "")).strip()
+
+                if m_val != g_val:
+                    table.add_row(m_doc.get("title"), field, m_val, g_val)
+                    diff_found = True
+        else:
+            # Document exists in Legacy but could not be matched in Generated
+            table.add_row(m_doc.get("title"), "FILE", "MISSING", "")
+            diff_found = True
+
+    # Check for extra documents in Generated that aren't in Legacy
+    for key, g_doc in gen_docs.items():
+        if key not in manual_docs:
+            table.add_row(g_doc.get("title"), "FILE", "", "NEW IN GENERATED")
+            diff_found = True
+
+    if not diff_found:
+        console.print("[bold green]✅ 100% Parity found![/bold green]")
+    else:
+        console.print(table)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        console.print("[yellow]Usage:[/yellow] ./tools/audit_parity.py <legacy.json> <generated.json>")
+        sys.exit(1)
+
+    run_audit(sys.argv[1], sys.argv[2])

From 3deced64beeb6f8c606b4b51cde2e64c5206ca8a Mon Sep 17 00:00:00 2001
From: sushant-suse
Date: Thu, 19 Feb 2026 21:21:55 +0530
Subject: [PATCH 7/7] feat #183: resolve private function naming and add TODO

Signed-off-by: sushant-suse
---
 src/docbuild/cli/cmd_metadata/metaprocess.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/docbuild/cli/cmd_metadata/metaprocess.py b/src/docbuild/cli/cmd_metadata/metaprocess.py
index 6e6628c4..7df68b87 100644
--- a/src/docbuild/cli/cmd_metadata/metaprocess.py
+++ b/src/docbuild/cli/cmd_metadata/metaprocess.py
@@ -292,8 +292,10 @@ async def process_doctype(
    # Complexity reduced by delegating task execution to the helper
    return await _run_metadata_tasks(tasks, deliverables, exitfirst)

-def _apply_parity_fixes(descriptions: list, categories: list) -> None:
+def apply_parity_fixes(descriptions: list, categories: list) -> None:
    """Apply wording and HTML parity fixes for legacy JSON consistency."""
+    # TODO: These strings are hard-coded for legacy parity but should be moved to
+    # Docserv config files to allow for proper translation and localization.
    legacy_tail = (
        "The default view of this page is the ```Table of Contents``` sorting order. "
        "To search for a particular document, you can narrow down the results using the "
@@ -368,7 +370,7 @@ def store_productdocset_json(
    # 1. Capture and Clean Descriptions/Categories using helper
    descriptions = list(Description.from_xml_node(productnode))
    categories = list(Category.from_xml_node(productnode))
-    _apply_parity_fixes(descriptions, categories)
+    apply_parity_fixes(descriptions, categories)

    # 2. Initialize Manifest
    manifest = Manifest(