22
33from collections .abc import Generator
44from datetime import date
5+ import logging
56from typing import ClassVar , Self
67
78from lxml import etree
1112 # model_validator,
1213 Field ,
1314 SerializationInfo ,
15+ ValidationInfo ,
1416 field_serializer ,
1517 field_validator ,
1618)
1719
1820from ..models .language import LanguageCode
1921from ..models .lifecycle import LifecycleFlag
2022
23+ log = logging .getLogger (__name__ )
2124
2225class Description (BaseModel ):
2326 """Represents a description for a product/docset.
@@ -33,7 +36,7 @@ class Description(BaseModel):
3336
3437 lang : LanguageCode
3538 default : bool
36- description : str
39+ description : str = Field ( default = "" )
3740
3841 @field_serializer ("lang" )
3942 def serialize_lang (self : Self , value : LanguageCode , info : SerializationInfo ) -> str :
@@ -179,7 +182,7 @@ class DocumentFormat(BaseModel):
179182 }
180183 """
181184
182- html : str
185+ html : str = Field ( default = "" )
183186 pdf : str | None = Field (default = None , exclude_if = lambda v : v is None or v == "" )
184187 single_html : str | None = Field (
185188 default = None , alias = "single-html" , exclude_if = lambda v : v is None or v == ""
@@ -207,21 +210,37 @@ class SingleDocument(BaseModel):
207210 }
208211 """
209212
213+ # Define dcfile first so it is available to other validators in 'info.data'
214+ dcfile : str = Field (default = "" )
210215 lang : str | None = None
211- title : str
216+ title : str | None = Field ( default = None )
212217 subtitle : str = Field (default = "" )
213- description : str
214- dcfile : str
218+ description : str = Field (default = "" )
215219 rootid : str = Field (default = "" )
216- format : DocumentFormat
220+ format : DocumentFormat = Field ( default_factory = DocumentFormat )
217221 datemodified : date | None = Field (default = None , serialization_alias = "dateModified" )
218222
@field_validator("title")
@classmethod
def warn_missing_title(cls, v: str | None, info: ValidationInfo) -> str | None:
    """Log a warning when a document title is missing or empty.

    The title is never rejected or modified; this validator exists only to
    surface incomplete metadata in the log together with the document origin.

    NOTE(review): Pydantic v2 does not run field validators on default values
    unless ``validate_default=True`` is set on the field, so a title that is
    omitted entirely may never reach this check -- confirm against the field
    definition.

    Args:
        v: The title value being validated; may be ``None`` or empty.
        info: Validation context; ``info.data`` holds the fields that were
            validated before ``title``.

    Returns:
        The value of ``v``, unchanged.
    """
    # Both None and the empty string count as "missing".
    if v:
        return v

    # Use ``or`` instead of a ``.get()`` default: the keys can be present with
    # falsy values ("" for dcfile, None for lang), and a ``.get()`` default
    # only applies when the key is absent, which would log a blank origin.
    origin = info.data.get("dcfile") or "Unknown Origin"
    lang = info.data.get("lang") or "Unknown Lang"

    log.warning(
        "Metadata Integrity: Document missing title. Origin: %s (Lang: %s)",
        origin,
        lang,
    )
    return v
238+
@field_serializer("datemodified")
def serialize_date(self: Self, value: date | None, _info: SerializationInfo) -> str:
    """Render ``datemodified`` as a string for serialized output.

    Args:
        value: The modification date, or ``None`` when it is not set.
        _info: Serialization context supplied by Pydantic; unused here.

    Returns:
        An empty string for ``None``, the result of ``value.isoformat()``
        when the value has that method ('YYYY-MM-DD' for a ``date``), and
        ``str(value)`` otherwise.
    """
    if value is not None:
        # Values without ``isoformat`` (presumably strings that were already
        # formatted upstream -- TODO confirm) are passed through ``str``.
        if hasattr(value, "isoformat"):
            return value.isoformat()
        return str(value)

    # A missing date serializes as "" rather than being dropped.
    return ""
226245
227246
0 commit comments