44
55
def detect_timestamp(value):
    """Return True if *value* is a string that dateparser can parse.

    Args:
        value: Any object. Only ``str`` instances are considered.

    Returns:
        ``False`` when *value* is not a string or has fewer than four
        characters; otherwise ``True`` exactly when ``dateparser.parse``
        returns something other than ``None``.
    """
    # Short strings are rejected before parsing.
    # NOTE(review): presumably this avoids false positives on tiny tokens;
    # confirm the rationale for the threshold of 4.
    if not isinstance(value, str) or len(value) < 4:
        return False
    return dateparser.parse(value) is not None
@@ -27,6 +27,7 @@ def analyze_node(node):
2727 progress .update (task , advance = 1 )
2828 children [key ] = analyze_node (val )
2929 return {"type" : "object" , "children" : children }
30+
3031 elif isinstance (node , list ):
3132 # Skip empty or null-only lists
3233 non_empty_items = [
@@ -67,6 +68,11 @@ def analyze_node(node):
6768 }
6869
6970 return {"type" : "list" , "child_type" : "mixed" }
71+
72+ elif node is None :
73+ # Explicitly handle None - mark as unknown but with a flag
74+ return {"type" : "unknown" , "is_none" : True }
75+
7076 else :
7177 if isinstance (node , str ):
7278 if detect_timestamp (node ):
@@ -79,11 +85,20 @@ def analyze_node(node):
7985 def merge_object_summaries (summaries ):
8086 key_structures = {}
8187 key_counts = Counter ()
88+ key_none_counts = Counter ()
8289 total = len (summaries )
8390
8491 for summary in summaries :
92+ seen_keys = set ()
93+
8594 for key , val in summary .get ("children" , {}).items ():
8695 key_counts [key ] += 1
96+ seen_keys .add (key )
97+
98+ # Track if this value is None/unknown
99+ if val .get ("type" ) == "unknown" :
100+ key_none_counts [key ] += 1
101+
87102 if key not in key_structures :
88103 key_structures [key ] = []
89104 key_structures [key ].append (val )
@@ -93,19 +108,34 @@ def merge_object_summaries(summaries):
93108
94109 for key , structures in key_structures .items ():
95110 count = key_counts [key ]
96- optional = count < total
111+ none_count = key_none_counts [key ]
112+
113+ # Field is optional if:
114+ # 1. Missing from some objects (count < total)
115+ # 2. Has None in some objects (none_count > 0)
116+ optional = (count < total ) or (none_count > 0 )
97117
98- # Get unique types for this key
99- types = {s ["type" ] for s in structures }
118+ # Filter out None/unknown types to find concrete types
119+ concrete_structures = [
120+ s for s in structures if s .get ("type" ) != "unknown"
121+ ]
122+
123+ # If we have concrete types, use those; otherwise use all structures
124+ working_structures = (
125+ concrete_structures if concrete_structures else structures
126+ )
127+
128+ # Get unique types from working structures
129+ types = {s ["type" ] for s in working_structures }
100130
101131 if len (types ) == 1 :
102- # All structures have the same type
132+ # Single type (possibly with None values)
103133 structure_type = list (types )[0 ]
104134
105135 if structure_type == "object" :
106136 # Recursively merge object structures
107137 merged_children , child_conflicts = merge_object_summaries (
108- structures
138+ working_structures
109139 )
110140 merged [key ] = {
111141 "type" : "object" ,
@@ -117,21 +147,26 @@ def merge_object_summaries(summaries):
117147
118148 elif structure_type == "list" :
119149 # Merge list structures
120- merged_list = merge_list_summaries (structures )
150+ merged_list = merge_list_summaries (working_structures )
121151 merged [key ] = {
122152 "type" : "list" ,
123153 "optional" : optional ,
124154 ** {k : v for k , v in merged_list .items () if k != "type" },
125155 }
126156
127157 else :
128- # Primitive type
158+ # Primitive type (possibly with None)
129159 merged [key ] = {"type" : structure_type , "optional" : optional }
130- else :
131- # Type conflict
160+
161+ elif len (types ) > 1 :
162+ # Multiple different types = real conflict
132163 merged [key ] = {"type" : "conflict" , "optional" : optional }
133164 conflicts [key ] = list (types )
134165
166+ else :
167+ # Should not happen, but handle gracefully
168+ merged [key ] = {"type" : "unknown" , "optional" : optional }
169+
135170 return merged , conflicts
136171
137172 def merge_list_summaries (summaries ):
0 commit comments