1+ """JSON structure analyzer with type detection and schema inference.
2+
3+ This module analyzes JSON data structures to detect types, optional fields,
4+ conflicts, and generates comprehensive structural summaries.
5+ """
6+
17from collections import Counter
8+ from typing import Any
9+
210import dateparser
311from rich .progress import Progress , SpinnerColumn , TextColumn
412
13+ from .logging_config import get_logger
14+
15+ logger = get_logger (__name__ )
16+
517
def detect_timestamp(value: Any) -> bool:
    """Detect whether a value is a string that dateparser can parse.

    Args:
        value: Value to check. Non-string values, and strings with fewer
            than four non-padding characters, are rejected without
            calling the parser.

    Returns:
        True if ``dateparser.parse`` returns a result for the string,
        False otherwise (including when the parser raises).
    """
    # Measure the stripped text so surrounding whitespace cannot push a
    # too-short token (e.g. "  1 ") past the minimum-length guard.
    if not isinstance(value, str) or len(value.strip()) < 4:
        return False

    try:
        parsed = dateparser.parse(value)
    except Exception:
        # Deliberate best-effort: any parser failure on arbitrary input
        # is treated as "not a timestamp" rather than aborting analysis.
        return False
    return parsed is not None
35+
36+
37+ def analyze_json (data : Any ) -> dict [str , Any ]:
38+ """Analyze JSON structure and return detailed metadata.
39+
40+ This function performs deep structural analysis of JSON data, identifying:
41+ - Data types and their distribution
42+ - Optional and required fields
43+ - Type conflicts across similar structures
44+ - Nested object and array patterns
45+
46+ Args:
47+ data: JSON data to analyze (dict, list, or primitive type).
48+
49+ Returns:
50+ Dictionary containing analysis summary with structure, types, and conflicts.
51+
52+ Example:
53+ >>> data = {"users": [{"id": 1, "name": "Alice"}]}
54+ >>> analysis = analyze_json(data)
55+ >>> print(analysis['type'])
56+ 'object'
57+ """
58+ logger .info ("Starting JSON analysis" )
1259
13- def analyze_json (data ):
1460 with Progress (
1561 SpinnerColumn (),
1662 TextColumn ("[progress.description]{task.description}" ),
1763 console = None ,
1864 transient = True ,
1965 ) as progress :
20-
2166 task = progress .add_task ("[cyan]Analyzing JSON..." , total = None )
2267
23- def analyze_node (node ):
68+ def analyze_node (node : Any ) -> dict [str , Any ]:
69+ """Recursively analyze a node in the JSON structure."""
2470 if isinstance (node , dict ):
2571 children = {}
2672 for key , val in node .items ():
@@ -60,17 +106,12 @@ def analyze_node(node):
60106
61107 # List of lists
62108 if all (e ["type" ] == "list" for e in element_summaries ):
63- # Merge list structures recursively
64109 merged_list = merge_list_summaries (element_summaries )
65- return {
66- "type" : "list" ,
67- "child" : merged_list ,
68- }
110+ return {"type" : "list" , "child" : merged_list }
69111
70112 return {"type" : "list" , "child_type" : "mixed" }
71113
72114 elif node is None :
73- # Explicitly handle None - mark as unknown but with a flag
74115 return {"type" : "unknown" , "is_none" : True }
75116
76117 else :
@@ -82,8 +123,11 @@ def analyze_node(node):
82123 else :
83124 return {"type" : type (node ).__name__ }
84125
85- def merge_object_summaries (summaries ):
86- key_structures = {}
126+ def merge_object_summaries (
127+ summaries : list [dict [str , Any ]],
128+ ) -> tuple [dict [str , Any ], dict [str , list [str ]]]:
129+ """Merge multiple object summaries, detecting optional fields and conflicts."""
130+ key_structures : dict [str , list ] = {}
87131 key_counts = Counter ()
88132 key_none_counts = Counter ()
89133 total = len (summaries )
@@ -95,45 +139,38 @@ def merge_object_summaries(summaries):
95139 key_counts [key ] += 1
96140 seen_keys .add (key )
97141
98- # Track if this value is None/unknown
99142 if val .get ("type" ) == "unknown" :
100143 key_none_counts [key ] += 1
101144
102145 if key not in key_structures :
103146 key_structures [key ] = []
104147 key_structures [key ].append (val )
105148
106- merged = {}
107- conflicts = {}
149+ merged : dict [ str , Any ] = {}
150+ conflicts : dict [ str , list [ str ]] = {}
108151
109152 for key , structures in key_structures .items ():
110153 count = key_counts [key ]
111154 none_count = key_none_counts [key ]
112155
113- # Field is optional if:
114- # 1. Missing from some objects (count < total)
115- # 2. Has None in some objects (none_count > 0)
156+ # Field is optional if missing or has None values
116157 optional = (count < total ) or (none_count > 0 )
117158
118- # Filter out None/unknown types to find concrete types
159+ # Filter out None/unknown types
119160 concrete_structures = [
120161 s for s in structures if s .get ("type" ) != "unknown"
121162 ]
122163
123- # If we have concrete types, use those; otherwise use all structures
124164 working_structures = (
125165 concrete_structures if concrete_structures else structures
126166 )
127167
128- # Get unique types from working structures
129168 types = {s ["type" ] for s in working_structures }
130169
131170 if len (types ) == 1 :
132- # Single type (possibly with None values)
133171 structure_type = list (types )[0 ]
134172
135173 if structure_type == "object" :
136- # Recursively merge object structures
137174 merged_children , child_conflicts = merge_object_summaries (
138175 working_structures
139176 )
@@ -146,7 +183,6 @@ def merge_object_summaries(summaries):
146183 merged [key ]["conflicts" ] = child_conflicts
147184
148185 elif structure_type == "list" :
149- # Merge list structures
150186 merged_list = merge_list_summaries (working_structures )
151187 merged [key ] = {
152188 "type" : "list" ,
@@ -155,21 +191,19 @@ def merge_object_summaries(summaries):
155191 }
156192
157193 else :
158- # Primitive type (possibly with None)
159194 merged [key ] = {"type" : structure_type , "optional" : optional }
160195
161196 elif len (types ) > 1 :
162- # Multiple different types = real conflict
163197 merged [key ] = {"type" : "conflict" , "optional" : optional }
164198 conflicts [key ] = list (types )
165199
166200 else :
167- # Should not happen, but handle gracefully
168201 merged [key ] = {"type" : "unknown" , "optional" : optional }
169202
170203 return merged , conflicts
171204
172- def merge_list_summaries (summaries ):
205+ def merge_list_summaries (summaries : list [dict [str , Any ]]) -> dict [str , Any ]:
206+ """Merge multiple list summaries."""
173207 child_types = set ()
174208 child_structures = []
175209
@@ -180,14 +214,12 @@ def merge_list_summaries(summaries):
180214 child_structures .append (summary ["child" ])
181215
182216 if child_structures :
183- # All lists contain complex structures
184217 structure_types = {s ["type" ] for s in child_structures }
185218
186219 if len (structure_types ) == 1 :
187220 structure_type = list (structure_types )[0 ]
188221
189222 if structure_type == "object" :
190- # Merge object structures within lists
191223 merged_children , child_conflicts = merge_object_summaries (
192224 child_structures
193225 )
@@ -200,14 +232,12 @@ def merge_list_summaries(summaries):
200232 },
201233 }
202234 elif structure_type == "list" :
203- # Nested lists
204235 merged_nested = merge_list_summaries (child_structures )
205236 return {"type" : "list" , "child" : merged_nested }
206237
207238 return {"type" : "list" , "child_type" : "mixed_complex" }
208239
209240 elif child_types :
210- # Simple child types
211241 if len (child_types ) == 1 :
212242 return {"type" : "list" , "child_type" : list (child_types )[0 ]}
213243 else :
@@ -218,45 +248,6 @@ def merge_list_summaries(summaries):
218248
219249 return {"type" : "list" , "child_type" : "unknown" }
220250
221- # Start the analysis
222251 result = analyze_node (data )
252+ logger .info ("JSON analysis completed successfully" )
223253 return result
224-
225-
226- if __name__ == "__main__" :
227- from rich import print as rprint
228- from rich .pretty import pretty_repr
229-
230- test_data = {
231- "users" : [
232- {
233- "id" : 1 ,
234- "name" : "Alice" ,
235- "profile" : {
236- "age" : 30 ,
237- "settings" : {"theme" : "dark" , "notifications" : True },
238- },
239- "tags" : ["admin" , "user" ],
240- "last_login" : "2024-07-15T12:30:00Z" ,
241- },
242- {
243- "id" : 2 ,
244- "name" : "Bob" ,
245- "profile" : {
246- "age" : 25 ,
247- "settings" : {
248- "theme" : "light" ,
249- "notifications" : False ,
250- "language" : "en" ,
251- },
252- },
253- "tags" : ["user" ],
254- 255- "last_login" : "not a date" ,
256- },
257- ],
258- "metadata" : {"total" : 2 , "created" : "2024-01-01" },
259- }
260-
261- summary = analyze_json (test_data )
262- rprint (pretty_repr (summary ))
0 commit comments