11from collections import Counter
22import dateparser
3+ from rich .progress import Progress , SpinnerColumn , TextColumn
34
45
56def detect_timestamp (value ):
@@ -10,167 +11,181 @@ def detect_timestamp(value):
1011
1112
def analyze_json(data):
    """Summarise the structure of a decoded JSON value.

    The value is walked recursively while a transient ``rich`` spinner is
    shown. Every node is described by a dict with a ``"type"`` entry:

    * objects -> ``{"type": "object", "children": {key: summary, ...}}``
    * lists   -> ``{"type": "list", "child_type": <name>}`` for primitive,
      unknown or mixed content, or ``{"type": "list", "child": summary}``
      when the elements are objects or lists (their summaries are merged)
    * strings -> ``{"type": "timestamp"}`` when ``detect_timestamp`` returns
      a truthy value for the string, otherwise ``{"type": "str"}``
    * anything else -> ``{"type": type(value).__name__}``

    Only the first 20 non-empty elements of a list are inspected.

    Args:
        data: The decoded JSON value (dict, list or scalar).

    Returns:
        The summary dict for ``data``.
    """

    def merge_object_summaries(summaries):
        """Merge several object summaries into one.

        Returns a ``(merged, conflicts)`` pair. ``merged`` maps each key to
        its summary plus an ``"optional"`` flag that is true when the key is
        missing from at least one input. ``conflicts`` maps each key whose
        type differs between inputs to the sorted list of types seen.
        """
        total = len(summaries)

        # Collect, per key, every summary observed for that key.
        key_structures = {}
        for summary in summaries:
            for key, val in summary.get("children", {}).items():
                key_structures.setdefault(key, []).append(val)

        merged = {}
        conflicts = {}

        for key, structures in key_structures.items():
            # Each input contributes a key at most once, so the number of
            # collected summaries equals the number of inputs having the key.
            optional = len(structures) < total
            types = {s["type"] for s in structures}

            if len(types) != 1:
                merged[key] = {"type": "conflict", "optional": optional}
                # Sorted so the report does not depend on set ordering.
                conflicts[key] = sorted(types)
                continue

            structure_type = next(iter(types))

            if structure_type == "object":
                merged_children, child_conflicts = merge_object_summaries(
                    structures
                )
                merged[key] = {
                    "type": "object",
                    "children": merged_children,
                    "optional": optional,
                }
                if child_conflicts:
                    merged[key]["conflicts"] = child_conflicts
            elif structure_type == "list":
                merged_list = merge_list_summaries(structures)
                merged[key] = {
                    "type": "list",
                    "optional": optional,
                    **{k: v for k, v in merged_list.items() if k != "type"},
                }
            else:
                # Primitive type
                merged[key] = {"type": structure_type, "optional": optional}

        return merged, conflicts

    def merge_list_summaries(summaries):
        """Merge several list summaries into a single list summary."""
        child_types = set()
        child_structures = []

        for summary in summaries:
            if "child_type" in summary:
                child_types.add(summary["child_type"])
            elif "child" in summary:
                child_structures.append(summary["child"])

        # "unknown" only marks a list with no usable elements; it carries no
        # type information and must not turn a uniform list into a mixed one.
        child_types.discard("unknown")

        if child_structures and child_types:
            # Some lists hold objects/lists while others hold primitives.
            return {"type": "list", "child_type": "mixed"}

        if child_structures:
            structure_types = {s["type"] for s in child_structures}

            if structure_types == {"object"}:
                merged_children, child_conflicts = merge_object_summaries(
                    child_structures
                )
                return {
                    "type": "list",
                    "child": {
                        "type": "object",
                        "children": merged_children,
                        "conflicts": child_conflicts,
                    },
                }

            if structure_types == {"list"}:
                # Nested lists
                merged_nested = merge_list_summaries(child_structures)
                return {"type": "list", "child": merged_nested}

            return {"type": "list", "child_type": "mixed_complex"}

        if len(child_types) == 1:
            return {"type": "list", "child_type": next(iter(child_types))}

        if child_types:
            return {
                "type": "list",
                "child_type": f"mixed: {', '.join(sorted(child_types))}",
            }

        return {"type": "list", "child_type": "unknown"}

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=None,
        transient=True,
    ) as progress:
        task = progress.add_task("[cyan]Analyzing JSON...", total=None)

        def analyze_node(node):
            """Return the summary dict for a single JSON node."""
            if isinstance(node, dict):
                children = {}
                for key, val in node.items():
                    # The task has no total; advancing just records activity.
                    progress.update(task, advance=1)
                    children[key] = analyze_node(val)
                return {"type": "object", "children": children}

            if isinstance(node, list):
                # Skip empty or null-only lists
                non_empty_items = [
                    item for item in node if item not in (None, {}, [], "")
                ]
                if not non_empty_items:
                    return {"type": "list", "child_type": "unknown"}

                sample = non_empty_items[:20]
                element_summaries = [analyze_node(item) for item in sample]
                types = {e["type"] for e in element_summaries}

                # List of objects
                if types == {"object"}:
                    merged, conflicts = merge_object_summaries(
                        element_summaries
                    )
                    return {
                        "type": "list",
                        "child": {
                            "type": "object",
                            "children": merged,
                            "conflicts": conflicts,
                        },
                    }

                # List of lists
                if types == {"list"}:
                    return {
                        "type": "list",
                        "child": merge_list_summaries(element_summaries),
                    }

                # List of primitives sharing one type
                if len(types) == 1:
                    return {"type": "list", "child_type": types.pop()}

                return {"type": "list", "child_type": "mixed"}

            if isinstance(node, str):
                if detect_timestamp(node):
                    return {"type": "timestamp"}
                return {"type": "str"}

            return {"type": type(node).__name__}

        # Start the analysis
        return analyze_node(data)
174189
175190
176191if __name__ == "__main__" :
0 commit comments