Skip to content

Commit 448fea2

Browse files
authored
pre-process and trim data if it is large, for schema inference (#739)
* pre-process and trim data if it is large, for schema inference * add parameter minObjectPropertyCountToPreserve for schema inference input trimming * apply formatting changes --------- Co-authored-by: Logende <[email protected]>
1 parent 8889af4 commit 448fea2

6 files changed

Lines changed: 123 additions & 10 deletions

File tree

meta_configurator/src/components/panels/gui-editor/configTreeNodeResolver.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,8 @@ export class ConfigTreeNodeResolver {
196196
);
197197
}
198198

199-
if (children.length > settings.value.performance.maximumShownChildrenInGuiEditor) {
200-
children = children.slice(0, settings.value.performance.maximumShownChildrenInGuiEditor);
199+
if (children.length > settings.value.performance.maxShownChildrenInGuiEditor) {
200+
children = children.slice(0, settings.value.performance.maxShownChildrenInGuiEditor);
201201
}
202202

203203
return children;
@@ -462,8 +462,8 @@ export class ConfigTreeNodeResolver {
462462
});
463463
}
464464
let exceedsChildrenLimit = false;
465-
if (children.length > settings.value.performance.maximumShownChildrenInGuiEditor) {
466-
children = children.slice(0, settings.value.performance.maximumShownChildrenInGuiEditor);
465+
if (children.length > settings.value.performance.maxShownChildrenInGuiEditor) {
466+
children = children.slice(0, settings.value.performance.maxShownChildrenInGuiEditor);
467467
exceedsChildrenLimit = true;
468468
}
469469
if (this.shouldAddAddItemNode(schema, data) && !exceedsChildrenLimit) {
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
import {inferSchema} from '@jsonhero/schema-infer';
22
import type {JsonSchemaType} from '@/schema/jsonSchemaType';
3+
import {trimDataToMaxSize} from '@/utility/trimData';
4+
import {useSettings} from '@/settings/useSettings';
35

46
/**
 * Infers a JSON schema from sample data.
 *
 * The sample is first handed to `trimDataToMaxSize`, configured from the performance settings
 * (size limit and minimum number of object properties to preserve); the schema is then inferred
 * from whatever that call returns.
 *
 * @param sampleData the data to infer a schema from
 * @returns the result of `inferSchema(...).toJSONSchema()` for the pre-processed sample
 */
export function inferJsonSchema(sampleData: any): JsonSchemaType {
  const performanceSettings = useSettings().value.performance;
  // the setting is stored in bytes, the trim helper takes its limit in KiB
  const sizeLimitInKiB = performanceSettings.maxDocumentSizeForSchemaInference / 1024;
  const preparedSample = trimDataToMaxSize(
    sampleData,
    sizeLimitInKiB,
    performanceSettings.minObjectPropertyCountToPreserve
  );
  return inferSchema(preparedSample).toJSONSchema();
}

meta_configurator/src/settings/defaultSettingsData.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ export const SETTINGS_DATA_DEFAULT = {
88
hideSchemaEditor: false,
99
hideSettings: false,
1010
performance: {
11-
maxDocumentSizeForValidation: 1000000, // 1 MB
12-
maxDocumentSizeForCursorSynchronization: 1000000, // 1 MB
13-
maximumShownChildrenInGuiEditor: 50,
11+
maxDocumentSizeForValidation: 1024000, // 1 MiB
12+
maxDocumentSizeForCursorSynchronization: 1240000, // 1 MiB
13+
maxDocumentSizeForSchemaInference: 40960, // 40 KiB
14+
minObjectPropertyCountToPreserve: 16, // when large document is trimmed, this is minimum count of object properties to be preserved
15+
maxShownChildrenInGuiEditor: 50,
1416
},
1517
codeEditor: {
1618
fontSize: 14,

meta_configurator/src/settings/settingsSchema.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,13 @@ export const SETTINGS_SCHEMA: TopLevelSchema = {
3838
},
3939
performance: {
4040
type: 'object',
41-
required: ['maxDocumentSizeForValidation', 'maxDocumentSizeForCursorSynchronization'],
41+
required: [
42+
'maxDocumentSizeForValidation',
43+
'maxDocumentSizeForCursorSynchronization',
44+
'maxDocumentSizeForSchemaInference',
45+
'minObjectPropertyCountToPreserve',
46+
'maxShownChildrenInGuiEditor',
47+
],
4248
additionalProperties: false,
4349
description: 'Performance related settings belong here.',
4450
properties: {
@@ -56,7 +62,21 @@ export const SETTINGS_SCHEMA: TopLevelSchema = {
5662
default: 1000000, // 1 MB
5763
minimum: 1000,
5864
},
59-
maximumShownChildrenInGuiEditor: {
65+
maxDocumentSizeForSchemaInference: {
66+
type: 'integer',
67+
description:
68+
'The maximum size of the document to infer the schema from in bytes. If the document is larger, a smart algorithm is used to trim the document first and then infer the schema from the smaller, trimmed input document.',
69+
default: 250000, // 250 KB
70+
minimum: 1000,
71+
},
72+
minObjectPropertyCountToPreserve: {
73+
type: 'integer',
74+
description:
75+
'When large documents are trimmed, this is the minimum count of object properties to be preserved. This is used to avoid trimming too much data from objects with many properties. The value can be increased in this setting if in your application more properties are cut than desired during the performance optimization.',
76+
default: 16,
77+
minimum: 16,
78+
},
79+
maxShownChildrenInGuiEditor: {
6080
type: 'integer',
6181
description:
6282
'The maximum amount of child nodes to be shown in the GUI editor per parent node. If the document has more children than this value, those will not be shown in the GUI editor, but still exist in the document and can be edited by other panels.',

meta_configurator/src/settings/settingsTypes.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ export interface SettingsInterfaceRoot {
2121
export interface SettingsInterfacePerformance {
2222
maxDocumentSizeForValidation: number; // in bytes
2323
maxDocumentSizeForCursorSynchronization: number; // in bytes
24-
maximumShownChildrenInGuiEditor: number;
24+
maxDocumentSizeForSchemaInference: number; // in bytes
25+
minObjectPropertyCountToPreserve: number; // when large document is trimmed, this is minimum count of object properties to be preserved
26+
maxShownChildrenInGuiEditor: number;
2527
}
2628

2729
export interface SettingsInterfaceCodeEditor {
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/**
 * Shrinks `data` so that its JSON serialization fits into a size budget.
 *
 * Data whose serialization already fits into `maxSizeInKiB` is returned unchanged (same
 * reference, no copy). Otherwise arrays are cut to at most `n` entries and objects to at most
 * `max(n * 8, minObjectPropertyCountToPreserve)` properties, starting with `n = 64` and halving
 * `n` until the result fits or `n` has reached 2. The result of the last attempt is returned
 * even if it is still larger than the budget.
 *
 * @param data JSON-like value (objects, arrays, primitives)
 * @param maxSizeInKiB size budget for the UTF-8 encoded JSON serialization, in KiB
 * @param minObjectPropertyCountToPreserve lower bound for the number of properties kept per object
 * @returns `data` itself if it is within the budget, otherwise a trimmed copy
 */
export function trimDataToMaxSize(
  data: any,
  maxSizeInKiB: number = 64,
  minObjectPropertyCountToPreserve: number = 16
): any {
  const encoder = new TextEncoder();
  const sizeInKiB = (value: any): number => encoder.encode(JSON.stringify(value)).length / 1024;

  // only trim if the document actually exceeds the budget; small documents keep all their entries
  if (sizeInKiB(data) <= maxSizeInKiB) {
    return data;
  }

  let n = 64;
  while (true) {
    const dataTrimmedArrays = trimDataToNEntriesPerArray(data, n);
    // Object properties are trimmed far more conservatively than array entries (n * 8, and never
    // below minObjectPropertyCountToPreserve): properties normally should not be cut at all,
    // except when the schema uses patternProperties or additionalProperties and the document
    // contains hundreds or millions of same-looking properties.
    const dataTrimmedBoth = trimDataToNPropertiesPerObject(
      dataTrimmedArrays,
      Math.max(n * 8, minObjectPropertyCountToPreserve)
    );
    // stop once the size is within the limit, or when n cannot sensibly be reduced any further
    if (sizeInKiB(dataTrimmedBoth) <= maxSizeInKiB || n <= 2) {
      return dataTrimmedBoth;
    }
    n = Math.floor(n / 2);
  }
}
28+
29+
/**
 * Returns a copy of `data` in which every array, at any nesting depth, is cut to its first
 * `n` entries.
 *
 * Objects keep all of their own enumerable properties (their values are processed recursively);
 * anything that is neither an array nor an object is returned as is.
 *
 * @param data JSON-like value with an arbitrary hierarchy of objects and arrays
 * @param n maximum number of entries to keep per array
 * @returns the trimmed copy
 */
function trimDataToNEntriesPerArray(data: any, n: number): any {
  // arrays: keep the first n entries; the kept entries may themselves contain arrays
  if (Array.isArray(data)) {
    const keptEntries = data.slice(0, Math.max(0, Math.ceil(n)));
    return keptEntries.map(item => trimDataToNEntriesPerArray(item, n));
  }

  // objects: keep every property, but trim the arrays nested inside the values.
  // Object.fromEntries defines plain data properties, so a key named "__proto__" is copied like
  // any other key instead of replacing the prototype of the new object.
  if (typeof data === 'object' && data !== null) {
    return Object.fromEntries(
      Object.keys(data).map(key => [key, trimDataToNEntriesPerArray(data[key], n)])
    );
  }

  // primitives and null
  return data;
}
60+
61+
/**
 * Returns a copy of `data` in which every object, at any nesting depth, is cut to its first
 * `n` own enumerable properties (in `Object.keys` order).
 *
 * Arrays keep all of their entries (the entries are processed recursively); anything that is
 * neither an array nor an object is returned as is.
 *
 * @param data JSON-like value with an arbitrary hierarchy of objects and arrays
 * @param n maximum number of properties to keep per object
 * @returns the trimmed copy
 */
function trimDataToNPropertiesPerObject(data: any, n: number): any {
  // arrays are not shortened here, but their items may contain objects
  if (Array.isArray(data)) {
    return data.map(item => trimDataToNPropertiesPerObject(item, n));
  }

  // objects: keep the first n keys and recurse into the kept values.
  // Object.fromEntries defines plain data properties, so a key named "__proto__" is copied like
  // any other key instead of replacing the prototype of the new object.
  if (typeof data === 'object' && data !== null) {
    const keptKeys = Object.keys(data).slice(0, Math.max(0, Math.ceil(n)));
    return Object.fromEntries(
      keptKeys.map(key => [key, trimDataToNPropertiesPerObject(data[key], n)])
    );
  }

  // primitives and null
  return data;
}

0 commit comments

Comments
 (0)