Skip to content

Commit d612f67

Browse files
eemhu and kortemik authored
Add post batchcollect mode and dedup implemented using flatMap (#636)
* move abstractStep to dpf_02, add aggregate after dedup test, add post batchcollect mode
* dedup: add DedupState state object, change DedupStep to use flatMapGroupsWithState, fix maxDuplicates default value, remove AbstractDedupStep
* convert internal batch collect commands to post batchcollect commands
* remove commented-out code from SortStep
* remove commented-out code from AddtotalsStep
* disable dedup consecutive parameter test as it is not implemented
* Upgrade JDK8 to JDK11
* remove unused batchcollects
* add 'dpl.pth_10.postbc.limit.size' config item to configure post-batchcollect limit size
* don't generate empty timechart buckets if post bc mode is used
* apply spotless
* Update dpf_02 to 4.0.0

---------

Co-authored-by: Mikko Kortelainen <[email protected]>
1 parent e2af26a commit d612f67

76 files changed

Lines changed: 299 additions & 474 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ jobs:
1919
- name: Setup Java
2020
uses: actions/setup-java@v4
2121
with:
22-
java-version: 8.0.292+10
22+
java-version: 11
2323
distribution: 'adopt'
2424

2525
- name: Install test dependencies

.github/workflows/coverity.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ jobs:
2525
- name: Setup Maven Central
2626
uses: actions/setup-java@v4
2727
with:
28-
java-version: 8.0.292+10
28+
java-version: 11
2929
distribution: 'adopt'
3030

3131
- name: Cache Local Maven Repository

.github/workflows/upload_release_github_attachment.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ jobs:
2525
- name: Setup Java
2626
uses: actions/setup-java@v4
2727
with:
28-
java-version: 8.0.292+10
28+
java-version: 11
2929
distribution: 'adopt'
3030

3131
- name: Install test dependencies

.github/workflows/upload_release_github_packages.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,13 +25,13 @@ jobs:
2525
- name: Setup Signing
2626
uses: actions/setup-java@v4
2727
with:
28-
java-version: 8.0.292+10
28+
java-version: 11
2929
distribution: 'adopt'
3030

3131
- name: Setup GitHub Packages
3232
uses: actions/setup-java@v4
3333
with:
34-
java-version: 8.0.292+10
34+
java-version: 11
3535
distribution: 'adopt'
3636

3737
- name: Install test dependencies

pom.xml

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -41,14 +41,14 @@
4141
</scm>
4242
<properties>
4343
<changelist>-SNAPSHOT</changelist>
44-
<java.version>1.8</java.version>
44+
<java.version>11</java.version>
4545
<junit.jupiter.version>5.7.1</junit.jupiter.version>
4646
<maven.compiler.source>1.8</maven.compiler.source>
4747
<maven.compiler.target>1.8</maven.compiler.target>
4848
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
4949
<revision>0.0.1</revision>
5050
<sha1></sha1>
51-
<teragrep.dpf_02.version>3.0.0</teragrep.dpf_02.version>
51+
<teragrep.dpf_02.version>4.0.0</teragrep.dpf_02.version>
5252
<teragrep.dpf_03.version>11.0.1</teragrep.dpf_03.version>
5353
<teragrep.jpr_01.version>3.1.1</teragrep.jpr_01.version>
5454
<teragrep.jue_01.version>0.4.3</teragrep.jue_01.version>
@@ -334,7 +334,7 @@
334334
<version>3.2.5</version>
335335
</requireMavenVersion>
336336
<requireJavaVersion>
337-
<version>[1.8,1.9)</version>
337+
<version>[11,12)</version>
338338
</requireJavaVersion>
339339
<banDynamicVersions></banDynamicVersions>
340340
<requirePluginVersions>
@@ -373,7 +373,7 @@
373373
<goal>jar</goal>
374374
</goals>
375375
<configuration>
376-
<source>8</source>
376+
<source>11</source>
377377
</configuration>
378378
</execution>
379379
</executions>
@@ -437,8 +437,8 @@
437437
<artifactId>maven-compiler-plugin</artifactId>
438438
<version>3.8.1</version>
439439
<configuration>
440-
<source>8</source>
441-
<target>8</target>
440+
<source>11</source>
441+
<target>11</target>
442442
</configuration>
443443
</plugin>
444444
<plugin>

src/main/java/com/teragrep/pth10/ast/DPLParserCatalystContext.java

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@
4545
*/
4646
package com.teragrep.pth10.ast;
4747

48-
import com.teragrep.pth10.steps.AbstractStep;
48+
import com.teragrep.functions.dpf_02.AbstractStep;
4949
import com.teragrep.pth10.steps.Flushable;
5050
import com.typesafe.config.Config;
5151
import org.apache.spark.sql.Dataset;
@@ -209,6 +209,16 @@ public Integer getDplRecallSize() {
209209
return dplRecallSize;
210210
}
211211

212+
public Integer postBcLimitSize() {
213+
if (config != null && config.hasPath("dpl.pth_10.postbc.limit.size")) {
214+
return config.getInt("dpl.pth_10.postbc.limit.size");
215+
}
216+
else {
217+
// default to no limit if no config available
218+
return 0;
219+
}
220+
}
221+
212222
private String baseUrl = null;
213223
private String paragraphUrl = null;
214224
private String notebookUrl = null;

src/main/java/com/teragrep/pth10/ast/DPLParserCatalystVisitor.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,13 +45,13 @@
4545
*/
4646
package com.teragrep.pth10.ast;
4747

48+
import com.teragrep.functions.dpf_02.AbstractStep;
4849
import com.teragrep.pth10.ast.bo.*;
4950
import com.teragrep.pth10.ast.bo.Token.Type;
5051
import com.teragrep.pth10.ast.commands.logicalstatement.LogicalStatementCatalyst;
5152
import com.teragrep.pth10.ast.commands.logicalstatement.LogicalStatementXML;
5253
import com.teragrep.pth10.ast.commands.logicalstatement.TimeStatement;
5354
import com.teragrep.pth10.ast.commands.transformstatement.TransformStatement;
54-
import com.teragrep.pth10.steps.AbstractStep;
5555
import com.teragrep.pth10.steps.EmptyDataframeStep;
5656
import com.teragrep.pth10.steps.logicalCatalyst.LogicalCatalystStep;
5757
import com.teragrep.pth10.steps.subsearch.SubsearchStep;

src/main/java/com/teragrep/pth10/ast/StepList.java

Lines changed: 52 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -45,8 +45,8 @@
4545
*/
4646
package com.teragrep.pth10.ast;
4747

48+
import com.teragrep.functions.dpf_02.AbstractStep;
4849
import com.teragrep.functions.dpf_02.BatchCollect;
49-
import com.teragrep.pth10.steps.AbstractStep;
5050
import org.apache.spark.api.java.function.VoidFunction2;
5151
import org.apache.spark.sql.Dataset;
5252
import org.apache.spark.sql.Row;
@@ -69,17 +69,20 @@
6969

7070
public class StepList implements VoidFunction2<Dataset<Row>, Long> {
7171

72+
private enum BreakpointType {
73+
SEQUENTIAL, POST_BC
74+
}
75+
7276
private static final Logger LOGGER = LoggerFactory.getLogger(StepList.class);
7377
private final List<AbstractStep> list;
74-
private int breakpoint = -1;
78+
private final Map<BreakpointType, Integer> breakpoints;
7579
private int aggregateCount = 0;
7680
private boolean useInternalBatchCollect = false;
7781
private boolean ignoreDefaultSorting = false;
7882

7983
private OutputMode outputMode = OutputMode.Append();
8084
private Consumer<Dataset<Row>> batchHandler = null; // for UI
8185
private BatchCollect batchCollect; // standard batchCollect, used before sending batch event
82-
private BatchCollect sequentialModeBatchCollect; // used if in append mode and in sequential, to allow aggregates in sequential mode
8386
private DPLParserCatalystVisitor catVisitor;
8487

8588
public void setBatchCollect(BatchCollect batchCollect) {
@@ -90,6 +93,10 @@ public void setBatchHandler(Consumer<Dataset<Row>> batchHandler) {
9093
this.batchHandler = batchHandler;
9194
}
9295

96+
public BatchCollect batchCollect() {
97+
return this.batchCollect;
98+
}
99+
93100
@Deprecated
94101
public Consumer<Dataset<Row>> getBatchHandler() {
95102
return batchHandler;
@@ -102,8 +109,12 @@ public void setCatVisitor(DPLParserCatalystVisitor catVisitor) {
102109
public StepList(DPLParserCatalystVisitor catVisitor) {
103110
this.list = new ArrayList<>();
104111
this.catVisitor = catVisitor;
105-
this.batchCollect = new BatchCollect("_time", catVisitor.getCatalystContext().getDplRecallSize());
106-
this.sequentialModeBatchCollect = new BatchCollect(null, catVisitor.getCatalystContext().getDplRecallSize());
112+
this.batchCollect = new BatchCollect(
113+
"_time",
114+
catVisitor.getCatalystContext().getDplRecallSize(),
115+
catVisitor.getCatalystContext().postBcLimitSize()
116+
);
117+
this.breakpoints = new HashMap<>();
107118
}
108119

109120
/**
@@ -180,7 +191,11 @@ public Dataset<Row> executeSubsearch(Dataset<Row> ds) throws StreamingQueryExcep
180191
private DataStreamWriter<Row> executeFromStep(int fromStepIndex, Dataset<Row> ds) throws StreamingQueryException {
181192
for (int i = fromStepIndex; i < this.list.size(); i++) {
182193
AbstractStep step = this.list.get(i);
183-
if (i == breakpoint) {
194+
if (
195+
(breakpoints.containsKey(BreakpointType.SEQUENTIAL) && breakpoints
196+
.get(BreakpointType.SEQUENTIAL) == i) || (breakpoints.containsKey(BreakpointType.POST_BC)
197+
&& breakpoints.get(BreakpointType.POST_BC) == i)
198+
) {
184199
// Switch to sequential; aka run the step inside forEachBatch
185200
LOGGER.debug("breakpoint encountered at index <{}>", i);
186201

@@ -193,12 +208,12 @@ private DataStreamWriter<Row> executeFromStep(int fromStepIndex, Dataset<Row> ds
193208
}
194209

195210
private Dataset<Row> executeInBatch(Dataset<Row> ds) throws StreamingQueryException {
196-
if (breakpoint == -1) { // no sequential ops
211+
if (!breakpoints.containsKey(BreakpointType.SEQUENTIAL)) { // no sequential ops
197212
return ds;
198213
}
199214

200215
// sequential ops found
201-
for (int i = breakpoint; i < this.list.size(); i++) {
216+
for (int i = breakpoints.get(BreakpointType.SEQUENTIAL); i < this.list.size(); i++) {
202217
AbstractStep step = this.list.get(i);
203218
LOGGER.info("Executing seq ops in batch: <{}>", step.toString());
204219
ds = step.get(ds);
@@ -216,7 +231,7 @@ private void analyze() {
216231

217232
for (int i = 0; i < this.list.size(); i++) {
218233
AbstractStep step = this.list.get(i);
219-
234+
LOGGER.info("Analyzing step: <{}>", step.toString());
220235
step.setAggregatesUsedBefore(aggregateCount > 0);
221236

222237
if (step.hasProperty(AbstractStep.CommandProperty.USES_INTERNAL_BATCHCOLLECT)) {
@@ -228,7 +243,11 @@ private void analyze() {
228243
if (step.hasProperty(AbstractStep.CommandProperty.IGNORE_DEFAULT_SORTING)) {
229244
LOGGER.info("[Analyze] Ignore default sorting: <{}>", step);
230245
this.ignoreDefaultSorting = true;
231-
this.batchCollect = new BatchCollect(null, catVisitor.getDPLRecallSize());
246+
this.batchCollect = new BatchCollect(
247+
null,
248+
catVisitor.getDPLRecallSize(),
249+
catVisitor.getCatalystContext().postBcLimitSize()
250+
);
232251
}
233252

234253
if (step.hasProperty(AbstractStep.CommandProperty.REQUIRE_PRECEDING_AGGREGATE)) {
@@ -246,20 +265,29 @@ private void analyze() {
246265
if (step.hasProperty(AbstractStep.CommandProperty.SEQUENTIAL_ONLY)) {
247266
LOGGER.info("[Analyze] Sequential only command: <{}>", step);
248267
// set the breakpoint just once
249-
if (breakpoint == -1) {
250-
breakpoint = i;
268+
if (!breakpoints.containsKey(BreakpointType.SEQUENTIAL)) {
269+
breakpoints.put(BreakpointType.SEQUENTIAL, i);
251270
}
252271
}
253272
else if (step.hasProperty(AbstractStep.CommandProperty.AGGREGATE)) {
254273
LOGGER.info("[Analyze] Aggregate command: <{}>", step);
255274
aggregateCount++;
256275

257276
// set the breakpoint just once
258-
if (aggregateCount > 0 && breakpoint == -1) {
259-
breakpoint = i + 1;
277+
if (
278+
aggregateCount > 0 && !breakpoints.containsKey(BreakpointType.SEQUENTIAL)
279+
&& !breakpoints.containsKey(BreakpointType.POST_BC)
280+
) {
281+
breakpoints.put(BreakpointType.SEQUENTIAL, i + 1);
260282
outputMode = OutputMode.Complete();
261283
}
262284
}
285+
else if (step.hasProperty(AbstractStep.CommandProperty.POST_BATCHCOLLECT)) {
286+
if (!breakpoints.containsKey(BreakpointType.POST_BC)) {
287+
LOGGER.info("[Analyze] Post batch collect command: <{}>", step);
288+
breakpoints.put(BreakpointType.POST_BC, i);
289+
}
290+
}
263291
}
264292
}
265293

@@ -282,7 +310,11 @@ else if (this.batchCollect == null) {
282310
}
283311
else {
284312
LOGGER.info("------------------ Aggregates NOT USED (before seq. switch), using batchCollect!");
285-
this.batchCollect.collect(ds, id);
313+
int index = this.list.size();
314+
if (breakpoints.containsKey(BreakpointType.POST_BC)) {
315+
index = breakpoints.get(BreakpointType.POST_BC);
316+
}
317+
this.batchCollect.collect(ds, id, this.list.subList(index, this.list.size()), false);
286318
this.batchHandler.accept(batchCollect.getCollectedAsDataframe());
287319
}
288320
}
@@ -292,7 +324,10 @@ public void call(Dataset<Row> batchDF, Long batchId) throws StreamingQueryExcept
292324
LOGGER.info("StepList batch processing received a new batch <{}>", batchId);
293325

294326
// timechart empty buckets
295-
if (catVisitor.getCatalystContext().getTimeChartSpanSeconds() != null) {
327+
if (
328+
catVisitor.getCatalystContext().getTimeChartSpanSeconds() != null
329+
&& !breakpoints.containsKey(BreakpointType.POST_BC)
330+
) {
296331
// create spans
297332
final long min = catVisitor.getCatalystContext().getDplMinimumEarliest();
298333
final long max = catVisitor.getCatalystContext().getDplMaximumLatest();
@@ -346,11 +381,7 @@ else if (dataType == DataTypes.FloatType) {
346381

347382
// Continue sub list of steps execution, if necessary
348383
if (!this.list.isEmpty()) {
349-
LOGGER
350-
.info(
351-
"StepList batch processing - Continuing execution to next ops after breakpoint index: <{}>",
352-
breakpoint
353-
);
384+
LOGGER.info("StepList batch processing - Continuing execution to next ops after breakpoint index");
354385

355386
Dataset<Row> ret = this.executeInBatch(batchDF);
356387

src/main/java/com/teragrep/pth10/ast/bo/StepListNode.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@
4545
*/
4646
package com.teragrep.pth10.ast.bo;
4747

48-
import com.teragrep.pth10.steps.AbstractStep;
48+
import com.teragrep.functions.dpf_02.AbstractStep;
4949

5050
import java.util.ArrayList;
5151
import java.util.List;

src/main/java/com/teragrep/pth10/ast/bo/StepNode.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@
4545
*/
4646
package com.teragrep.pth10.ast.bo;
4747

48-
import com.teragrep.pth10.steps.AbstractStep;
48+
import com.teragrep.functions.dpf_02.AbstractStep;
4949

5050
public class StepNode extends Node {
5151

0 commit comments

Comments
 (0)