Commit 5302a42

Post batch collect processing support (#34)
* move AbstractStep from pth-10 to dpf-02; add post batchcollect step processing
* upgrade JDK to 11, spark libraries to 3.4.0
* add test for post-bc steps processing
* add post-bc specific limit ctor arg
* improve batchcollect ctor args
1 parent: 85e1603

8 files changed

Lines changed: 306 additions & 77 deletions


.github/workflows/coverity.yml

Lines changed: 2 additions & 2 deletions
@@ -15,10 +15,10 @@ jobs:
         with:
           fetch-depth: 0
 
-      - name: Set up JDK 8
+      - name: Set up JDK 11
         uses: actions/setup-java@v3
         with:
-          java-version: '8'
+          java-version: '11'
           distribution: 'temurin'
           server-id: github
           settings-path: ${{ github.workspace }}

.github/workflows/upload_release.yaml

Lines changed: 3 additions & 3 deletions
@@ -21,15 +21,15 @@ jobs:
       - name: Setup Signing
         uses: actions/setup-java@v2
         with:
-          java-version: 8.0.292+10
+          java-version: 11
           distribution: 'adopt'
           gpg-private-key: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }}
           gpg-passphrase: MAVEN_GPG_PASSPHRASE
 
       - name: Setup GitHub Packages
         uses: actions/setup-java@v2
         with:
-          java-version: 8.0.292+10
+          java-version: 11
           distribution: 'adopt'
           gpg-passphrase: MAVEN_GPG_PASSPHRASE
 
@@ -43,7 +43,7 @@ jobs:
       - name: Setup Maven Central
         uses: actions/setup-java@v2
         with:
-          java-version: 8.0.292+10
+          java-version: 11
           distribution: 'adopt'
 
           server-id: central-sonatype-org

pom.xml

Lines changed: 6 additions & 6 deletions
@@ -13,7 +13,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <java.version>1.8</java.version>
+    <java.version>11</java.version>
     <revision>0.0.1</revision>
     <changelist>-SNAPSHOT</changelist>
     <sha1/>
@@ -41,7 +41,7 @@
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_2.12</artifactId>
-      <version>2.4.5</version>
+      <version>3.4.0</version>
       <scope>provided</scope>
       <exclusions>
         <exclusion>
@@ -53,7 +53,7 @@
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_2.12</artifactId>
-      <version>2.4.5</version>
+      <version>3.4.0</version>
       <scope>provided</scope>
       <exclusions>
         <exclusion>
@@ -121,7 +121,7 @@
             <goal>jar</goal>
           </goals>
           <configuration>
-            <source>8</source>
+            <source>11</source>
           </configuration>
         </execution>
       </executions>
@@ -174,8 +174,8 @@
       <artifactId>maven-compiler-plugin</artifactId>
       <version>3.8.1</version>
       <configuration>
-        <source>8</source>
-        <target>8</target>
+        <source>11</source>
+        <target>11</target>
       </configuration>
     </plugin>
     <plugin>
src/main/java/com/teragrep/functions/dpf_02/AbstractStep.java

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+/*
+ * Teragrep Batch Collect DPF-02
+ * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.functions.dpf_02;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.streaming.StreamingQueryException;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public abstract class AbstractStep {
+
+    public enum CommandProperty {
+        USES_INTERNAL_BATCHCOLLECT, // Command has an internal batch collect, e.g. sort
+        IGNORE_DEFAULT_SORTING, // Command applies a certain order to the rows
+        SEQUENTIAL_ONLY, // Works only in Sequential mode (forEachBatch)
+        AGGREGATE, // If there are multiple aggregate commands, switch to sequential mode is necessary
+        REQUIRE_PRECEDING_AGGREGATE, // this command requires an aggregate command before it
+        NO_PRECEDING_AGGREGATE, // command does not allow an aggregate command before it
+        POST_BATCHCOLLECT, // command should be run post (after) batch collect
+    }
+
+    protected final Set<CommandProperty> properties = new HashSet<>();
+    protected boolean aggregatesUsedBefore = false;
+
+    public boolean hasProperty(CommandProperty prop) {
+        return properties.contains(prop);
+    }
+
+    private boolean addProperty(CommandProperty prop) {
+        return properties.add(prop);
+    }
+
+    public void setAggregatesUsedBefore(boolean aggregatesUsedBefore) {
+        this.aggregatesUsedBefore = aggregatesUsedBefore;
+    }
+
+    public AbstractStep() {
+
+    }
+
+    /**
+     * Perform the necessary dataframe operations for the implemented command
+     *
+     * @param dataset Dataset to operate on
+     * @return Dataframe, which has the operations applied
+     */
+    public abstract Dataset<Row> get(Dataset<Row> dataset) throws StreamingQueryException;
+}
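For orientation, a minimal sketch of a concrete step built on this class. The class name UppercaseStep and its column logic are illustrative assumptions, not part of this commit; only AbstractStep, CommandProperty, and the protected properties set come from the code above.

package com.teragrep.functions.dpf_02;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;

// Hypothetical example step: tags itself POST_BATCHCOLLECT and upper-cases one column
public class UppercaseStep extends AbstractStep {

    private final String column;

    public UppercaseStep(String column) {
        this.column = column;
        // properties is protected in AbstractStep, so a subclass can tag itself directly
        this.properties.add(CommandProperty.POST_BATCHCOLLECT);
    }

    @Override
    public Dataset<Row> get(Dataset<Row> dataset) {
        // Replace the column's values with their upper-cased form
        return dataset.withColumn(column, functions.upper(functions.col(column)));
    }
}

BatchCollect (below) invokes get() on each such step after the batch has been collected and limited.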

src/main/java/com/teragrep/functions/dpf_02/BatchCollect.java

Lines changed: 54 additions & 43 deletions
@@ -47,6 +47,7 @@
  */
 
 import org.apache.spark.sql.*;
+import org.apache.spark.sql.streaming.StreamingQueryException;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -57,22 +58,28 @@ public final class BatchCollect extends SortOperation {
     private static final Logger LOGGER = LoggerFactory.getLogger(BatchCollect.class);
     private Dataset<Row> savedDs = null;
     private Dataset<Row> lastRowDs = null;
+    private Dataset<Row> outputDs = null;
     private final String sortColumn;
-    private final int numberOfRows;
+    private final int defaultLimit;
+    private final int postBcLimit;
     private StructType inputSchema;
     private boolean sortedBySingleColumn = false;
 
-    public BatchCollect(String sortColumn, int numberOfRows) {
-        this(sortColumn, numberOfRows, new ArrayList<>());
+    public BatchCollect(String sortColumn, int defaultLimit) {
+        this(sortColumn, defaultLimit, 0, new ArrayList<>());
     }
 
-    public BatchCollect(String sortColumn, int numberOfRows, List<SortByClause> listOfSortByClauses) {
-        super(listOfSortByClauses);
+    public BatchCollect(String sortColumn, int defaultLimit, int postBcLimit) {
+        this(sortColumn, defaultLimit, postBcLimit, new ArrayList<>());
+    }
 
-        LOGGER.info("Initialized BatchCollect based on column " + sortColumn + " and a limit of " + numberOfRows + " row(s)." +
-                " SortByClauses included: " + (listOfSortByClauses != null ? listOfSortByClauses.size() : "<null>"));
+    public BatchCollect(String sortColumn, int defaultLimit, int postBcLimit, List<SortByClause> listOfSortByClauses) {
+        super(listOfSortByClauses);
+        LOGGER.info("Initialized BatchCollect based on column <[{}]> and a limit of <[{}]> row(s). SortByClauses included: <[{}]>. Post batchcollect limit of <[{}]> row(s)",
+                sortColumn, defaultLimit, (listOfSortByClauses != null ? listOfSortByClauses.size() : "null"), postBcLimit);
         this.sortColumn = sortColumn;
-        this.numberOfRows = numberOfRows;
+        this.defaultLimit = defaultLimit;
+        this.postBcLimit = postBcLimit;
     }
 
     /**
@@ -84,31 +91,35 @@ public BatchCollect(String sortColumn, int numberOfRows, List<SortByClause> list
      */
     public Dataset<Row> call(Dataset<Row> df, Long id, boolean skipLimiting) {
         Dataset<Row> rv;
-        if (skipLimiting) {
-            this.processAggregated(df);
-        }
-        else {
-            this.collect(df, id);
-        }
+        this.collect(df, id, Collections.emptyList(), skipLimiting);
 
         if (this.lastRowDs != null) {
-            rv = this.savedDs.union(this.lastRowDs);
+            rv = this.outputDs.union(this.lastRowDs);
         } else {
-            rv = this.savedDs;
+            rv = this.outputDs;
         }
 
         return rv;
     }
 
-    public void collect(Dataset<Row> batchDF, Long batchId) {
+    public void collect(Dataset<Row> batchDF, Long batchId, List<AbstractStep> postBcSteps, boolean skipLimiting) {
+        // Apply post-batchcollect limit if steps are present, otherwise use the default.
+        // limit<=0 means no limit
+        final int limit;
+        if (!postBcSteps.isEmpty()) {
+            limit = this.postBcLimit;
+        } else {
+            limit = this.defaultLimit;
+        }
+
         // check that sortColumn (_time) exists,
         // and get the sortColId
         // otherwise, no sorting will be done.
         if (this.inputSchema == null) {
             this.inputSchema = batchDF.schema();
         }
 
-        if (this.getListOfSortByClauses() == null || this.getListOfSortByClauses().size() < 1) {
+        if (this.getListOfSortByClauses() == null || this.getListOfSortByClauses().isEmpty()) {
             for (String field : this.inputSchema.fieldNames()) {
                 if (field.equals(this.sortColumn)) {
                     this.sortedBySingleColumn = true;
@@ -117,38 +128,37 @@ public void collect(Dataset<Row> batchDF, Long batchId) {
                 }
             }
         }
 
-        List<Row> collected = orderDataset(batchDF).limit(numberOfRows).collectAsList();
+        Dataset<Row> orderedDs = orderDataset(batchDF);
+        if (!skipLimiting && limit > 0) {
+            orderedDs = orderedDs.limit(limit);
+        }
+        List<Row> collected = orderedDs.collectAsList();
         Dataset<Row> createdDsFromCollected = SparkSession.builder().getOrCreate().createDataFrame(collected, this.inputSchema);
-
+        Dataset<Row> current;
         if (this.savedDs == null) {
-            this.savedDs = createdDsFromCollected;
+            current = createdDsFromCollected;
         }
         else {
-            this.savedDs = savedDs.union(createdDsFromCollected);
+            current = savedDs.union(createdDsFromCollected);
         }
 
-        this.savedDs = orderDataset(this.savedDs).limit(numberOfRows);
-
-    }
-
-    // Call this instead of collect to skip limiting (for aggregatesUsed=true)
-    // TODO remove this
-    public void processAggregated(Dataset<Row> ds) {
-        if (this.inputSchema == null) {
-            this.inputSchema = ds.schema();
+        current = orderDataset(current);
+        if (!skipLimiting && limit > 0) {
+            current = current.limit(limit);
         }
-
-        List<Row> collected = orderDataset(ds).collectAsList();
-        Dataset<Row> createdDsFromCollected = SparkSession.builder().getOrCreate().createDataFrame(collected, this.inputSchema);
-
-        if (this.savedDs == null) {
-            this.savedDs = createdDsFromCollected;
-        }
-        else {
-            this.savedDs = savedDs.union(createdDsFromCollected);
+        this.savedDs = current;
+
+        // Post batchCollect steps processing
+        Dataset<Row> rv = current;
+        for (final AbstractStep step : postBcSteps) {
+            try {
+                rv = step.get(rv);
+            } catch (StreamingQueryException e) {
+                throw new IllegalStateException("Exception occurred while running post-batchcollect steps: ", e);
+            }
         }
 
-        this.savedDs = orderDataset(this.savedDs);
+        this.outputDs = rv;
     }
 
     private Dataset<Row> orderDataset(Dataset<Row> ds) {
@@ -162,16 +172,17 @@ private Dataset<Row> orderDataset(Dataset<Row> ds) {
     public Dataset<Row> getCollectedAsDataframe() {
         Dataset<Row> rv;
         if (this.lastRowDs != null) {
-            rv = this.savedDs.union(this.lastRowDs);
+            rv = this.outputDs.union(this.lastRowDs);
         } else {
-            rv = this.savedDs;
+            rv = this.outputDs;
         }
         return rv;
     }
 
     public void clear() {
         LOGGER.info("dpf_02 cleared");
         this.savedDs = null;
+        this.outputDs = null;
         this.lastRowDs = null;
         this.inputSchema = null;
     }
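A sketch of how the new pieces might be wired together from a streaming query's foreachBatch. The BatchCollect constructor, the collect() signature, and getCollectedAsDataframe() are from this commit; the wrapper class, the limit values, the "host" column, and UppercaseStep (from the sketch above) are illustrative assumptions.

import com.teragrep.functions.dpf_02.AbstractStep;
import com.teragrep.functions.dpf_02.BatchCollect;
import com.teragrep.functions.dpf_02.UppercaseStep; // hypothetical step from the sketch above
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import java.util.ArrayList;
import java.util.List;

public class PostBcExample {

    // Intended to be called from foreachBatch(batchDF, batchId)
    public Dataset<Row> runBatch(Dataset<Row> batchDF, Long batchId) {
        // Sort by _time; default limit of 1000 rows, post-bc limit of 10000 rows
        // (a limit <= 0 would mean no limit)
        BatchCollect batchCollect = new BatchCollect("_time", 1000, 10000);

        List<AbstractStep> postBcSteps = new ArrayList<>();
        postBcSteps.add(new UppercaseStep("host"));

        // postBcSteps is non-empty, so postBcLimit applies and each step's get()
        // runs on the collected dataset before it is stored as the output
        batchCollect.collect(batchDF, batchId, postBcSteps, false);
        return batchCollect.getCollectedAsDataframe();
    }
}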
