Commit 5302a42

Post batch collect processing support (#34)
* move AbstractStep from pth-10 to dpf-02; add post batchcollect step processing
* upgrade JDK to 11, spark libraries to 3.4.0
* add test for post-bc steps processing
* add post-bc specific limit ctor arg
* improve batchcollect ctor args
1 parent: 85e1603

8 files changed

Lines changed: 306 additions & 77 deletions


.github/workflows/coverity.yml

Lines changed: 2 additions & 2 deletions
@@ -15,10 +15,10 @@ jobs:
         with:
           fetch-depth: 0
 
-      - name: Set up JDK 8
+      - name: Set up JDK 11
         uses: actions/setup-java@v3
         with:
-          java-version: '8'
+          java-version: '11'
           distribution: 'temurin'
           server-id: github
           settings-path: ${{ github.workspace }}

.github/workflows/upload_release.yaml

Lines changed: 3 additions & 3 deletions
@@ -21,15 +21,15 @@ jobs:
       - name: Setup Signing
         uses: actions/setup-java@v2
         with:
-          java-version: 8.0.292+10
+          java-version: 11
           distribution: 'adopt'
           gpg-private-key: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }}
           gpg-passphrase: MAVEN_GPG_PASSPHRASE
 
       - name: Setup GitHub Packages
         uses: actions/setup-java@v2
         with:
-          java-version: 8.0.292+10
+          java-version: 11
           distribution: 'adopt'
           gpg-passphrase: MAVEN_GPG_PASSPHRASE
 
@@ -43,7 +43,7 @@ jobs:
       - name: Setup Maven Central
         uses: actions/setup-java@v2
         with:
-          java-version: 8.0.292+10
+          java-version: 11
           distribution: 'adopt'
 
           server-id: central-sonatype-org

pom.xml

Lines changed: 6 additions & 6 deletions
@@ -13,7 +13,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <java.version>1.8</java.version>
+    <java.version>11</java.version>
     <revision>0.0.1</revision>
     <changelist>-SNAPSHOT</changelist>
     <sha1/>
@@ -41,7 +41,7 @@
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_2.12</artifactId>
-      <version>2.4.5</version>
+      <version>3.4.0</version>
       <scope>provided</scope>
       <exclusions>
         <exclusion>
@@ -53,7 +53,7 @@
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_2.12</artifactId>
-      <version>2.4.5</version>
+      <version>3.4.0</version>
       <scope>provided</scope>
       <exclusions>
         <exclusion>
@@ -121,7 +121,7 @@
             <goal>jar</goal>
           </goals>
           <configuration>
-            <source>8</source>
+            <source>11</source>
           </configuration>
         </execution>
       </executions>
@@ -174,8 +174,8 @@
       <artifactId>maven-compiler-plugin</artifactId>
       <version>3.8.1</version>
       <configuration>
-        <source>8</source>
-        <target>8</target>
+        <source>11</source>
+        <target>11</target>
       </configuration>
     </plugin>
     <plugin>
src/main/java/com/teragrep/functions/dpf_02/AbstractStep.java

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+/*
+ * Teragrep Batch Collect DPF-02
+ * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.functions.dpf_02;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.streaming.StreamingQueryException;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public abstract class AbstractStep {
+
+    public enum CommandProperty {
+        USES_INTERNAL_BATCHCOLLECT, // Command has an internal batch collect, e.g. sort
+        IGNORE_DEFAULT_SORTING, // Command applies a certain order to the rows
+        SEQUENTIAL_ONLY, // Works only in Sequential mode (forEachBatch)
+        AGGREGATE, // If there are multiple aggregate commands, switch to sequential mode is necessary
+        REQUIRE_PRECEDING_AGGREGATE, // this command requires an aggregate command before it
+        NO_PRECEDING_AGGREGATE, // command does not allow an aggregate command before it
+        POST_BATCHCOLLECT, // command should be run post (after) batch collect
+    }
+
+    protected final Set<CommandProperty> properties = new HashSet<>();
+    protected boolean aggregatesUsedBefore = false;
+
+    public boolean hasProperty(CommandProperty prop) {
+        return properties.contains(prop);
+    }
+
+    private boolean addProperty(CommandProperty prop) {
+        return properties.add(prop);
+    }
+
+    public void setAggregatesUsedBefore(boolean aggregatesUsedBefore) {
+        this.aggregatesUsedBefore = aggregatesUsedBefore;
+    }
+
+    public AbstractStep() {
+
+    }
+
+    /**
+     * Perform the necessary dataframe operations for the implemented command
+     *
+     * @param dataset Dataset to operate on
+     * @return Dataframe, which has the operations applied
+     */
+    public abstract Dataset<Row> get(Dataset<Row> dataset) throws StreamingQueryException;
+}
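For orientation, a minimal sketch of a concrete step built on this class. The class name UppercaseStep and its column logic are illustrative assumptions, not part of this commit; only AbstractStep, CommandProperty, and the protected properties set come from the code above.

package com.teragrep.functions.dpf_02;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;

// Hypothetical example step: tags itself POST_BATCHCOLLECT and upper-cases one column
public class UppercaseStep extends AbstractStep {

    private final String column;

    public UppercaseStep(String column) {
        this.column = column;
        // properties is protected in AbstractStep, so a subclass can tag itself directly
        this.properties.add(CommandProperty.POST_BATCHCOLLECT);
    }

    @Override
    public Dataset<Row> get(Dataset<Row> dataset) {
        // Replace the column's values with their upper-cased form
        return dataset.withColumn(column, functions.upper(functions.col(column)));
    }
}

BatchCollect (below) invokes get() on each such step after the batch has been collected and limited.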

src/main/java/com/teragrep/functions/dpf_02/BatchCollect.java

Lines changed: 54 additions & 43 deletions
@@ -47,6 +47,7 @@
  */
 
 import org.apache.spark.sql.*;
+import org.apache.spark.sql.streaming.StreamingQueryException;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -57,22 +58,28 @@ public final class BatchCollect extends SortOperation {
     private static final Logger LOGGER = LoggerFactory.getLogger(BatchCollect.class);
     private Dataset<Row> savedDs = null;
     private Dataset<Row> lastRowDs = null;
+    private Dataset<Row> outputDs = null;
     private final String sortColumn;
-    private final int numberOfRows;
+    private final int defaultLimit;
+    private final int postBcLimit;
     private StructType inputSchema;
     private boolean sortedBySingleColumn = false;
 
-    public BatchCollect(String sortColumn, int numberOfRows) {
-        this(sortColumn, numberOfRows, new ArrayList<>());
+    public BatchCollect(String sortColumn, int defaultLimit) {
+        this(sortColumn, defaultLimit, 0, new ArrayList<>());
     }
 
-    public BatchCollect(String sortColumn, int numberOfRows, List<SortByClause> listOfSortByClauses) {
-        super(listOfSortByClauses);
+    public BatchCollect(String sortColumn, int defaultLimit, int postBcLimit) {
+        this(sortColumn, defaultLimit, postBcLimit, new ArrayList<>());
+    }
 
-        LOGGER.info("Initialized BatchCollect based on column " + sortColumn + " and a limit of " + numberOfRows + " row(s)." +
-                " SortByClauses included: " + (listOfSortByClauses != null ? listOfSortByClauses.size() : "<null>"));
+    public BatchCollect(String sortColumn, int defaultLimit, int postBcLimit, List<SortByClause> listOfSortByClauses) {
+        super(listOfSortByClauses);
+        LOGGER.info("Initialized BatchCollect based on column <[{}]> and a limit of <[{}]> row(s). SortByClauses included: <[{}]>. Post batchcollect limit of <[{}]> row(s)",
+                sortColumn, defaultLimit, (listOfSortByClauses != null ? listOfSortByClauses.size() : "null"), postBcLimit);
         this.sortColumn = sortColumn;
-        this.numberOfRows = numberOfRows;
+        this.defaultLimit = defaultLimit;
+        this.postBcLimit = postBcLimit;
     }
 
     /**
@@ -84,31 +91,35 @@ public BatchCollect(String sortColumn, int numberOfRows, List<SortByClause> list
      */
     public Dataset<Row> call(Dataset<Row> df, Long id, boolean skipLimiting) {
         Dataset<Row> rv;
-        if (skipLimiting) {
-            this.processAggregated(df);
-        }
-        else {
-            this.collect(df, id);
-        }
+        this.collect(df, id, Collections.emptyList(), skipLimiting);
 
         if (this.lastRowDs != null) {
-            rv = this.savedDs.union(this.lastRowDs);
+            rv = this.outputDs.union(this.lastRowDs);
         } else {
-            rv = this.savedDs;
+            rv = this.outputDs;
         }
 
         return rv;
     }
 
-    public void collect(Dataset<Row> batchDF, Long batchId) {
+    public void collect(Dataset<Row> batchDF, Long batchId, List<AbstractStep> postBcSteps, boolean skipLimiting) {
+        // Apply post-batchcollect limit if steps are present, otherwise use the default.
+        // limit<=0 means no limit
+        final int limit;
+        if (!postBcSteps.isEmpty()) {
+            limit = this.postBcLimit;
+        } else {
+            limit = this.defaultLimit;
+        }
+
         // check that sortColumn (_time) exists,
         // and get the sortColId
         // otherwise, no sorting will be done.
         if (this.inputSchema == null) {
             this.inputSchema = batchDF.schema();
         }
 
-        if (this.getListOfSortByClauses() == null || this.getListOfSortByClauses().size() < 1) {
+        if (this.getListOfSortByClauses() == null || this.getListOfSortByClauses().isEmpty()) {
             for (String field : this.inputSchema.fieldNames()) {
                 if (field.equals(this.sortColumn)) {
                     this.sortedBySingleColumn = true;
@@ -117,38 +128,37 @@ public void collect(Dataset<Row> batchDF, Long batchId) {
                 }
             }
         }
 
-        List<Row> collected = orderDataset(batchDF).limit(numberOfRows).collectAsList();
+        Dataset<Row> orderedDs = orderDataset(batchDF);
+        if (!skipLimiting && limit > 0) {
+            orderedDs = orderedDs.limit(limit);
+        }
+        List<Row> collected = orderedDs.collectAsList();
         Dataset<Row> createdDsFromCollected = SparkSession.builder().getOrCreate().createDataFrame(collected, this.inputSchema);
-
+        Dataset<Row> current;
         if (this.savedDs == null) {
-            this.savedDs = createdDsFromCollected;
+            current = createdDsFromCollected;
         }
         else {
-            this.savedDs = savedDs.union(createdDsFromCollected);
+            current = savedDs.union(createdDsFromCollected);
         }
 
-        this.savedDs = orderDataset(this.savedDs).limit(numberOfRows);
-
-    }
-
-    // Call this instead of collect to skip limiting (for aggregatesUsed=true)
-    // TODO remove this
-    public void processAggregated(Dataset<Row> ds) {
-        if (this.inputSchema == null) {
-            this.inputSchema = ds.schema();
+        current = orderDataset(current);
+        if (!skipLimiting && limit > 0) {
+            current = current.limit(limit);
         }
-
-        List<Row> collected = orderDataset(ds).collectAsList();
-        Dataset<Row> createdDsFromCollected = SparkSession.builder().getOrCreate().createDataFrame(collected, this.inputSchema);
-
-        if (this.savedDs == null) {
-            this.savedDs = createdDsFromCollected;
-        }
-        else {
-            this.savedDs = savedDs.union(createdDsFromCollected);
+        this.savedDs = current;
+
+        // Post batchCollect steps processing
+        Dataset<Row> rv = current;
+        for (final AbstractStep step : postBcSteps) {
+            try {
+                rv = step.get(rv);
+            } catch (StreamingQueryException e) {
+                throw new IllegalStateException("Exception occurred while running post-batchcollect steps: ", e);
+            }
         }
 
-        this.savedDs = orderDataset(this.savedDs);
+        this.outputDs = rv;
     }
 
     private Dataset<Row> orderDataset(Dataset<Row> ds) {
@@ -162,16 +172,17 @@ private Dataset<Row> orderDataset(Dataset<Row> ds) {
     public Dataset<Row> getCollectedAsDataframe() {
         Dataset<Row> rv;
         if (this.lastRowDs != null) {
-            rv = this.savedDs.union(this.lastRowDs);
+            rv = this.outputDs.union(this.lastRowDs);
         } else {
-            rv = this.savedDs;
+            rv = this.outputDs;
         }
         return rv;
     }
 
     public void clear() {
         LOGGER.info("dpf_02 cleared");
         this.savedDs = null;
+        this.outputDs = null;
         this.lastRowDs = null;
         this.inputSchema = null;
     }
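A sketch of how the new pieces might be wired together from a streaming query's foreachBatch. The BatchCollect constructor, the collect() signature, and getCollectedAsDataframe() are from this commit; the wrapper class, the limit values, the "host" column, and UppercaseStep (from the sketch above) are illustrative assumptions.

import com.teragrep.functions.dpf_02.AbstractStep;
import com.teragrep.functions.dpf_02.BatchCollect;
import com.teragrep.functions.dpf_02.UppercaseStep; // hypothetical step from the sketch above
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import java.util.ArrayList;
import java.util.List;

public class PostBcExample {

    // Intended to be called from foreachBatch(batchDF, batchId)
    public Dataset<Row> runBatch(Dataset<Row> batchDF, Long batchId) {
        // Sort by _time; default limit of 1000 rows, post-bc limit of 10000 rows
        // (a limit <= 0 would mean no limit)
        BatchCollect batchCollect = new BatchCollect("_time", 1000, 10000);

        List<AbstractStep> postBcSteps = new ArrayList<>();
        postBcSteps.add(new UppercaseStep("host"));

        // postBcSteps is non-empty, so postBcLimit applies and each step's get()
        // runs on the collected dataset before it is stored as the output
        batchCollect.collect(batchDF, batchId, postBcSteps, false);
        return batchCollect.getCollectedAsDataframe();
    }
}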
