
Commit 85e1603

Fix BatchCollect limiting the dataset even if skipLimiting is set to true (#23)
* Fix limiting the dataset even if skipLimiting is set to true
* Remove getCollected(), fix BatchCollect tests
1 parent 2396554 commit 85e1603

2 files changed

Lines changed: 55 additions & 84 deletions
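
The intent of the change, sketched below: the per-batch row limit should only be applied when the caller has not asked to skip it, instead of unconditionally inside processAggregated(). This is an illustrative fragment, not the actual BatchCollect source; call(), orderDataset() and numberOfRows appear in the diff, while collectAndSave() is a hypothetical stand-in for the rest of the aggregation path.

    // Illustrative sketch only (assumed structure, not the real implementation)
    public void call(Dataset<Row> batchDF, Long batchId, boolean skipLimiting) {
        Dataset<Row> ordered = orderDataset(batchDF);
        if (!skipLimiting) {
            // truncate to the configured window only when limiting is wanted
            ordered = ordered.limit(numberOfRows);
        }
        // before this commit, processAggregated() re-applied .limit(numberOfRows)
        // unconditionally, so skipLimiting had no effect on the collected result
        collectAndSave(ordered); // hypothetical helper
    }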


src/main/java/com/teragrep/functions/dpf_02/BatchCollect.java

Lines changed: 2 additions & 7 deletions
@@ -138,7 +138,7 @@ public void processAggregated(Dataset<Row> ds) {
             this.inputSchema = ds.schema();
         }

-        List<Row> collected = orderDataset(ds).limit(numberOfRows).collectAsList();
+        List<Row> collected = orderDataset(ds).collectAsList();
         Dataset<Row> createdDsFromCollected = SparkSession.builder().getOrCreate().createDataFrame(collected, this.inputSchema);

         if (this.savedDs == null) {
@@ -148,7 +148,7 @@ public void processAggregated(Dataset<Row> ds) {
             this.savedDs = savedDs.union(createdDsFromCollected);
         }

-        this.savedDs = orderDataset(this.savedDs).limit(numberOfRows);
+        this.savedDs = orderDataset(this.savedDs);
     }

     private Dataset<Row> orderDataset(Dataset<Row> ds) {
@@ -159,11 +159,6 @@ private Dataset<Row> orderDataset(Dataset<Row> ds) {
         }
     }

-    // TODO: Remove
-    public List<Row> getCollected() {
-        return getCollectedAsDataframe().collectAsList();
-    }
-
     public Dataset<Row> getCollectedAsDataframe() {
         Dataset<Row> rv;
         if (this.lastRowDs != null) {
src/test/java/BatchCollectTest.java

Lines changed: 53 additions & 77 deletions
@@ -51,7 +51,6 @@
 import org.apache.spark.sql.catalyst.encoders.RowEncoder;
 import org.apache.spark.sql.execution.streaming.MemoryStream;
 import org.apache.spark.sql.streaming.StreamingQuery;
-import org.apache.spark.sql.streaming.StreamingQueryException;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.MetadataBuilder;
 import org.apache.spark.sql.types.StructField;
@@ -66,9 +65,6 @@
 import java.time.LocalDateTime;
 import java.time.ZoneOffset;
 import java.util.ArrayList;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.TreeMap;

 public class BatchCollectTest {

@@ -87,11 +83,10 @@ public class BatchCollectTest {
             new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build())
         }
     );
-
-    //@Test
-    public void testCollect() throws StreamingQueryException, InterruptedException {
-
-        SparkSession sparkSession = SparkSession.builder().master("local[*]").getOrCreate();
+
+    @Test
+    public void testCollectAsDataframe() {
+        SparkSession sparkSession = SparkSession.builder().master("local[*]").getOrCreate();
         SQLContext sqlContext = sparkSession.sqlContext();

         sparkSession.sparkContext().setLogLevel("ERROR");
@@ -102,13 +97,11 @@ public void testCollect() throws StreamingQueryException, InterruptedException {

         BatchCollect batchCollect = new BatchCollect("_time", 100, null);
         Dataset<Row> rowDataset = rowMemoryStream.toDF();
-        StreamingQuery streamingQuery = startStream(rowDataset, batchCollect);
+        StreamingQuery streamingQuery = startStream(rowDataset, batchCollect, false);

         long run = 0;
-        long counter = 1;
+        long counter = 0;
         while (streamingQuery.isActive()) {
-            //System.out.println(batchCollect.getCollected().size());
-
             Timestamp time = Timestamp.valueOf(LocalDateTime.ofInstant(Instant.now(), ZoneOffset.UTC));
             if (run == 3) {
                 // make run 3 to be latest always
@@ -136,31 +129,32 @@ public void testCollect() throws StreamingQueryException, InterruptedException {
                 counter = 0;
             }
             counter++;
-            streamingQuery.processAllAvailable();

             if (run == 10) {
                 // 10 runs only
                 // wait until the source feeds them all?
                 // TODO there must be a better way?
-                // streamingQuery.processAllAvailable();
-                streamingQuery.stop();
-                streamingQuery.awaitTermination();
+                streamingQuery.processAllAvailable();
+                streamingQuery.stop();
+                Assertions.assertDoesNotThrow(() -> streamingQuery.awaitTermination());
             }
         }

-
-        LinkedList<Integer> runs = new LinkedList<>();
-        runs.add(3);
-        runs.add(6);
-        runs.add(7);
-        runs.add(8);
-        runs.add(9);
-        verifyRuns(batchCollect, runs);
+        Dataset<Row> collectedAsDF = batchCollect.getCollectedAsDataframe();
+        Assertions.assertEquals(100, collectedAsDF.count());
+
+        // assert that batches are correct (the newest 100 rows of data)
+        // batch number 3 is the newest in the test, others are in the order of creation
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("3")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("6")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("7")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("8")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("9")).count());
     }
-
+
     @Test
-    public void testCollectAsDataframe() throws StreamingQueryException, InterruptedException {
-        SparkSession sparkSession = SparkSession.builder().master("local[*]").getOrCreate();
+    public void testSkipLimiting() {
+        SparkSession sparkSession = SparkSession.builder().master("local[*]").getOrCreate();
         SQLContext sqlContext = sparkSession.sqlContext();

         sparkSession.sparkContext().setLogLevel("ERROR");
@@ -169,19 +163,25 @@ public void testCollectAsDataframe() throws StreamingQueryException, InterruptedException {
         MemoryStream<Row> rowMemoryStream =
                 new MemoryStream<>(1, sqlContext, encoder);

-        BatchCollect batchCollect = new BatchCollect("_time", 100, null);
+        BatchCollect batchCollect = new BatchCollect("_time", 5, new ArrayList<>());
         Dataset<Row> rowDataset = rowMemoryStream.toDF();
-        StreamingQuery streamingQuery = startStream(rowDataset, batchCollect);
+
+        // Skip limiting here
+        StreamingQuery streamingQuery = startStream(rowDataset, batchCollect, true);

         long run = 0;
         long counter = 0;
         while (streamingQuery.isActive()) {
-            //System.out.println(batchCollect.getCollected().size());
-
             Timestamp time = Timestamp.valueOf(LocalDateTime.ofInstant(Instant.now(), ZoneOffset.UTC));
             if (run == 3) {
                 // make run 3 to be latest always
                 time = Timestamp.valueOf(LocalDateTime.ofInstant(Instant.ofEpochSecond(13851486065L+counter), ZoneOffset.UTC));
+            } else if (run == 10) {
+                // 10 runs only
+                streamingQuery.processAllAvailable();
+                streamingQuery.stop();
+                Assertions.assertDoesNotThrow(() -> streamingQuery.awaitTermination());
+                break;
             }

             rowMemoryStream.addData(
@@ -199,27 +199,31 @@ public void testCollectAsDataframe() throws StreamingQueryException, InterruptedException {
                 )
             );

+            counter++;
+
             // create 20 events for 10 runs
             if (counter == 20) {
                 run++;
                 counter = 0;
             }
-            counter++;
-
-            if (run == 10) {
-                // 10 runs only
-                // wait until the source feeds them all?
-                // TODO there must be a better way?
-                streamingQuery.processAllAvailable();
-                streamingQuery.stop();
-                streamingQuery.awaitTermination();
-            }
         }
-
+
         Dataset<Row> collectedAsDF = batchCollect.getCollectedAsDataframe();
-        collectedAsDF.show(5, true);
-        Assertions.assertTrue(collectedAsDF instanceof Dataset);
-        //Assertions.assertEquals(200, collectedAsDF.count());
+
+        // all the rows in the dataset, the limit of 5 rows is therefore not applied
+        Assertions.assertEquals(200, collectedAsDF.count());
+
+        // assert that batches are correct (all the rows, 10 batches)
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("0")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("1")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("2")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("3")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("4")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("5")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("6")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("7")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("8")).count());
+        Assertions.assertEquals(20, collectedAsDF.filter(functions.col("partition").equalTo("9")).count());
     }

     private Seq<Row> makeRows(Timestamp _time,
@@ -255,46 +259,18 @@ private Seq<Row> makeRows(Timestamp _time,
     }


-    private StreamingQuery startStream(Dataset<Row> rowDataset, BatchCollect batchCollect) {
+    private StreamingQuery startStream(Dataset<Row> rowDataset, BatchCollect batchCollect, boolean skipLimiting) {
         return rowDataset
                 .writeStream()
                 .foreachBatch(
                         new VoidFunction2<Dataset<Row>, Long>() {
                             @Override
-                            public void call(Dataset<Row> batchDF, Long batchId) throws Exception {
-                                batchCollect.collect(batchDF, batchId);
+                            public void call(Dataset<Row> batchDF, Long batchId) {
+                                batchCollect.call(batchDF, batchId, skipLimiting);
                             }
                         }
                 )
                 .outputMode("append")
                 .start();
     }
-
-    private void verifyRuns(BatchCollect batchCollect, LinkedList<Integer> runs) {
-        // test that 0-4 batches added data to 100 slots
-        List<Row> collectedList = batchCollect.getCollected();
-
-        TreeMap<Integer, Long> runToRow = new TreeMap<>();
-
-        int arraySize = collectedList.size();
-        while (arraySize != 0) {
-            Row row = collectedList.get(arraySize - 1);
-            int rowRun = Integer.parseInt(row.getString(6));
-
-            if(runToRow.containsKey(rowRun)) {
-                long value = runToRow.get(rowRun);
-                value++;
-                runToRow.put(rowRun, value);
-            }
-            else {
-                runToRow.put(rowRun, 1L);
-            }
-            arraySize--;
-
-        }
-
-        for(int run : runs) {
-            Assertions.assertEquals(20, runToRow.get(run), "batch "+ run +" contained other than 20 messages");
-        }
-    }
 }
