Commit 7089c3b

use dropDuplicates for now in dedup as it is a POST_BATCHCOLLECT step (#643)
* use dropDuplicates for now in dedup as it is a POST_BATCHCOLLECT step
* apply spotless
* disable some more tests as dropDuplicates implementation does not implement these
* streaming dropDuplicates
1 parent d612f67 commit 7089c3b

3 files changed: 5 additions & 157 deletions
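
In short: the hand-rolled stateful dedup is replaced with Spark's built-in dropDuplicates. In streaming mode dropDuplicates requires a watermark on the event-time column so Spark can eventually evict per-key dedup state, which is what the new one-liner in DedupStep sets up. A minimal standalone sketch of the technique the commit switches to (the class name and method scaffolding here are illustrative, not pth10 API; the _time column and 1-hour watermark are taken from the diff below):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public final class StreamingDedupSketch {

    // Keep only the first row seen for each combination of the given fields.
    // The watermark tells Spark that events more than 1 hour older than the
    // newest observed _time will not arrive, so per-key dedup state can be
    // discarded instead of growing without bound.
    public static Dataset<Row> dedup(Dataset<Row> events, String[] fields) {
        return events.withWatermark("_time", "1 hour").dropDuplicates(fields);
    }
}

Because dropDuplicates runs inside the streaming query itself, the step presumably no longer needs to be deferred to after batch collection, which would explain why the POST_BATCHCOLLECT property is commented out in the constructor below.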


src/main/java/com/teragrep/pth10/steps/dedup/DedupState.java

Lines changed: 0 additions & 68 deletions. This file was deleted.

src/main/java/com/teragrep/pth10/steps/dedup/DedupStep.java

Lines changed: 2 additions & 89 deletions
@@ -47,14 +47,7 @@
 
 import com.teragrep.functions.dpf_02.AbstractStep;
 import com.teragrep.pth10.ast.NullValue;
-import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.catalyst.encoders.RowEncoder;
-import org.apache.spark.sql.streaming.GroupState;
-import org.apache.spark.sql.streaming.GroupStateTimeout;
-import org.apache.spark.sql.streaming.OutputMode;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.StructField;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -81,7 +74,7 @@ public DedupStep(
             NullValue nullValue,
             boolean completeOutputMode
     ) {
-        this.properties.add(AbstractStep.CommandProperty.POST_BATCHCOLLECT);
+        //this.properties.add(AbstractStep.CommandProperty.POST_BATCHCOLLECT);
 
         this.listOfFields = listOfFields;
         this.maxDuplicates = maxDuplicates;
@@ -94,87 +87,7 @@ public DedupStep(
 
     @Override
    public Dataset<Row> get(Dataset<Row> dataset) {
-
-        final List<String> dedupHashFields = new ArrayList<>();
-        for (final String field : listOfFields) {
-            final String dedupHashField = "dedupHash-" + field;
-            dataset = dataset
-                    .withColumn(dedupHashField, functions.sha2(functions.col(field).cast(DataTypes.BinaryType), 256));
-            dedupHashFields.add(dedupHashField);
-        }
-
-        KeyValueGroupedDataset<String, Row> groupedDs = dataset.groupByKey((MapFunction<Row, String>) (r) -> {
-            final StringBuilder groupId = new StringBuilder();
-            for (final String hashField : dedupHashFields) {
-                groupId.append(r.getString(r.fieldIndex(hashField)));
-            }
-
-            return groupId.toString();
-        }, Encoders.STRING());
-
-        Dataset<Row> rv = groupedDs
-                .flatMapGroupsWithState(
-                        this::flatMapGroupsWithStateFunc, OutputMode
-                                .Append(),
-                        Encoders.javaSerialization(DedupState.class), RowEncoder.apply(dataset.schema()), GroupStateTimeout.NoTimeout()
-                );
-
-        return rv.drop(dedupHashFields.toArray(new String[0]));
-    }
-
-    private Iterator<Row> flatMapGroupsWithStateFunc(
-            final String group,
-            final Iterator<Row> events,
-            final GroupState<DedupState> state
-    ) {
-        final DedupState ds;
-        if (state.exists()) {
-            ds = state.get();
-        }
-        else {
-            ds = new DedupState();
-        }
-
-        List<Row> rv = new ArrayList<>();
-        events.forEachRemaining(event -> {
-            ds.accumulate(group);
-
-            boolean dropFullRow = false;
-
-            if (!keepEmpty) {
-                for (int i = 0; i < event.length(); i++) {
-                    final StructField field = event.schema().fields()[i];
-                    if (listOfFields.contains(field.name())) {
-                        final Object fieldValue = event.get(i);
-                        if (fieldValue == nullValue.value()) {
-                            // drop row, one of the fields is null
-                            dropFullRow = true;
-                            break;
-                        }
-                    }
-                }
-            }
-
-            if (!dropFullRow && ds.amountOf(group) <= maxDuplicates) {
-                rv.add(event);
-            }
-            else if (!dropFullRow && keepEvents) {
-                Object[] newRow = new Object[event.length()];
-                for (int i = 0; i < event.length(); i++) {
-                    final StructField field = event.schema().fields()[i];
-                    if (listOfFields.contains(field.name())) {
-                        newRow[i] = nullValue.value();
-                    }
-                    else {
-                        newRow[i] = event.get(i);
-                    }
-                }
-                rv.add(RowFactory.create(newRow));
-            }
-        });
-
-        state.update(ds);
-        return rv.iterator();
+        return dataset.withWatermark("_time", "1 hour").dropDuplicates(listOfFields.toArray(new String[0]));
     }
 
     public List<String> getListOfFields() {
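
What the deleted path could do that dropDuplicates cannot: the flatMapGroupsWithState implementation counted occurrences per group (DedupState.accumulate / amountOf), so it could honor maxDuplicates > 1 as well as the keepevents and keepempty options, whereas dropDuplicates always keeps exactly the first row per key. That gap is why the tests below get disabled. For batch data the counting behavior can be approximated with a window function; a hedged sketch under assumed names (keepUpToN and the dupeRank helper column are invented for illustration, and note that row_number windows are not supported on streaming Datasets, which is presumably why the old code used explicit state):

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.row_number;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;

public final class MaxDuplicatesSketch {

    // Keep at most maxDuplicates rows per distinct value of the given field,
    // ordered by event time; maxDuplicates = 1 matches dropDuplicates(field).
    public static Dataset<Row> keepUpToN(Dataset<Row> ds, String field, int maxDuplicates) {
        WindowSpec perValue = Window.partitionBy(col(field)).orderBy(col("_time"));
        return ds
                .withColumn("dupeRank", row_number().over(perValue)) // 1, 2, ... within each group
                .filter(col("dupeRank").leq(maxDuplicates))          // keep the first N occurrences
                .drop("dupeRank");                                   // remove the helper column
    }
}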

src/test/java/com/teragrep/pth10/DedupTransformationTest.java

Lines changed: 3 additions & 0 deletions
@@ -163,6 +163,7 @@ public void dedupTest_Consecutive() {
             named = "skipSparkTest",
             matches = "true"
     ) // sort descending as numbers
+    @Disabled(value = "Test disabled: sortby parameter is not implemented")
     public void dedupTest_SortNum() {
         String query = "index=index_A | dedup _raw sortby - num(_raw)";
         this.streamingTestUtil.performDPLTest(query, this.testFile, res -> {
@@ -192,6 +193,7 @@ public void dedupTest_SortNum() {
             named = "skipSparkTest",
             matches = "true"
     ) // keep duplicate events with nulls
+    @Disabled(value = "Test disabled: keepevents parameter is not implemented")
     public void dedupTest_KeepEvents() {
         String query = "index=index_A | dedup _raw keepevents= true";
         this.streamingTestUtil.performDPLTest(query, this.testFile, res -> {
@@ -228,6 +230,7 @@ public void dedupTest_KeepEvents() {
             named = "skipSparkTest",
             matches = "true"
     ) // keep null values
+    @Disabled(value = "Test disabled: keepevents parameter is not implemented")
     public void dedupTest_KeepEmpty() {
         // first use keepevents=true to make null values in the dataset
         String query = "index=index_A | dedup _raw keepevents= true | dedup _raw keepempty= true";
