Skip to content

Commit 8ee3474

Browse files
authored
Adds wildcard support for fields command (#736)
* Added new class FilteredColumns.java to check for wildcards in the given fields; added wildcard tests for the fields command
* Added the right LOGGER and removed multiple return statements
* Tested object equality with EqualsVerifier
* Reversed the if statement and added an extra check for missing fields
* Added a check for an empty fieldType list
1 parent a603b7b commit 8ee3474

6 files changed

Lines changed: 408 additions & 79 deletions

File tree

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/*
2+
* Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10)
3+
* Copyright (C) 2019-2025 Suomen Kanuuna Oy
4+
*
5+
* This program is free software: you can redistribute it and/or modify
6+
* it under the terms of the GNU Affero General Public License as published by
7+
* the Free Software Foundation, either version 3 of the License, or
8+
* (at your option) any later version.
9+
*
10+
* This program is distributed in the hope that it will be useful,
11+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
* GNU Affero General Public License for more details.
14+
*
15+
* You should have received a copy of the GNU Affero General Public License
16+
* along with this program. If not, see <https://www.gnu.org/licenses/>.
17+
*
18+
*
19+
* Additional permission under GNU Affero General Public License version 3
20+
* section 7
21+
*
22+
* If you modify this Program, or any covered work, by linking or combining it
23+
* with other code, such other code is not for that reason alone subject to any
24+
* of the requirements of the GNU Affero GPL version 3 as long as this Program
25+
* is the same Program as licensed from Suomen Kanuuna Oy without any additional
26+
* modifications.
27+
*
28+
* Supplemented terms under GNU Affero General Public License version 3
29+
* section 7
30+
*
31+
* Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
32+
* versions must be marked as "Modified version of" The Program.
33+
*
34+
* Names of the licensors and authors may not be used for publicity purposes.
35+
*
36+
* No rights are granted for use of trade names, trademarks, or service marks
37+
* which are in The Program if any.
38+
*
39+
* Licensee must indemnify licensors and authors for any liability that these
40+
* contractual assumptions impose on licensors and authors.
41+
*
42+
* To the extent this program is licensed as part of the Commercial versions of
43+
* Teragrep, the applicable Commercial License may apply to this file if you as
44+
* a licensee so wish it.
45+
*/
46+
package com.teragrep.pth_10.ast;
47+
48+
import org.slf4j.Logger;
49+
import org.slf4j.LoggerFactory;
50+
51+
import java.util.ArrayList;
52+
import java.util.Arrays;
53+
import java.util.List;
54+
import java.util.Objects;
55+
import java.util.regex.Matcher;
56+
import java.util.regex.Pattern;
57+
58+
public final class FilteredColumns {
59+
60+
private static final Logger LOGGER = LoggerFactory.getLogger(FilteredColumns.class);
61+
private final String[] columns;
62+
private final String wc;
63+
64+
/**
65+
* Checks for wildcards from given array of column names
66+
*
67+
* @param wc wildcard statement
68+
* @param columns array of column names
69+
*/
70+
71+
public FilteredColumns(final String wc, final String[] columns) {
72+
this.columns = columns;
73+
this.wc = wc;
74+
}
75+
76+
/**
77+
* @return list of column names that match the wildcard statement
78+
*/
79+
80+
public List<String> filtered() {
81+
final StringBuilder quotablePartBuilder = new StringBuilder();
82+
final StringBuilder regexBuilder = new StringBuilder();
83+
final String regex;
84+
85+
for (final char c : wc.toCharArray()) {
86+
if (c == '*') {
87+
// On wildcard, get preceding content and quote it
88+
// Also clear quotablePartBuilder and add regex any char wildcard
89+
if (quotablePartBuilder.length() > 0) {
90+
regexBuilder.append(Pattern.quote(quotablePartBuilder.toString()));
91+
quotablePartBuilder.setLength(0);
92+
}
93+
regexBuilder.append(".*");
94+
}
95+
else {
96+
// On normal characters, add to quotablePartBuilder
97+
quotablePartBuilder.append(c);
98+
}
99+
}
100+
101+
if (quotablePartBuilder.length() > 0) {
102+
// if quotablePartBuilder is not empty, quote and add it
103+
regex = regexBuilder + Pattern.quote(quotablePartBuilder.toString());
104+
}
105+
else {
106+
// if it is empty, the regexBuilder contains the final regex
107+
regex = regexBuilder.toString();
108+
}
109+
110+
final Pattern p = Pattern.compile(regex);
111+
Matcher m;
112+
final List<String> matchedFields = new ArrayList<>();
113+
114+
for (final String column : columns) {
115+
m = p.matcher(column);
116+
if (m.matches()) {
117+
LOGGER.debug("Field <[{}]> matches the wildcard rule: <[{}]>", column, wc);
118+
matchedFields.add(column);
119+
}
120+
}
121+
122+
return matchedFields;
123+
}
124+
125+
@Override
126+
public boolean equals(Object o) {
127+
final boolean isEquals;
128+
if (this == o)
129+
isEquals = true;
130+
else if (o == null || getClass() != o.getClass())
131+
isEquals = false;
132+
else {
133+
final FilteredColumns filteredColumns = (FilteredColumns) o;
134+
isEquals = Objects.equals(wc, filteredColumns.wc) && Arrays.equals(columns, filteredColumns.columns);
135+
}
136+
return isEquals;
137+
}
138+
139+
@Override
140+
public int hashCode() {
141+
return Objects.hash(wc, Arrays.hashCode(columns));
142+
}
143+
}

src/main/java/com/teragrep/pth_10/ast/commands/transformstatement/FieldsTransformation.java

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -78,20 +78,29 @@ public Node visitFieldsTransformation(DPLParser.FieldsTransformationContext ctx)
7878

7979
public Node fieldsTransformationEmitCatalyst(DPLParser.FieldsTransformationContext ctx) {
8080
this.fieldsStep = new FieldsStep();
81-
82-
String oper = ctx.getChild(1).getText();
83-
84-
if ("-".equals(oper)) {
85-
StringListNode sln = (StringListNode) visit(ctx.fieldListType());
86-
LOGGER.debug("Drop fields: stringListNode=<{}>", sln);
87-
88-
this.fieldsStep.setMode(AbstractFieldsStep.FieldMode.REMOVE_FIELDS);
89-
this.fieldsStep.setListOfFields(sln.asList());
81+
// if fields command has no arguments
82+
if (
83+
ctx.fieldListType() == null
84+
|| ctx.fieldListType().fieldType() == null || ctx.fieldListType().fieldType().isEmpty()
85+
|| ctx.fieldListType().fieldType().get(0).getText().isEmpty()
86+
) {
87+
throw new IllegalStateException(
88+
"fields command is missing field names, it requires at least one valid field name."
89+
);
9090
}
9191
else {
92-
StringListNode sln = (StringListNode) visit(ctx.fieldListType());
93-
this.fieldsStep.setMode(AbstractFieldsStep.FieldMode.KEEP_FIELDS);
94-
this.fieldsStep.setListOfFields(sln.asList());
92+
if (ctx.COMMAND_FIELDS_MODE_MINUS() != null) {
93+
final StringListNode sln = (StringListNode) visit(ctx.fieldListType());
94+
LOGGER.debug("Drop fields: stringListNode=<{}>", sln);
95+
96+
this.fieldsStep.setMode(AbstractFieldsStep.FieldMode.REMOVE_FIELDS);
97+
this.fieldsStep.setListOfFields(sln.asList());
98+
}
99+
else {
100+
final StringListNode sln = (StringListNode) visit(ctx.fieldListType());
101+
this.fieldsStep.setMode(AbstractFieldsStep.FieldMode.KEEP_FIELDS);
102+
this.fieldsStep.setListOfFields(sln.asList());
103+
}
95104
}
96105
return new StepNode(fieldsStep);
97106
}
@@ -113,7 +122,7 @@ public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) {
113122
}
114123

115124
public Node visitFieldType(DPLParser.FieldTypeContext ctx) {
116-
String sql = ctx.getChild(0).getText();
125+
final String sql = ctx.getChild(0).getText();
117126
return new StringNode(new Token(Type.STRING, sql));
118127
}
119128
}

src/main/java/com/teragrep/pth_10/steps/fields/FieldsStep.java

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,20 @@
4545
*/
4646
package com.teragrep.pth_10.steps.fields;
4747

48+
import com.teragrep.pth_10.ast.FilteredColumns;
4849
import org.apache.spark.sql.Dataset;
4950
import org.apache.spark.sql.Row;
51+
import org.slf4j.Logger;
52+
import org.slf4j.LoggerFactory;
5053
import scala.collection.JavaConversions;
5154

55+
import java.util.ArrayList;
56+
import java.util.List;
57+
5258
public final class FieldsStep extends AbstractFieldsStep {
5359

60+
private static final Logger LOGGER = LoggerFactory.getLogger(FieldsStep.class);
61+
5462
public FieldsStep() {
5563
super();
5664
}
@@ -61,14 +69,31 @@ public Dataset<Row> get(Dataset<Row> dataset) {
6169
return null;
6270
}
6371

64-
if (this.mode.equals(FieldMode.KEEP_FIELDS)) {
65-
return dataset.selectExpr(JavaConversions.asScalaBuffer(this.listOfFields));
72+
// check for wildcards in columns
73+
final List<String> wildcardColumns = new ArrayList<>();
74+
for (String field : this.listOfFields) {
75+
LOGGER.debug("Checking column <[{}]> for wildcards", field);
76+
final FilteredColumns filteredFields = new FilteredColumns(field, dataset.columns());
77+
wildcardColumns.addAll(filteredFields.filtered());
6678
}
67-
else if (this.mode.equals(FieldMode.REMOVE_FIELDS)) {
68-
return dataset.drop(JavaConversions.asScalaBuffer(this.listOfFields));
79+
80+
Dataset<Row> finalDataset;
81+
if (!wildcardColumns.isEmpty()) {
82+
if (this.mode.equals(FieldMode.KEEP_FIELDS)) {
83+
finalDataset = dataset.selectExpr(JavaConversions.asScalaBuffer(wildcardColumns));
84+
}
85+
else if (this.mode.equals(FieldMode.REMOVE_FIELDS)) {
86+
finalDataset = dataset.drop(JavaConversions.asScalaBuffer(wildcardColumns));
87+
}
88+
else {
89+
throw new UnsupportedOperationException("Invalid FieldMode: " + this.mode);
90+
}
6991
}
7092
else {
71-
throw new UnsupportedOperationException("Invalid FieldMode: " + this.mode);
93+
throw new IllegalStateException(
94+
"fields command is missing field names, it requires at least one valid field name."
95+
);
7296
}
97+
return finalDataset;
7398
}
7499
}

src/main/java/com/teragrep/pth_10/steps/table/TableStep.java

Lines changed: 3 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
*/
4646
package com.teragrep.pth_10.steps.table;
4747

48+
import com.teragrep.pth_10.ast.FilteredColumns;
4849
import org.apache.spark.sql.Column;
4950
import org.apache.spark.sql.Dataset;
5051
import org.apache.spark.sql.Row;
@@ -56,8 +57,6 @@
5657

5758
import java.util.ArrayList;
5859
import java.util.List;
59-
import java.util.regex.Matcher;
60-
import java.util.regex.Pattern;
6160
import java.util.stream.Collectors;
6261

6362
public final class TableStep extends AbstractTableStep {
@@ -78,7 +77,8 @@ public Dataset<Row> get(Dataset<Row> dataset) {
7877
List<String> wildcardedFields = new ArrayList<>();
7978
for (String field : this.listOfFields) {
8079
LOGGER.debug("Check field <[{}]> for wildcards", field);
81-
wildcardedFields.addAll(getWildcardFields(field, dataset.columns()));
80+
final FilteredColumns filteredFields = new FilteredColumns(field, dataset.columns());
81+
wildcardedFields.addAll(filteredFields.filtered());
8282
}
8383

8484
Dataset<Row> dsWithDroppedCols = dataset;
@@ -102,58 +102,5 @@ public Dataset<Row> get(Dataset<Row> dataset) {
102102
else {
103103
return dsWithDroppedCols;
104104
}
105-
106-
}
107-
108-
/**
109-
* Gets wildcarded fields from given array of column names
110-
*
111-
* @param wc wildcard statement
112-
* @param cols array of column names
113-
* @return list of column names which match the wildcard statement
114-
*/
115-
private List<String> getWildcardFields(String wc, String[] cols) {
116-
StringBuilder quotablePartBuilder = new StringBuilder();
117-
StringBuilder regexBuilder = new StringBuilder();
118-
final String regex;
119-
120-
for (char c : wc.toCharArray()) {
121-
if (c == '*') {
122-
// On wildcard, get preceding content and quote it
123-
// Also clear quotablePartBuilder and add regex any char wildcard
124-
if (quotablePartBuilder.length() > 0) {
125-
regexBuilder.append(Pattern.quote(quotablePartBuilder.toString()));
126-
quotablePartBuilder.setLength(0);
127-
}
128-
regexBuilder.append(".*");
129-
}
130-
else {
131-
// On normal characters, add to quotablePartBuilder
132-
quotablePartBuilder.append(c);
133-
}
134-
}
135-
136-
if (quotablePartBuilder.length() > 0) {
137-
// if quotablePartBuilder is not empty, quote and add it
138-
regex = Pattern.quote(quotablePartBuilder.toString());
139-
}
140-
else {
141-
// if it is empty, the regexBuilder contains the final regex
142-
regex = regexBuilder.toString();
143-
}
144-
145-
Pattern p = Pattern.compile(regex);
146-
Matcher m;
147-
List<String> matchedFields = new ArrayList<>();
148-
149-
for (String column : cols) {
150-
m = p.matcher(column);
151-
if (m.matches()) {
152-
LOGGER.debug("Field <[{}]> matches the wildcard rule: <[{}]>", column, wc);
153-
matchedFields.add(column);
154-
}
155-
}
156-
157-
return matchedFields;
158105
}
159106
}

0 commit comments

Comments
 (0)