Skip to content

Commit

Permalink
Merge pull request #997 from sania-16/working
Browse files Browse the repository at this point in the history
ZIP
  • Loading branch information
sania-16 authored Jan 12, 2025
2 parents 0909688 + 6ead69b commit 9bd24bd
Show file tree
Hide file tree
Showing 7 changed files with 153 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ public void testExecutors() throws ZinggClientException, IOException {
executorTester.validateResults();
}
} catch (Throwable throwable) {
throw new ZinggClientException("Exception occurred while running one or more test executors, " + throwable.getMessage());
throwable.printStackTrace();
throw new ZinggClientException("Exception occurred while running one or more test executors, " + throwable.getMessage());
}

}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package zingg.common.core.preprocess;

import static org.junit.jupiter.api.Assertions.assertTrue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.jupiter.api.Test;

import zingg.common.client.Arguments;
import zingg.common.client.ArgumentsUtil;
import zingg.common.client.IArguments;
import zingg.common.client.ZFrame;
import zingg.common.client.ZinggClientException;
import zingg.common.client.util.DFObjectUtil;
import zingg.common.core.context.Context;
import zingg.common.core.data.EventTestData;
import zingg.common.core.model.PriorStopWordProcess;

/**
 * Engine-agnostic base test for the preprocessing flow.
 *
 * <p>Concrete subclasses supply the engine-specific {@link DFObjectUtil},
 * {@link Context} and {@link IPreprocessors} implementation; this class builds
 * an input frame and an expected frame from {@link EventTestData} and checks
 * that preprocessing the input yields the expected rows.
 *
 * @param <S> session type
 * @param <D> dataset type
 * @param <R> row type
 * @param <C> column type
 * @param <T> data type descriptor
 */
public abstract class TestPreprocessors<S,D,R,C,T> {

    public static final Log LOG = LogFactory.getLog(TestPreprocessors.class);

    /**
     * Classpath-absolute name of the test configuration. The file lives under
     * {@code src/test/resources/preProcess}, so it is addressed from the
     * classpath root rather than through a machine-specific filesystem path.
     */
    private static final String CONFIG_RESOURCE = "/preProcess/configTestPreprocess.json";

    protected ArgumentsUtil<Arguments> argsUtil = new ArgumentsUtil<Arguments>(Arguments.class);
    private final DFObjectUtil<S, D, R, C> dfObjectUtil;
    // NOTE(review): context is stored but not read anywhere in this class — confirm whether it is still needed.
    private final Context<S, D, R, C, T> context;

    /**
     * @param dfObjectUtil helper used to turn lists of test objects into frames
     * @param context      engine context supplied by the concrete subclass
     */
    public TestPreprocessors(DFObjectUtil<S, D, R, C> dfObjectUtil, Context<S, D, R, C, T> context) {
        this.dfObjectUtil = dfObjectUtil;
        this.context = context;
    }

    /**
     * Runs the preprocessors over the original test data and asserts that the
     * result and the expected frame have no rows that are missing from each other.
     *
     * @throws ZinggClientException if the test configuration cannot be located,
     *                              or if argument loading or preprocessing fails
     * @throws Exception            propagated from frame construction
     */
    @Test
    public void TestPreprocessorsFlow() throws ZinggClientException, Exception {
        // Class.getResource returns null for a name that is not on the classpath;
        // fail with a readable message instead of a NullPointerException.
        java.net.URL configUrl = TestPreprocessors.class.getResource(CONFIG_RESOURCE);
        if (configUrl == null) {
            throw new ZinggClientException("Test configuration not found on classpath: " + CONFIG_RESOURCE);
        }
        // NOTE(review): args is parsed but never handed to the preprocessors below — confirm whether it should be.
        IArguments args = argsUtil.createArgumentsFromJSON(configUrl.getFile(), "test");

        IPreprocessors<S,D,R,C,T> preprocessors = getPreprocessors();

        ZFrame<D,R,C> inputDF = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Original(), PriorStopWordProcess.class);
        ZFrame<D,R,C> expectedDF = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Expected(), PriorStopWordProcess.class);

        ZFrame<D,R,C> resultDF = preprocessors.preprocess(inputDF);

        // Compare in both directions so that neither extra nor missing rows go unnoticed.
        assertTrue(resultDF.except(expectedDF).isEmpty());
        assertTrue(expectedDF.except(resultDF).isEmpty());
    }

    /**
     * @return the engine-specific preprocessors under test
     */
    public abstract IPreprocessors<S,D,R,C,T> getPreprocessors();

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
"fieldDefinition":[
{
"fieldName" : "z_zid",
"matchType" : "dont_use",
"fields" : "z_zid",
"dataType": "string"
},
{
"fieldName" : "field1",
"matchType" : "fuzzy",
"fields" : "field1",
"dataType": "string",
"stopWords":"common/core/src/test/resources/preProcess/stopWords.csv"
},
{
"fieldName" : "field2",
"matchType": "exact",
"fields" : "field2",
"dataType": "string"
},
{
"fieldName" : "field3",
"matchType": "fuzzy,dont_use",
"fields" : "field3",
"dataType": "string"
},
{
"fieldName" : "z_zsource",
"matchType": "dont_use",
"fields" : "z_zsource",
"dataType": "string"
}
],
"output" : [{
"name":"output",
"format":"csv",
"props": {
"location": "/tmp/zinggOutput",
"delimiter": ",",
"header":true
}
}],
"data" : [{
"name":"test",
"format":"csv",
"props": {
"location": "common/core/src/test/resources/preProcess/testPreprocess.csv",
"delimiter": ",",
"header":false
},
"schema": "z_zid string, field1 string, field2 string, field3 string, z_zsource string"
}],
"labelDataSampleSize" : 0.5,
"numPartitions":4,
"modelId": 100,
"zinggDir": "models"

}
4 changes: 4 additions & 0 deletions common/core/src/test/resources/preProcess/testPreprocess.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
10, The zingg is a spark application, two, Yes. a good application, test
20, It is very popular in Data Science, Three, true indeed, test
30, It is written in java and scala, four, , test
40, Best of luck to zingg Mobile/T-Mobile, Five, thank you, test
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package zingg.spark.core.preprocess;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataType;

import org.junit.jupiter.api.extension.ExtendWith;
import zingg.spark.core.TestSparkBase;
import zingg.common.client.ZinggClientException;
import zingg.common.client.util.IWithSession;
import zingg.common.client.util.WithSession;
import zingg.common.core.preprocess.IPreprocessors;
import zingg.common.core.preprocess.TestPreprocessors;
import zingg.spark.client.util.SparkDFObjectUtil;
import zingg.spark.core.context.ZinggSparkContext;
import zingg.spark.core.executor.SparkTrainingDataFinder;

/**
 * Spark binding of {@link TestPreprocessors}.
 *
 * <p>Fixes the generic parameters to Spark's session, dataset, row, column and
 * data-type classes and wires the shared session holder and Zingg Spark
 * context into the base test.
 */
@ExtendWith(TestSparkBase.class)
public class TestSparkPreprocessors
        extends TestPreprocessors<SparkSession, Dataset<Row>, Row, Column, DataType> {

    /** Session holder handed to the DF object util; populated in the constructor. */
    public static IWithSession<SparkSession> iWithSession = new WithSession<>();

    /** Zingg Spark context shared with the base class and the preprocessors under test. */
    public static ZinggSparkContext zsCTX = new ZinggSparkContext();

    /**
     * Passes the shared helpers to the base class, then binds them to the given session.
     *
     * @param sparkSession the session to test against
     *                     (presumably injected by {@code TestSparkBase} — verify)
     * @throws ZinggClientException declared on this constructor; presumably raised by context initialisation — verify
     */
    public TestSparkPreprocessors(SparkSession sparkSession) throws ZinggClientException {
        super(new SparkDFObjectUtil(iWithSession), zsCTX);
        // The helpers are created before the session is known, so attach it now.
        iWithSession.setSession(sparkSession);
        zsCTX.init(sparkSession);
    }

    /**
     * @return a new {@link SparkTrainingDataFinder} built on the shared context
     */
    @Override
    public IPreprocessors<SparkSession, Dataset<Row>, Row, Column, DataType> getPreprocessors() {
        return new SparkTrainingDataFinder(zsCTX);
    }

}
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package zingg.spark.core.util;

import java.lang.reflect.Field;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
"fieldName" : "add1",
"matchType": "fuzzy",
"fields" : "add1",
"dataType": "string"
"dataType": "string",
"stopWords":"spark/core/src/test/resources/zingg/spark/core/executor/stopwords/add1.csv"
},
{
"fieldName" : "add2",
Expand Down

0 comments on commit 9bd24bd

Please sign in to comment.