-
Notifications
You must be signed in to change notification settings - Fork 123
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #997 from sania-16/working
ZIP
- Loading branch information
Showing 7 changed files with 153 additions and 4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
49 changes: 49 additions & 0 deletions
49
common/core/src/test/java/zingg/common/core/preprocess/TestPreprocessors.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package zingg.common.core.preprocess; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertTrue; | ||
|
||
import org.apache.commons.logging.Log; | ||
import org.apache.commons.logging.LogFactory; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import zingg.common.client.Arguments; | ||
import zingg.common.client.ArgumentsUtil; | ||
import zingg.common.client.IArguments; | ||
import zingg.common.client.ZFrame; | ||
import zingg.common.client.ZinggClientException; | ||
import zingg.common.client.util.DFObjectUtil; | ||
import zingg.common.core.context.Context; | ||
import zingg.common.core.data.EventTestData; | ||
import zingg.common.core.model.PriorStopWordProcess; | ||
|
||
public abstract class TestPreprocessors<S,D,R,C,T> { | ||
|
||
public static final Log LOG = LogFactory.getLog(TestPreprocessors.class); | ||
protected ArgumentsUtil<Arguments> argsUtil = new ArgumentsUtil<Arguments>(Arguments.class); | ||
private final DFObjectUtil<S, D, R, C> dfObjectUtil; | ||
private final Context<S, D, R, C, T> context; | ||
|
||
public TestPreprocessors(DFObjectUtil<S, D, R, C> dfObjectUtil, Context<S, D, R, C, T> context) { | ||
this.dfObjectUtil = dfObjectUtil; | ||
this.context = context; | ||
} | ||
|
||
@Test | ||
public void TestPreprocessorsFlow() throws ZinggClientException, Exception{ | ||
IArguments args = argsUtil.createArgumentsFromJSON(TestPreprocessors.class.getResource("/Users/sania/zingg/common/core/src/test/resources/preProcess/configTestPreprocess.json").getFile(), "test"); | ||
|
||
IPreprocessors<S,D,R,C,T> preprocessors = getPreprocessors(); | ||
|
||
ZFrame<D,R,C> inputDF = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Original(), PriorStopWordProcess.class); | ||
ZFrame<D,R,C> expectedDF = dfObjectUtil.getDFFromObjectList(EventTestData.getData2Expected(), PriorStopWordProcess.class); | ||
|
||
ZFrame<D,R,C> resultDF = preprocessors.preprocess(inputDF); | ||
|
||
assertTrue(resultDF.except(expectedDF).isEmpty()); | ||
assertTrue(expectedDF.except(resultDF).isEmpty()); | ||
|
||
} | ||
|
||
public abstract IPreprocessors<S,D,R,C,T> getPreprocessors(); | ||
|
||
} |
59 changes: 59 additions & 0 deletions
59
common/core/src/test/resources/preProcess/configTestPreprocess.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
{ | ||
"fieldDefinition":[ | ||
{ | ||
"fieldName" : "z_zid", | ||
"matchType" : "dont_use", | ||
"fields" : "z_zid", | ||
"dataType": "string" | ||
}, | ||
{ | ||
"fieldName" : "field1", | ||
"matchType" : "fuzzy", | ||
"fields" : "field1", | ||
"dataType": "string", | ||
"stopwords":"common/core/src/test/resources/preProcess/stopWords.csv" | ||
}, | ||
{ | ||
"fieldName" : "field2", | ||
"matchType": "exact", | ||
"fields" : "field2", | ||
"dataType": "string" | ||
}, | ||
{ | ||
"fieldName" : "field3", | ||
"matchType": "fuzzy,dont_use", | ||
"fields" : "field3", | ||
"dataType": "string" | ||
}, | ||
{ | ||
"fieldName" : "z_zsource", | ||
"matchType": "dont_use", | ||
"fields" : "z_zsource", | ||
"dataType": "string" | ||
} | ||
], | ||
"output" : [{ | ||
"name":"output", | ||
"format":"csv", | ||
"props": { | ||
"location": "/tmp/zinggOutput", | ||
"delimiter": ",", | ||
"header":true | ||
} | ||
}], | ||
"data" : [{ | ||
"name":"test", | ||
"format":"csv", | ||
"props": { | ||
"location": "common/core/src/test/resources/preProcess/testPreprocess.csv", | ||
"delimiter": ",", | ||
"header":false | ||
}, | ||
"schema": "z_zid string, field1 string, field2 string, field3 string, z_zsource string" | ||
}], | ||
"labelDataSampleSize" : 0.5, | ||
"numPartitions":4, | ||
"modelId": 100, | ||
"zinggDir": "models" | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
10, The zingg is a spark application, two, Yes. a good application, test | ||
20, It is very popular in Data Science, Three, true indeed, test | ||
30, It is written in java and scala, four, , test | ||
40, Best of luck to zingg Mobile/T-Mobile, Five, thank you, test |
37 changes: 37 additions & 0 deletions
37
spark/core/src/test/java/zingg/spark/core/preprocess/TestSparkPreprocessors.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package zingg.spark.core.preprocess; | ||
|
||
import org.apache.spark.sql.Column; | ||
import org.apache.spark.sql.Dataset; | ||
import org.apache.spark.sql.Row; | ||
import org.apache.spark.sql.SparkSession; | ||
import org.apache.spark.sql.types.DataType; | ||
|
||
import org.junit.jupiter.api.extension.ExtendWith; | ||
import zingg.spark.core.TestSparkBase; | ||
import zingg.common.client.ZinggClientException; | ||
import zingg.common.client.util.IWithSession; | ||
import zingg.common.client.util.WithSession; | ||
import zingg.common.core.preprocess.IPreprocessors; | ||
import zingg.common.core.preprocess.TestPreprocessors; | ||
import zingg.spark.client.util.SparkDFObjectUtil; | ||
import zingg.spark.core.context.ZinggSparkContext; | ||
import zingg.spark.core.executor.SparkTrainingDataFinder; | ||
|
||
@ExtendWith(TestSparkBase.class) | ||
public class TestSparkPreprocessors extends TestPreprocessors<SparkSession, Dataset<Row>, Row, Column, DataType> { | ||
|
||
public static IWithSession<SparkSession> iWithSession = new WithSession<SparkSession>(); | ||
public static ZinggSparkContext zsCTX = new ZinggSparkContext(); | ||
|
||
public TestSparkPreprocessors(SparkSession sparkSession) throws ZinggClientException{ | ||
super(new SparkDFObjectUtil(iWithSession), zsCTX); | ||
iWithSession.setSession(sparkSession); | ||
zsCTX.init(sparkSession); | ||
} | ||
|
||
@Override | ||
public IPreprocessors<SparkSession, Dataset<Row>, Row, Column, DataType> getPreprocessors() { | ||
return new SparkTrainingDataFinder(zsCTX); | ||
} | ||
|
||
} |
2 changes: 0 additions & 2 deletions
2
spark/core/src/test/java/zingg/spark/core/util/SparkStopWordRemoverUtility.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters