diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index a52ee2cd7..000000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 3a1089ff1..9a1fb7cfb 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -30,6 +30,6 @@ sphinx: # Optional but recommended, declare the Python requirements required # to build your documentation # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html -# python: -# install: -# - requirements: docs/requirements.txt +python: + install: + - requirements: python/requirements.txt diff --git a/docs/README.md b/docs/README.md index b610e9f13..42c734a6e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,13 +2,12 @@ description: Hope you find us useful :-) --- -# Welcome to Zingg +# Welcome To Zingg ![](https://static.scarf.sh/a.png?x-pxid=d6dda06e-06c7-4e4a-99c9-ed9f6364dfeb) This is the latest documentation for Zingg. Release wise documentation can be accessed through: -* [v0.4.1 ](https://docs.zingg.ai/zingg0.4.1/) * [v0.4.0 ](https://docs.zingg.ai/zingg0.4.0/) * [v0.3.4 ](https://docs.zingg.ai/zingg0.3.4/) * [v0.3.3](https://docs.zingg.ai/zingg0.3.3/) @@ -25,4 +24,4 @@ Zingg is a quick and scalable way to build a single source of truth for core bus ## Book Office Hours -If you want to schedule a 30-min call with our team to help you get set up, please select some time directly [here](https://calendly.com/sonalgoyal/30min). +If you want to schedule a 30-min call with our team to help you get set up, please select a slot directly [here](https://calendly.com/sonalgoyal/30min). diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 234cf5596..6608c8212 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,13 +1,13 @@ # Table of contents -* [Welcome to Zingg](README.md) +* [Welcome To Zingg](README.md) * [Step-By-Step Guide](stepByStep.md) * [Installation](setup/installation.md) * [Docker](stepbystep/installation/docker/README.md) - * [Sharing custom data and config files](stepbystep/installation/docker/sharing-custom-data-and-config-files.md) - * [Shared locations](stepbystep/installation/docker/shared-locations.md) - * [File read/write permissions](stepbystep/installation/docker/file-read-write-permissions.md) - * [Copying Files To and From the Container](stepbystep/installation/docker/copying-files-to-and-from-the-container.md) + * [Sharing Custom Data And Config Files](stepbystep/installation/docker/sharing-custom-data-and-config-files.md) + * [Shared Locations](stepbystep/installation/docker/shared-locations.md) + * [File Read/Write Permissions](stepbystep/installation/docker/file-read-write-permissions.md) + * [Copying Files To And From The Container](stepbystep/installation/docker/copying-files-to-and-from-the-container.md) * [Installing From Release](stepbystep/installation/installing-from-release/README.md) * [Single Machine Setup](stepbystep/installation/installing-from-release/single-machine-setup.md) * [Spark Cluster Checklist](stepbystep/installation/installing-from-release/spark-cluster-checklist.md) @@ -19,10 +19,11 @@ * [Zingg Command Line](stepbystep/zingg-command-line.md) * [Configuration](stepbystep/configuration/README.md) * [Configuring Through Environment Variables](stepbystep/configuration/configuring-through-environment-variables.md) - * [Data Input and Output](stepbystep/configuration/data-input-and-output/README.md) + * [Data Input And Output](stepbystep/configuration/data-input-and-output/README.md) * [Input 
Data](stepbystep/configuration/data-input-and-output/data.md) * [Output](stepbystep/configuration/data-input-and-output/output.md) * [Field Definitions](stepbystep/configuration/field-definitions.md) + * [Deterministic Matching](deterministicMatching.md) * [Model Location](stepbystep/configuration/model-location.md) * [Tuning Label, Match And Link Jobs](stepbystep/configuration/tuning-label-match-and-link-jobs.md) * [Telemetry](stepbystep/configuration/telemetry.md) @@ -30,12 +31,13 @@ * [Finding Records For Training Set Creation](setup/training/findTrainingData.md) * [Labeling Records](setup/training/label.md) * [Find And Label](setup/training/findAndLabel.md) - * [Using pre-existing training data](setup/training/addOwnTrainingData.md) + * [Using Pre-existing Training Data](setup/training/addOwnTrainingData.md) * [Updating Labeled Pairs](updatingLabels.md) * [Exporting Labeled Data](setup/training/exportLabeledData.md) - * [Building and saving the model](setup/train.md) - * [Finding the matches](setup/match.md) - * [Linking across datasets](setup/link.md) + * [Building And Saving The Model](setup/train.md) + * [Finding The Matches](setup/match.md) + * [Adding Incremental Data](runIncremental.md) + * [Linking Across Datasets](setup/link.md) * [Data Sources and Sinks](dataSourcesAndSinks/connectors.md) * [Zingg Pipes](dataSourcesAndSinks/pipes.md) * [Databricks](dataSourcesAndSinks/databricks.md) @@ -51,19 +53,20 @@ * [BigQuery](dataSourcesAndSinks/bigquery.md) * [Exasol](dataSourcesAndSinks/exasol.md) * [Working With Python](working-with-python.md) -* [Running Zingg on Cloud](running/running.md) - * [Running on AWS](running/aws.md) - * [Running on Azure](running/azure.md) - * [Running on Databricks](running/databricks.md) + * [Python API](python/markdown/index.md) +* [Running Zingg On Cloud](running/running.md) + * [Running On AWS](running/aws.md) + * [Running On Azure](running/azure.md) + * [Running On Databricks](running/databricks.md) * [Zingg Models](zModels.md) - * [Pre-trained models](pretrainedModels.md) + * [Pre-Trained Models](pretrainedModels.md) * [Improving Accuracy](improving-accuracy/README.md) * [Ignoring Commonly Occuring Words While Matching](accuracy/stopWordsRemoval.md) * [Defining Domain Specific Blocking And Similarity Functions](accuracy/definingOwn.md) * [Documenting The Model](generatingdocumentation.md) * [Interpreting Output Scores](scoring.md) -* [Reporting bugs and contributing](contributing.md) - * [Setting Zingg Development Environment](settingUpZingg.md) +* [Reporting Bugs And Contributing](contributing.md) + * [Setting Up Zingg Development Environment](settingUpZingg.md) * [Community](community.md) * [Frequently Asked Questions](faq.md) * [Reading Material](reading.md) diff --git a/docs/accuracy/definingOwn.md b/docs/accuracy/definingOwn.md index dccc195ef..9e5ad0dcd 100644 --- a/docs/accuracy/definingOwn.md +++ b/docs/accuracy/definingOwn.md @@ -3,27 +3,27 @@ nav_order: 6 description: To add blocking functions and how they work --- -# Defining Own Functions +# Defining Domain Specific Blocking And Similarity Functions You can add your own [blocking functions](https://github.com/zinggAI/zingg/tree/main/common/core/src/main/java/zingg/common/core/hash) which will be evaluated by Zingg to build the [blocking tree.](../zModels.md) -The blocking tree works on the matched records provided by the user as part of the training. 
At every node, it selects the hash function and the field on which it should be applied so that there is the least elimination of the matching pairs. Say we have data like this: +The blocking tree works on the matched records provided by the user as part of the training. At every node, it selects the hash function and the field on which it should be applied so that there is the least elimination of the matching pairs. \ +\ +Say we have data like this: | Pair 1 | firstname | lastname | | :------: | :-------: | :------: | | Record A | john | doe | | Record B | johnh | d oe | -**** +*** | Pair 2 | firstname | lastname | | :-------: | :-------: | :------: | | Rrecord A | mary | ann | | Record B | marry | | - - -Let us assume we have hash function first1char and we want to check if it is a good function to apply to firstname: +Let us assume we have hash function **first1char** and we want to check if it is a good function to apply to **firstname**: | Pair | Record | Output | | :--: | :------: | ------ | @@ -34,9 +34,7 @@ Let us assume we have hash function first1char and we want to check if it is a g There is no elimination in the pairs above, hence it is a good function. - - -Now let us try last1char on firstname +Now let us try **last1char** on **firstname:** | Pair | Record | Output | | :--: | :------: | ------ | @@ -45,12 +43,12 @@ Now let us try last1char on firstname | 2 | Record A | y | | 2 | Record B | y | -Pair 1 is getting eliminated above, hence last1char is not a good function. +Pair 1 is getting eliminated above, hence **last1char** is not a good function. -So, first1char(firstname) will be chosen. This brings near similar records together - in a way, clusters them to break the cartesian join. +So, **first1char**(**firstname**) will be chosen. This brings near similar records together - in a way, clusters them to break the cartesian join. These business-specific blocking functions go into [Hash Functions](https://github.com/zinggAI/zingg/tree/main/common/core/src/main/java/zingg/common/core/hash) and must be added to [HashFunctionRegistry](../../common/core/src/main/java/zingg/common/core/hash/HashFunctionRegistry.java) and [hash functions config](../../common/core/src/main/resources/hashFunctions.json). -Also, for similarity, you can define your own measures. Each dataType has predefined features, for example, [String](../../common/core/src/main/java/zingg/common/core/feature/StringFeature.java) fuzzy type is configured for Affine and Jaro. +Also, for similarity, you can define your own measures. Each **dataType** has predefined features, for example, [String](../../common/core/src/main/java/zingg/common/core/feature/StringFeature.java) fuzzy type is configured for Affine and Jaro. You can define your own [comparisons](https://github.com/zinggAI/zingg/tree/main/common/core/src/main/java/zingg/common/core/similarity/function) and use them. diff --git a/docs/accuracy/stopWordsRemoval.md b/docs/accuracy/stopWordsRemoval.md index 752f7bb22..5ca5ef32f 100644 --- a/docs/accuracy/stopWordsRemoval.md +++ b/docs/accuracy/stopWordsRemoval.md @@ -1,12 +1,10 @@ # Ignoring Commonly Occuring Words While Matching -Common words like Mr, Pvt, Av, St, Street etc do not add differential signal and confuse matching. These words are called stopwords and matching is more accurate when stopwrods are ignored. +Common words like Mr, Pvt, Av, St, Street etc. do not add differential signals and confuse matching. These words are called **stopwords** and matching is more accurate when stopwords are ignored. 
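For illustration, a stop words file for such a field simply lists one word per row; using the terms above it might look like the following (the actual list should be derived from your own data):

```
mr
pvt
av
st
street
```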
-In order to remove stopwords from a field, configure +The stopwords can be recommended by Zingg by invoking: -The stopwords can be recommended by Zingg by invoking - -`./scripts/zingg.sh --phase recommend --conf --columns ` +`./scripts/zingg.sh --phase recommend --conf --columns ` By default, Zingg extracts 10% of the high-frequency unique words from a dataset. If the user wants a different selection, they should set up the following property in the config file: @@ -14,7 +12,7 @@ By default, Zingg extracts 10% of the high-frequency unique words from a dataset stopWordsCutoff: ``` -Once you have verified the above stop words, you can configure them in the JSON variable **stopWords** with the path to the CSV file containing them. Please ensure while editing the CSV or building it manually that it should contain one word per row. +Once you have verified the above stop words, you can configure them in the JSON variable **stopWords** with the path to the CSV file containing them. Please ensure while editing the CSV or building it manually that it should contain _one word per row_. ``` "fieldDefinition":[ @@ -26,4 +24,3 @@ Once you have verified the above stop words, you can configure them in the JSON "stopWords": "models/100/stopWords/fname.csv" }, ``` - diff --git a/docs/connectors/jdbc/mysql.md b/docs/connectors/jdbc/mysql.md index c17ddd89c..e92de17db 100644 --- a/docs/connectors/jdbc/mysql.md +++ b/docs/connectors/jdbc/mysql.md @@ -1,6 +1,6 @@ # MySQL -## Reading from MySQL database: +## Reading From MySQL Database: ```json "data" : [{ @@ -16,4 +16,4 @@ }], ``` -Please replace \ with the name of the database in addition to other props. For more details, refer to the [spark documentation](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). +Please replace `` with the _name_ of the database in addition to other props. For more details, refer to the [Spark documentation](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). diff --git a/docs/connectors/jdbc/postgres.md b/docs/connectors/jdbc/postgres.md index 52670b197..54f560e91 100644 --- a/docs/connectors/jdbc/postgres.md +++ b/docs/connectors/jdbc/postgres.md @@ -1,6 +1,6 @@ # Postgres -## JSON Settings for reading data from Postgres database: +## JSON Settings For Reading Data From Postgres Database: ```json "data" : [{ @@ -16,4 +16,4 @@ }], ``` -Please replace \ with the name of the database in addition to other props. For more details, refer to the [spark documentation](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). +Replace `` with the _name_ of the database in addition to other props. For more details, refer to the [Spark documentation](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). diff --git a/docs/dataSourcesAndSinks/bigquery.md b/docs/dataSourcesAndSinks/bigquery.md index f14201d0e..b66230896 100644 --- a/docs/dataSourcesAndSinks/bigquery.md +++ b/docs/dataSourcesAndSinks/bigquery.md @@ -14,7 +14,7 @@ In addition, the following property needs to be set spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem ``` -If Zingg is run from outside the Google cloud, it requires authentication, please set the following env variable to the location of the file containing the service account key. A service account key can be created and downloaded in JSON format from the [Google Cloud console](https://cloud.google.com/docs/authentication/getting-started). 
+If Zingg is run from outside the Google cloud, it requires authentication, please set the following _environment variable_ to the location of the file containing the _service account key_. A service account key can be created and downloaded in JSON format from the [Google Cloud console](https://cloud.google.com/docs/authentication/getting-started). ```bash export GOOGLE_APPLICATION_CREDENTIALS=path to google service account key file @@ -24,7 +24,7 @@ Connection properties for BigQuery as a data source and data sink are given belo ## Properties for reading data from BigQuery: -The property **"credentialsFile"** should point to the google service account key file location. This is the same path that is used to set variable **GOOGLE\_APPLICATION\_CREDENTIALS**. The **"table"** property should point to a BigQuery table that contains source data. The property **"viewsEnabled"** must be set to true only. +The property `credentialsFile` should point to the Google service account key file location. This is the same path that is used to set variable `GOOGLE_APPLICATION_CREDENTIALS`. The `table` property should point to a BigQuery table that contains source data. The property `viewsEnabled` must be set to **true** only. ```json "data" : [{ @@ -38,9 +38,9 @@ The property **"credentialsFile"** should point to the google service account ke }], ``` -## Properties for writing data to BigQuery: +## Properties For Writing Data To BigQuery: -To write to BigQuery, a bucket needs to be created and assigned to the **"temporaryGcsBucket"** property. +To write to BigQuery, a bucket needs to be created and assigned to the `temporaryGcsBucket` property. ```json "output" : [{ @@ -57,7 +57,7 @@ To write to BigQuery, a bucket needs to be created and assigned to the **"tempor ## Notes: * The library **"gcs-connector-hadoop2-latest.jar"** can be downloaded from [Google](https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar) and the library **"spark-bigquery-with-dependencies\_2.12-0.24.2"** from the [maven repo](https://repo1.maven.org/maven2/com/google/cloud/spark/spark-bigquery-with-dependencies\_2.12/0.24.2/spark-bigquery-with-dependencies\_2.12-0.24.2.jar). -* A typical service account key file looks like the below. The format of the file is JSON. +* A typical service account key file looks like below (JSON). ```json { diff --git a/docs/dataSourcesAndSinks/connectors.md b/docs/dataSourcesAndSinks/connectors.md index 90ea98923..5de25d90a 100644 --- a/docs/dataSourcesAndSinks/connectors.md +++ b/docs/dataSourcesAndSinks/connectors.md @@ -6,9 +6,10 @@ has_children: true # Data Sources and Sinks -Zingg connects, reads, and writes to most on-premise and cloud data sources. - -Zingg can read and write to Databricks, Snowflake, Cassandra, S3, Azure, Elastic, Exasol, major RDBMS, and any Spark-supported data sources. Zingg also works with all major file formats like Parquet, Avro, JSON, XLSX, CSV, TSV, etc. This is done through the Zingg [pipe](pipes.md) abstraction. +Zingg _connects, reads,_ and _writes_ to most on-premise and cloud data sources. +Zingg can read and write to **Databricks, Snowflake, Cassandra, S3, Azure, Elastic, Exasol**, major **RDBMS**, and any **Spark**-supported data sources. \ +\ +Zingg also works with all major file formats like Parquet, Avro, JSON, XLSX, CSV, TSV, etc. This is done through the Zingg [Pipe](pipes.md) abstraction. 
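For example, a CSV source could be declared as a data pipe in the config JSON along these lines; the name, location, and delimiter below are placeholders and follow the same pattern as the JDBC and BigQuery examples elsewhere in these docs:

```json
"data" : [{
    "name": "customers",
    "format": "csv",
    "props": {
        "location": "examples/febrl/test.csv",
        "delimiter": ",",
        "header": "true"
    }
}],
```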
![](../../assets/zinggOSS.png) diff --git a/docs/dataSourcesAndSinks/databricks.md b/docs/dataSourcesAndSinks/databricks.md index 2062ad842..d5d406d90 100644 --- a/docs/dataSourcesAndSinks/databricks.md +++ b/docs/dataSourcesAndSinks/databricks.md @@ -1,5 +1,5 @@ # Databricks -As a Spark based application, Zingg Open Source works seamlessly on Databricks. Zingg leverages Databricks Spark environment, and can access all the supported data sources like parquet and the delta file format. +As a Spark-based application, Zingg Open Source works seamlessly on Databricks. Zingg leverages Databricks' Spark environment, and can access all the supported data sources like parquet and the delta file format. -Please check the various ways in which you can run Zingg On Databricks [here](../running/databricks.md) +Please check the various ways in which you can run Zingg on Databricks [here](../running/databricks.md) diff --git a/docs/dataSourcesAndSinks/exasol.md b/docs/dataSourcesAndSinks/exasol.md index f1d471ef4..215b540ff 100644 --- a/docs/dataSourcesAndSinks/exasol.md +++ b/docs/dataSourcesAndSinks/exasol.md @@ -26,7 +26,7 @@ For example: spark.jars=spark-connector_2.12-1.3.0-spark-3.3.2-assembly.jar ``` -If there are more than one jar files, please use comma as separator. Additionally, please change the version accordingly so that it matches your Zingg and Spark versions. +If there are more than one jar files, please use _comma_ as separator. Additionally, please change the version accordingly so that it matches your Zingg and Spark versions. ## Connector Settings @@ -52,28 +52,28 @@ For example: ... ``` - Similarly, for output: +Similarly, for output: - ```json - ... +```json +... "output": [ - { - "name": "output", - "format": "com.exasol.spark", - "props": { - "host": "10.11.0.2", - "port": "8563", - "username": "sys", - "password": "exasol", - "create_table": "true", - "table": "DB_SCHEMA.ENTITY_RESOLUTION", - }, - "mode": "Append" - } + { + "name": "output", + "format": "com.exasol.spark", + "props": { + "host": "10.11.0.2", + "port": "8563", + "username": "sys", + "password": "exasol", + "create_table": "true", + "table": "DB_SCHEMA.ENTITY_RESOLUTION", + }, + "mode": "Append" + } ], ... ``` -Please note that, the `host` parameter should be the first internal node's IPv4 address. +Please note that, the `host` parameter should be the first internal node's **IPv4** **address**. -As Zingg uses [Exasol Spark connector](https://github.com/exasol/spark-connector) underneath, please also check out the [user guide](https://github.com/exasol/spark-connector/blob/main/doc/user_guide/user_guide.md) and [configuration options](https://github.com/exasol/spark-connector/blob/main/doc/user_guide/user_guide.md#configuration-options) for more information. +As Zingg uses [Exasol Spark connector](https://github.com/exasol/spark-connector) underneath, please also check out the [user guide](https://github.com/exasol/spark-connector/blob/main/doc/user\_guide/user\_guide.md) and [configuration options](https://github.com/exasol/spark-connector/blob/main/doc/user\_guide/user\_guide.md#configuration-options) for more information. diff --git a/docs/dataSourcesAndSinks/jdbc.md b/docs/dataSourcesAndSinks/jdbc.md index cbd25c220..d86f21449 100644 --- a/docs/dataSourcesAndSinks/jdbc.md +++ b/docs/dataSourcesAndSinks/jdbc.md @@ -1,6 +1,6 @@ -# Jdbc +# JDBC -Zingg can connect to various databases such as Mysql, DB2, MariaDB, MS SQL, Oracle, PostgreSQL, etc. using JDBC. 
One just needs to download the appropriate driver and made it accessible to the application. +Zingg can connect to various databases such as MySQL, DB2, MariaDB, MS SQL, Oracle, PostgreSQL, etc. using JDBC. One just needs to download the appropriate driver and made it accessible to the application. To include the JDBC driver for your particular database on the Spark classpath, please add the property **spark.jars** in [Zingg's runtime properties.](../stepbystep/zingg-runtime-properties.md) @@ -8,5 +8,4 @@ To include the JDBC driver for your particular database on the Spark classpath, spark.jars= ``` -Connection details are given in the following sections for a few common JDBC sources. - +Connection details are given in the following sections for a few common JDBC sources. diff --git a/docs/dataSourcesAndSinks/pipes.md b/docs/dataSourcesAndSinks/pipes.md index c928c01f0..a786b639d 100644 --- a/docs/dataSourcesAndSinks/pipes.md +++ b/docs/dataSourcesAndSinks/pipes.md @@ -6,9 +6,9 @@ nav_order: 4 # Zingg Pipes -Zingg Pipes are an abstraction for a data source from which Zingg fetches data for matching or to which Zingg writes its output. This lets users connect to literally any datastore that has a Spark connector. +Zingg Pipes are an _abstraction_ for a data source from which Zingg fetches data for matching or to which Zingg writes its output. This lets users connect to literally any datastore that has a Spark connector. -The pipe is an easy way to specify properties and formats for the Spark connector of the relevant data source. Zingg pipes can be configured through the config [JSON](../stepbystep/configuration/) passed to the program by outlining the datastore connection properties. +The pipe is an easy way to specify _properties_ and _formats_ for the Spark connector of the relevant data source. Zingg pipes can be configured through the config [JSON](../stepbystep/configuration/) passed to the program by outlining the datastore connection properties. Pipes can be configured for the data or the output attributes on the [JSON](../stepbystep/configuration/). @@ -20,10 +20,10 @@ A unique name to identify the data store. ## format -One of the Spark-supported connector formats. jdbc/avro/parquet etc. +One of the Spark-supported connector formats - jdbc/avro/parquet etc. ## options -Properties to be passed to spark.read and spark.write. +Properties to be passed to **spark.read** and **spark.write.** Let us look at some common data sources and their configurations. diff --git a/docs/dataSourcesAndSinks/snowflake.md b/docs/dataSourcesAndSinks/snowflake.md index e5de99940..0bad2c0ac 100644 --- a/docs/dataSourcesAndSinks/snowflake.md +++ b/docs/dataSourcesAndSinks/snowflake.md @@ -28,7 +28,7 @@ The config value for the data and output attributes of the JSON is: } ] ``` -One must include Snowflake JDBC driver and Spark dependency on the classpath. The jars can be downloaded from the maven repository ([1](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc), [2](https://mvnrepository.com/artifact/net.snowflake/spark-snowflake)). +One must include Snowflake JDBC driver and Spark dependency on the classpath. The jars can be downloaded from the **maven** repository ([1](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc), [2](https://mvnrepository.com/artifact/net.snowflake/spark-snowflake)). 
``` spark.jars=snowflake-jdbc-3.13.18.jar,spark-snowflake_2.12-2.10.0-spark_3.1.jar @@ -36,4 +36,4 @@ spark.jars=snowflake-jdbc-3.13.18.jar,spark-snowflake_2.12-2.10.0-spark_3.1.jar For Zingg to discover the Snowflake jars, please add the property **spark.jars** in [Zingg's runtime properties.](../stepbystep/zingg-runtime-properties.md) -If you are looking for a native-run on Snowflake without using Spark, check [Zingg Enterprise Snowflake](https://www.zingg.ai/company/zingg-enterprise-snowflake). +If you are looking for a native-run on Snowflake without using Spark, check [Zingg Enterprise Snowflake](https://www.zingg.ai/company/zingg-enterprise-snowflake). diff --git a/docs/deterministicMatching.md b/docs/deterministicMatching.md new file mode 100644 index 000000000..a0cba2906 --- /dev/null +++ b/docs/deterministicMatching.md @@ -0,0 +1,42 @@ +# Deterministic Matching + +### Deterministic Matching - _Zingg Enterprise Feature_ + +Zingg Enterprise allows the ability to plug rule-based deterministic matching along with already Zingg AI's probabilistic matching. If the data contains _sure_ identifiers like emails, SSNs, passport-ids etc, we can use these attributes to resolve records.\ +\ +The deterministic matching flow is weaved into Zingg's flow to ensure that each record which has a match finds one, probabilistically, deterministically or both. If the data has known identifiers, Zingg Enterprise's deterministic matching highly improves both matching accuracy and performance. + +### Example For Configuring In JSON: + +```json + "deterministicMatching":[ + { + "matchCondition":[{"fieldName":"fname"},{"fieldName":"stNo"},{"fieldName":"add1"}] + }, + { + "matchCondition":[{"fieldName":"fname"},{"fieldName":"dob"},{"fieldName":"ssn"}] + }, + { + "matchCondition":[{"fieldName":"fname"},{"fieldName":"email"}] + } + ] +``` + +#### Python Code Example: + +```{python} +detMatchNameAdd = DeterministicMatching('fname','stNo','add1') +detMatchNameDobSsn = DeterministicMatching('fname','dob','ssn') +detMatchNameEmail = DeterministicMatching('fname','email') +args.setDeterministicMatchingCondition(detMatchNameAdd,detMatchNameDobSsn,detMatchNameEmail) +``` + +#### How Will It Work: + +The above conditions would translate into the following: + +1. Those rows which have **exactly** same `fname`, `stNo` and `add1` => exact match with max score 1\ + _OR_ +2. Those rows which have **exactly** same `fname`, `dob` and `ssn` => exact match with max score 1\ + _OR_ +3. Those rows which have **exactly** same `fname` and `email` => exact match with max score 1 diff --git a/docs/generatingdocumentation.md b/docs/generatingdocumentation.md index 90f7e21d9..ddac962ae 100644 --- a/docs/generatingdocumentation.md +++ b/docs/generatingdocumentation.md @@ -1,12 +1,11 @@ # Documenting The Model -Zingg generates readable documentation about the training data, including those marked as matches, as well as non-matches. The documentation is written to the zinggDir/modelId folder and can be built using the following +Zingg generates readable documentation about the training data, including those marked as matches, as well as non-matches. The documentation is written to the **zinggDir/modelId** folder and can be built using the following ``` ./scripts/zingg.sh --phase generateDocs --conf ``` -The generated documentation file can be viewed in a browser and looks like as below. 
+The generated documentation file can be viewed in a browser and looks like as below: ![Training Data](../assets/documentation1.png) - diff --git a/docs/python/doctrees/environment.pickle b/docs/python/doctrees/environment.pickle new file mode 100644 index 000000000..60da21b6d Binary files /dev/null and b/docs/python/doctrees/environment.pickle differ diff --git a/docs/python/doctrees/index.doctree b/docs/python/doctrees/index.doctree new file mode 100644 index 000000000..399dbb7c4 Binary files /dev/null and b/docs/python/doctrees/index.doctree differ diff --git a/docs/python/doctrees/zingg.doctree b/docs/python/doctrees/zingg.doctree new file mode 100644 index 000000000..4d5ef1240 Binary files /dev/null and b/docs/python/doctrees/zingg.doctree differ diff --git a/docs/python/markdown/index.md b/docs/python/markdown/index.md new file mode 100644 index 000000000..9f14d1aaa --- /dev/null +++ b/docs/python/markdown/index.md @@ -0,0 +1,179 @@ +# Python API + +## Zingg Entity Resolution Python Package + +Zingg Python APIs for entity resolution, identity resolution, record linkage, data mastering and deduplication using ML ([https://www.zingg.ai](https://www.zingg.ai)) + +**NOTE** + +Requires **python 3.6+**; **spark 3.5.0.** Otherwise, [`zingg.client.Zingg()`](zingg.md#zingg.client.Zingg) cannot be executed + +* [Zingg Entity Resolution Package](zingg.md) + * [zingg.client](zingg.md#zingg-client) + * [`Arguments`](zingg.md#zingg.client.Arguments) + * [`Arguments.copyArgs()`](zingg.md#zingg.client.Arguments.copyArgs) + * [`Arguments.createArgumentsFromJSON()`](zingg.md#zingg.client.Arguments.createArgumentsFromJSON) + * [`Arguments.createArgumentsFromJSONString()`](zingg.md#zingg.client.Arguments.createArgumentsFromJSONString) + * [`Arguments.getArgs()`](zingg.md#zingg.client.Arguments.getArgs) + * [`Arguments.getModelId()`](zingg.md#zingg.client.Arguments.getModelId) + * [`Arguments.getZinggBaseModelDir()`](zingg.md#zingg.client.Arguments.getZinggBaseModelDir) + * [`Arguments.getZinggBaseTrainingDataDir()`](zingg.md#zingg.client.Arguments.getZinggBaseTrainingDataDir) + * [`Arguments.getZinggModelDir()`](zingg.md#zingg.client.Arguments.getZinggModelDir) + * [`Arguments.getZinggTrainingDataMarkedDir()`](zingg.md#zingg.client.Arguments.getZinggTrainingDataMarkedDir) + * [`Arguments.getZinggTrainingDataUnmarkedDir()`](zingg.md#zingg.client.Arguments.getZinggTrainingDataUnmarkedDir) + * [`Arguments.setArgs()`](zingg.md#zingg.client.Arguments.setArgs) + * [`Arguments.setColumn()`](zingg.md#zingg.client.Arguments.setColumn) + * [`Arguments.setData()`](zingg.md#zingg.client.Arguments.setData) + * [`Arguments.setFieldDefinition()`](zingg.md#zingg.client.Arguments.setFieldDefinition) + * [`Arguments.setLabelDataSampleSize()`](zingg.md#zingg.client.Arguments.setLabelDataSampleSize) + * [`Arguments.setModelId()`](zingg.md#zingg.client.Arguments.setModelId) + * [`Arguments.setNumPartitions()`](zingg.md#zingg.client.Arguments.setNumPartitions) + * [`Arguments.setOutput()`](zingg.md#zingg.client.Arguments.setOutput) + * [`Arguments.setStopWordsCutoff()`](zingg.md#zingg.client.Arguments.setStopWordsCutoff) + * [`Arguments.setTrainingSamples()`](zingg.md#zingg.client.Arguments.setTrainingSamples) + * [`Arguments.setZinggDir()`](zingg.md#zingg.client.Arguments.setZinggDir) + * [`Arguments.writeArgumentsToJSON()`](zingg.md#zingg.client.Arguments.writeArgumentsToJSON) + * [`Arguments.writeArgumentsToJSONString()`](zingg.md#zingg.client.Arguments.writeArgumentsToJSONString) + * 
[`ClientOptions`](zingg.md#zingg.client.ClientOptions) + * [`ClientOptions.COLUMN`](zingg.md#zingg.client.ClientOptions.COLUMN) + * [`ClientOptions.CONF`](zingg.md#zingg.client.ClientOptions.CONF) + * [`ClientOptions.EMAIL`](zingg.md#zingg.client.ClientOptions.EMAIL) + * [`ClientOptions.LICENSE`](zingg.md#zingg.client.ClientOptions.LICENSE) + * [`ClientOptions.LOCATION`](zingg.md#zingg.client.ClientOptions.LOCATION) + * [`ClientOptions.MODEL_ID`](zingg.md#zingg.client.ClientOptions.MODEL\_ID) + * [`ClientOptions.PHASE`](zingg.md#zingg.client.ClientOptions.PHASE) + * [`ClientOptions.REMOTE`](zingg.md#zingg.client.ClientOptions.REMOTE) + * [`ClientOptions.ZINGG_DIR`](zingg.md#zingg.client.ClientOptions.ZINGG\_DIR) + * [`ClientOptions.getClientOptions()`](zingg.md#zingg.client.ClientOptions.getClientOptions) + * [`ClientOptions.getConf()`](zingg.md#zingg.client.ClientOptions.getConf) + * [`ClientOptions.getLocation()`](zingg.md#zingg.client.ClientOptions.getLocation) + * [`ClientOptions.getOptionValue()`](zingg.md#zingg.client.ClientOptions.getOptionValue) + * [`ClientOptions.getPhase()`](zingg.md#zingg.client.ClientOptions.getPhase) + * [`ClientOptions.hasLocation()`](zingg.md#zingg.client.ClientOptions.hasLocation) + * [`ClientOptions.setOptionValue()`](zingg.md#zingg.client.ClientOptions.setOptionValue) + * [`ClientOptions.setPhase()`](zingg.md#zingg.client.ClientOptions.setPhase) + * [`FieldDefinition`](zingg.md#zingg.client.FieldDefinition) + * [`FieldDefinition.getFieldDefinition()`](zingg.md#zingg.client.FieldDefinition.getFieldDefinition) + * [`FieldDefinition.setStopWords()`](zingg.md#zingg.client.FieldDefinition.setStopWords) + * [`FieldDefinition.stringify()`](zingg.md#zingg.client.FieldDefinition.stringify) + * [`Zingg`](zingg.md#zingg.client.Zingg) + * [`Zingg.execute()`](zingg.md#zingg.client.Zingg.execute) + * [`Zingg.executeLabel()`](zingg.md#zingg.client.Zingg.executeLabel) + * [`Zingg.executeLabelUpdate()`](zingg.md#zingg.client.Zingg.executeLabelUpdate) + * [`Zingg.getArguments()`](zingg.md#zingg.client.Zingg.getArguments) + * [`Zingg.getMarkedRecords()`](zingg.md#zingg.client.Zingg.getMarkedRecords) + * [`Zingg.getMarkedRecordsStat()`](zingg.md#zingg.client.Zingg.getMarkedRecordsStat) + * [`Zingg.getMatchedMarkedRecordsStat()`](zingg.md#zingg.client.Zingg.getMatchedMarkedRecordsStat) + * [`Zingg.getOptions()`](zingg.md#zingg.client.Zingg.getOptions) + * [`Zingg.getUnmarkedRecords()`](zingg.md#zingg.client.Zingg.getUnmarkedRecords) + * [`Zingg.getUnmatchedMarkedRecordsStat()`](zingg.md#zingg.client.Zingg.getUnmatchedMarkedRecordsStat) + * [`Zingg.getUnsureMarkedRecordsStat()`](zingg.md#zingg.client.Zingg.getUnsureMarkedRecordsStat) + * [`Zingg.init()`](zingg.md#zingg.client.Zingg.init) + * [`Zingg.initAndExecute()`](zingg.md#zingg.client.Zingg.initAndExecute) + * [`Zingg.processRecordsCli()`](zingg.md#zingg.client.Zingg.processRecordsCli) + * [`Zingg.processRecordsCliLabelUpdate()`](zingg.md#zingg.client.Zingg.processRecordsCliLabelUpdate) + * [`Zingg.setArguments()`](zingg.md#zingg.client.Zingg.setArguments) + * [`Zingg.setOptions()`](zingg.md#zingg.client.Zingg.setOptions) + * [`Zingg.writeLabelledOutput()`](zingg.md#zingg.client.Zingg.writeLabelledOutput) + * [`Zingg.writeLabelledOutputFromPandas()`](zingg.md#zingg.client.Zingg.writeLabelledOutputFromPandas) + * [`ZinggWithSpark`](zingg.md#zingg.client.ZinggWithSpark) + * [`getDfFromDs()`](zingg.md#zingg.client.getDfFromDs) + * [`getGateway()`](zingg.md#zingg.client.getGateway) + * 
[`getJVM()`](zingg.md#zingg.client.getJVM) + * [`getPandasDfFromDs()`](zingg.md#zingg.client.getPandasDfFromDs) + * [`getSparkContext()`](zingg.md#zingg.client.getSparkContext) + * [`getSparkSession()`](zingg.md#zingg.client.getSparkSession) + * [`getSqlContext()`](zingg.md#zingg.client.getSqlContext) + * [`initClient()`](zingg.md#zingg.client.initClient) + * [`initDataBricksConectClient()`](zingg.md#zingg.client.initDataBricksConectClient) + * [`initSparkClient()`](zingg.md#zingg.client.initSparkClient) + * [`parseArguments()`](zingg.md#zingg.client.parseArguments) + * [zingg.pipes](zingg.md#zingg-pipes) + * [`BigQueryPipe`](zingg.md#zingg.pipes.BigQueryPipe) + * [`BigQueryPipe.CREDENTIAL_FILE`](zingg.md#zingg.pipes.BigQueryPipe.CREDENTIAL\_FILE) + * [`BigQueryPipe.TABLE`](zingg.md#zingg.pipes.BigQueryPipe.TABLE) + * [`BigQueryPipe.TEMP_GCS_BUCKET`](zingg.md#zingg.pipes.BigQueryPipe.TEMP\_GCS\_BUCKET) + * [`BigQueryPipe.VIEWS_ENABLED`](zingg.md#zingg.pipes.BigQueryPipe.VIEWS\_ENABLED) + * [`BigQueryPipe.setCredentialFile()`](zingg.md#zingg.pipes.BigQueryPipe.setCredentialFile) + * [`BigQueryPipe.setTable()`](zingg.md#zingg.pipes.BigQueryPipe.setTable) + * [`BigQueryPipe.setTemporaryGcsBucket()`](zingg.md#zingg.pipes.BigQueryPipe.setTemporaryGcsBucket) + * [`BigQueryPipe.setViewsEnabled()`](zingg.md#zingg.pipes.BigQueryPipe.setViewsEnabled) + * [`CsvPipe`](zingg.md#zingg.pipes.CsvPipe) + * [`CsvPipe.setDelimiter()`](zingg.md#zingg.pipes.CsvPipe.setDelimiter) + * [`CsvPipe.setHeader()`](zingg.md#zingg.pipes.CsvPipe.setHeader) + * [`CsvPipe.setLocation()`](zingg.md#zingg.pipes.CsvPipe.setLocation) + * [`InMemoryPipe`](zingg.md#zingg.pipes.InMemoryPipe) + * [`InMemoryPipe.getDataset()`](zingg.md#zingg.pipes.InMemoryPipe.getDataset) + * [`InMemoryPipe.setDataset()`](zingg.md#zingg.pipes.InMemoryPipe.setDataset) + * [`Pipe`](zingg.md#zingg.pipes.Pipe) + * [`Pipe.addProperty()`](zingg.md#zingg.pipes.Pipe.addProperty) + * [`Pipe.getPipe()`](zingg.md#zingg.pipes.Pipe.getPipe) + * [`Pipe.setSchema()`](zingg.md#zingg.pipes.Pipe.setSchema) + * [`Pipe.toString()`](zingg.md#zingg.pipes.Pipe.toString) + * [`SnowflakePipe`](zingg.md#zingg.pipes.SnowflakePipe) + * [`SnowflakePipe.DATABASE`](zingg.md#zingg.pipes.SnowflakePipe.DATABASE) + * [`SnowflakePipe.DBTABLE`](zingg.md#zingg.pipes.SnowflakePipe.DBTABLE) + * [`SnowflakePipe.PASSWORD`](zingg.md#zingg.pipes.SnowflakePipe.PASSWORD) + * [`SnowflakePipe.SCHEMA`](zingg.md#zingg.pipes.SnowflakePipe.SCHEMA) + * [`SnowflakePipe.URL`](zingg.md#zingg.pipes.SnowflakePipe.URL) + * [`SnowflakePipe.USER`](zingg.md#zingg.pipes.SnowflakePipe.USER) + * [`SnowflakePipe.WAREHOUSE`](zingg.md#zingg.pipes.SnowflakePipe.WAREHOUSE) + * [`SnowflakePipe.setDatabase()`](zingg.md#zingg.pipes.SnowflakePipe.setDatabase) + * [`SnowflakePipe.setDbTable()`](zingg.md#zingg.pipes.SnowflakePipe.setDbTable) + * [`SnowflakePipe.setPassword()`](zingg.md#zingg.pipes.SnowflakePipe.setPassword) + * [`SnowflakePipe.setSFSchema()`](zingg.md#zingg.pipes.SnowflakePipe.setSFSchema) + * [`SnowflakePipe.setURL()`](zingg.md#zingg.pipes.SnowflakePipe.setURL) + * [`SnowflakePipe.setUser()`](zingg.md#zingg.pipes.SnowflakePipe.setUser) + * [`SnowflakePipe.setWarehouse()`](zingg.md#zingg.pipes.SnowflakePipe.setWarehouse) + +## API Reference + +* [Module Index](py-modindex.md) +* [Index](genindex.md) +* [Search Page](search.md) + +## Example API Usage + +```python +from zingg.client import * +from zingg.pipes import * + +#build the arguments for zingg +args = Arguments() +#set field definitions +fname = 
FieldDefinition("fname", "string", MatchType.FUZZY) +lname = FieldDefinition("lname", "string", MatchType.FUZZY) +stNo = FieldDefinition("stNo", "string", MatchType.FUZZY) +add1 = FieldDefinition("add1","string", MatchType.FUZZY) +add2 = FieldDefinition("add2", "string", MatchType.FUZZY) +city = FieldDefinition("city", "string", MatchType.FUZZY) +areacode = FieldDefinition("areacode", "string", MatchType.FUZZY) +state = FieldDefinition("state", "string", MatchType.FUZZY) +dob = FieldDefinition("dob", "string", MatchType.FUZZY) +ssn = FieldDefinition("ssn", "string", MatchType.FUZZY) + +fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn] + +args.setFieldDefinition(fieldDefs) +#set the modelid and the zingg dir +args.setModelId("100") +args.setZinggDir("models") +args.setNumPartitions(4) +args.setLabelDataSampleSize(0.5) + +#reading dataset into inputPipe and settint it up in 'args' +#below line should not be required if you are reading from in memory dataset +#in that case, replace df with input df +schema = "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" +inputPipe = CsvPipe("testFebrl", "examples/febrl/test.csv", schema) +args.setData(inputPipe) +outputPipe = CsvPipe("resultFebrl", "/tmp/febrlOutput") + +args.setOutput(outputPipe) + +options = ClientOptions([ClientOptions.PHASE,"match"]) + +#Zingg execution for the given phase +zingg = Zingg(args, options) +zingg.initAndExecute() +``` diff --git a/docs/python/markdown/zingg.md b/docs/python/markdown/zingg.md new file mode 100644 index 000000000..9467c6751 --- /dev/null +++ b/docs/python/markdown/zingg.md @@ -0,0 +1,785 @@ +# Zingg Entity Resolution Package + +Zingg Python APIs for entity resolution, record linkage, data mastering and deduplication using ML +([https://www.zingg.ai](https://www.zingg.ai)) + +requires python 3.6+; spark 3.5.0 +Otherwise, [`zingg.client.Zingg()`](#zingg.client.Zingg) cannot be executed + + + + + +## zingg.client + +This module is the main entry point of the Zingg Python API + +### *class* zingg.client.Arguments + +Bases: `object` + +This class helps supply match arguments to Zingg. There are 3 basic steps in any match process. + +* **Defining:** + specifying information about data location, fields, and our notion of similarity. +* **Training:** + making Zingg learn the matching rules +* **Matching:** + Running the models on the entire dataset + +#### copyArgs(phase) + +#### *static* createArgumentsFromJSON(fileName, phase) + +Method to create an object of this class from the JSON file and phase parameter value. 
+ +* **Parameters:** + * **fileName** (*String*) – The CONF parameter value of ClientOption object + * **phase** (*String*) – The PHASE parameter value of ClientOption object +* **Returns:** + The pointer containing address of the this class object +* **Return type:** + pointer([Arguments](#zingg.client.Arguments)) + +#### *static* createArgumentsFromJSONString(jsonArgs, phase) + +#### getArgs() + +Method to get pointer address of this class + +* **Returns:** + The pointer containing the address of this class object +* **Return type:** + pointer([Arguments](#zingg.client.Arguments)) + +#### getModelId() + +#### getZinggBaseModelDir() + +#### getZinggBaseTrainingDataDir() + +Method to get the location of the folder where Zingg +saves the training data found by findTrainingData + +#### getZinggModelDir() + +#### getZinggTrainingDataMarkedDir() + +Method to get the location of the folder where Zingg +saves the marked training data labeled by the user + +#### getZinggTrainingDataUnmarkedDir() + +Method to get the location of the folder where Zingg +saves the training data found by findTrainingData + +#### setArgs(argumentsObj) + +Method to set this class object + +* **Parameters:** + **argumentsObj** (*pointer**(*[*Arguments*](#zingg.client.Arguments)*)*) – Argument object to set this object + +#### setColumn(column) + +Method to set stopWordsCutoff parameter value +By default, Zingg extracts 10% of the high frequency unique words from a dataset. If user wants different selection, they should set up StopWordsCutoff property + +* **Parameters:** + **stopWordsCutoff** (*float*) – The stop words cutoff parameter value of ClientOption object or file address of json file + +#### setData(\*pipes) + +Method to set the file path of the file to be matched. + +* **Parameters:** + **pipes** ([*Pipe*](#zingg.pipes.Pipe)*[**]*) – input data pipes separated by comma e.g. (pipe1,pipe2,..) + +#### setFieldDefinition(fieldDef) + +Method convert python objects to java FieldDefinition objects and set the field definitions associated with this client + +* **Parameters:** + **fieldDef** (*List**(*[*FieldDefinition*](#zingg.client.FieldDefinition)*)*) – python FieldDefinition object list + +#### setLabelDataSampleSize(labelDataSampleSize) + +Method to set labelDataSampleSize parameter value +Set the fraction of data to be used from the complete data set to be used for seeding the labeled data Labelling is costly and we want a fast approximate way of looking at a small sample of the records and identifying expected matches and nonmatches + +* **Parameters:** + **labelDataSampleSize** (*float*) – value between 0.0 and 1.0 denoting portion of dataset to use in generating seed samples + +#### setModelId(id) + +Method to set the output directory where the match output will be saved + +* **Parameters:** + **id** (*String*) – model id value + +#### setNumPartitions(numPartitions) + +Method to set NumPartitions parameter value +Sample size to use for seeding labeled data We don’t want to run over all the data, as we want a quick way to seed some labeled data that we can manually edit + +* **Parameters:** + **numPartitions** (*int*) – number of partitions for given data pipes + +#### setOutput(\*pipes) + +Method to set the output directory where the match result will be saved + +* **Parameters:** + **pipes** ([*Pipe*](#zingg.pipes.Pipe)*[**]*) – output data pipes separated by comma e.g. (pipe1,pipe2,..) 
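As a rough illustration of how setData() and setOutput() fit together (the pipe names, file paths, and schema below are placeholders):

```python
from zingg.client import Arguments
from zingg.pipes import CsvPipe

args = Arguments()
schema = "id string, fname string, lname string"  # placeholder schema
# one or more input pipes can be passed; output pipes work the same way
args.setData(CsvPipe("inputData", "examples/febrl/test.csv", schema))
args.setOutput(CsvPipe("matchOutput", "/tmp/febrlOutput"))
```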
+ +#### setStopWordsCutoff(stopWordsCutoff) + +Method to set stopWordsCutoff parameter value +By default, Zingg extracts 10% of the high frequency unique words from a dataset. If user wants different selection, they should set up StopWordsCutoff property + +* **Parameters:** + **stopWordsCutoff** (*float*) – The stop words cutoff parameter value of ClientOption object or file address of json file + +#### setTrainingSamples(\*pipes) + +Method to set existing training samples to be matched. + +* **Parameters:** + **pipes** ([*Pipe*](#zingg.pipes.Pipe)*[**]*) – input training data pipes separated by comma e.g. (pipe1,pipe2,..) + +#### setZinggDir(f) + +Method to set the location for Zingg to save its internal computations and models. Please set it to a place where the program has to write access. + +* **Parameters:** + **f** (*String*) – Zingg directory name of the models + +#### writeArgumentsToJSON(fileName) + +Method to write JSON file from the object of this class + +* **Parameters:** + **fileName** (*String*) – The CONF parameter value of ClientOption object or file address of json file + +#### writeArgumentsToJSONString() + +Method to create an object of this class from the JSON file and phase parameter value. + +* **Parameters:** + * **fileName** (*String*) – The CONF parameter value of ClientOption object + * **phase** (*String*) – The PHASE parameter value of ClientOption object +* **Returns:** + The pointer containing address of the this class object +* **Return type:** + pointer([Arguments](#zingg.client.Arguments)) + +### *class* zingg.client.ClientOptions(argsSent=None) + +Bases: `object` + +Class that contains Client options for Zingg object +:param phase: trainMatch, train, match, link, findAndLabel, findTrainingData, recommend etc +:type phase: String +:param args: Parse a list of Zingg command line options parameter values e.g. “–location” etc. optional argument for initializing this class. 
+:type args: List(String) or None + +#### COLUMN *= * + +Column whose stop words are to be recommended through Zingg + +* **Type:** + COLUMN + +#### CONF *= * + +conf parameter for this class + +* **Type:** + CONF + +#### EMAIL *= * + +e-mail parameter for this class + +* **Type:** + EMAIL + +#### LICENSE *= * + +license parameter for this class + +* **Type:** + LICENSE + +#### LOCATION *= * + +location parameter for this class + +* **Type:** + LOCATION + +#### MODEL_ID *= * + +ZINGG_DIR/MODEL_ID is used to save the model + +* **Type:** + MODEL_ID + +#### PHASE *= * + +phase parameter for this class + +* **Type:** + PHASE + +#### REMOTE *= * + +remote option used internally for running on Databricks + +* **Type:** + REMOTE + +#### ZINGG_DIR *= * + +location where Zingg saves the model, training data etc + +* **Type:** + ZINGG_DIR + +#### getClientOptions() + +Method to get pointer address of this class + +* **Returns:** + The pointer containing address of the this class object +* **Return type:** + pointer([ClientOptions](#zingg.client.ClientOptions)) + +#### getConf() + +Method to get CONF value + +* **Returns:** + The CONF parameter value +* **Return type:** + String + +#### getLocation() + +Method to get LOCATION value + +* **Returns:** + The LOCATION parameter value +* **Return type:** + String + +#### getOptionValue(option) + +Method to get value for the key option + +* **Parameters:** + **option** (*String*) – key to geting the value +* **Returns:** + The value which is mapped for given key +* **Return type:** + String + +#### getPhase() + +Method to get PHASE value + +* **Returns:** + The PHASE parameter value +* **Return type:** + String + +#### hasLocation() + +Method to check if this class has LOCATION parameter set as None or not + +* **Returns:** + The boolean value if LOCATION parameter is present or not +* **Return type:** + Bool + +#### setOptionValue(option, value) + +Method to map option key to the given value + +* **Parameters:** + * **option** (*String*) – key that is mapped with value + * **value** (*String*) – value to be set for given key + +#### setPhase(newValue) + +Method to set PHASE value + +* **Parameters:** + **newValue** (*String*) – name of the phase +* **Returns:** + The pointer containing address of the this class object after seting phase +* **Return type:** + pointer([ClientOptions](#zingg.client.ClientOptions)) + +### *class* zingg.client.FieldDefinition(name, dataType, \*matchType) + +Bases: `object` + +This class defines each field that we use in matching We can use this to configure the properties of each field we use for matching in Zingg. + +* **Parameters:** + * **name** (*String*) – name of the field + * **dataType** (*String*) – type of the data e.g. string, float, etc. + * **matchType** (*MatchType*) – match type of this field e.g. FUSSY, EXACT, etc. 
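A short sketch of building field definitions with this class; the field names come from the example dataset used elsewhere in these docs, and the stop words path is optional:

```python
from zingg.client import *

fname = FieldDefinition("fname", "string", MatchType.FUZZY)
fname.setStopWords("models/100/stopWords/fname.csv")  # optional: CSV with one stop word per row
ssn = FieldDefinition("ssn", "string", MatchType.EXACT)
fieldDefs = [fname, ssn]
```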
+ +#### getFieldDefinition() + +Method to get pointer address of this class + +* **Returns:** + The pointer containing the address of this class object +* **Return type:** + pointer([FieldDefinition](#zingg.client.FieldDefinition)) + +#### setStopWords(stopWords) + +Method to add stopwords to this class object + +* **Parameters:** + **stopWords** (*String*) – The stop Words containing csv file’s location + +#### stringify(str) + +Method to stringify’ed the dataType before it is set in FieldDefinition object + +* **Parameters:** + **str** (*String*) – dataType of the FieldDefinition +* **Returns:** + The stringify’ed value of the dataType +* **Return type:** + String + +### *class* zingg.client.Zingg(args, options) + +Bases: `object` + +This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. + +* **Parameters:** + * **args** ([*Arguments*](#zingg.client.Arguments)) – arguments for training and matching + * **options** ([*ClientOptions*](#zingg.client.ClientOptions)) – client option for this class object + +#### execute() + +Method to execute this class object + +#### executeLabel() + +Method to run label phase + +#### executeLabelUpdate() + +Method to run label update phase + +#### getArguments() + +Method to get atguments of this class object + +* **Returns:** + The pointer containing address of the Arguments object of this class object +* **Return type:** + pointer([Arguments](#zingg.client.Arguments)) + +#### getMarkedRecords() + +Method to get marked record dataset from the inputpipe + +* **Returns:** + spark dataset containing marked records +* **Return type:** + Dataset + +#### getMarkedRecordsStat(markedRecords, value) + +Method to get No. of records that is marked + +* **Parameters:** + * **markedRecords** (*Dataset*) – spark dataset containing marked records + * **value** (*long*) – flag value to check if markedRecord is initially matched or not +* **Returns:** + The no. of marked records +* **Return type:** + int + +#### getMatchedMarkedRecordsStat() + +Method to get No. of records that are marked and matched + +* **Returns:** + The bo. of matched marked records +* **Return type:** + int + +#### getOptions() + +Method to get client options of this class object + +* **Returns:** + The pointer containing the address of the ClientOptions object of this class object +* **Return type:** + pointer([ClientOptions](#zingg.client.ClientOptions)) + +#### getUnmarkedRecords() + +Method to get unmarked record dataset from the inputpipe + +* **Returns:** + spark dataset containing unmarked records +* **Return type:** + Dataset + +#### getUnmatchedMarkedRecordsStat() + +Method to get No. of records that are marked and unmatched + +* **Returns:** + The no. of unmatched marked records +* **Return type:** + int + +#### getUnsureMarkedRecordsStat() + +Method to get No. of records that are marked and Not Sure if its matched or not + +* **Returns:** + The no. 
of Not Sure marked records +* **Return type:** + int + +#### init() + +Method to initialize zingg client by reading internal configurations and functions + +#### initAndExecute() + +Method to run both init and execute methods consecutively + +#### processRecordsCli(unmarkedRecords, args) + +Method to get user input on unmarked records + +* **Returns:** + spark dataset containing updated records +* **Return type:** + Dataset + +#### processRecordsCliLabelUpdate(lines, args) + +#### setArguments(args) + +Method to set Arguments + +* **Parameters:** + **args** ([*Arguments*](#zingg.client.Arguments)) – provide arguments for this class object + +#### setOptions(options) + +Method to set atguments of this class object + +* **Parameters:** + **options** ([*ClientOptions*](#zingg.client.ClientOptions)) – provide client options for this class object +* **Returns:** + The pointer containing address of the ClientOptions object of this class object +* **Return type:** + pointer([ClientOptions](#zingg.client.ClientOptions)) + +#### writeLabelledOutput(updatedRecords, args) + +Method to write updated records after user input + +#### writeLabelledOutputFromPandas(candidate_pairs_pd, args) + +Method to write updated records (as pandas df) after user input + +### *class* zingg.client.ZinggWithSpark(args, options) + +Bases: [`Zingg`](#zingg.client.Zingg) + +This class is the main point of interface with the Zingg matching product. Construct a client to Zingg using provided arguments and spark master. If running locally, set the master to local. + +* **Parameters:** + * **args** ([*Arguments*](#zingg.client.Arguments)) – arguments for training and matching + * **options** ([*ClientOptions*](#zingg.client.ClientOptions)) – client option for this class object + +### zingg.client.getDfFromDs(data) + +Method to convert spark dataset to dataframe + +* **Parameters:** + **data** (*DataSet*) – provide spark dataset +* **Returns:** + converted spark dataframe +* **Return type:** + DataFrame + +### zingg.client.getGateway() + +### zingg.client.getJVM() + +### zingg.client.getPandasDfFromDs(data) + +Method to convert spark dataset to pandas dataframe + +* **Parameters:** + **data** (*DataSet*) – provide spark dataset +* **Returns:** + converted pandas dataframe +* **Return type:** + DataFrame + +### zingg.client.getSparkContext() + +### zingg.client.getSparkSession() + +### zingg.client.getSqlContext() + +### zingg.client.initClient() + +### zingg.client.initDataBricksConectClient() + +### zingg.client.initSparkClient() + +### zingg.client.parseArguments(argv) + +This method is used for checking mandatory arguments and creating an arguments list from Command line arguments + +* **Parameters:** + **argv** (*List*) – Values that are passed during the calling of the program along with the calling statement. +* **Returns:** + a list containing necessary arguments to run any phase +* **Return type:** + List + + + +## zingg.pipes + +This module is submodule of zingg to work with different types of Pipes. Classes of this module inherit the Pipe class, and use that class to create many different types of pipes. + +### *class* zingg.pipes.BigQueryPipe(name) + +Bases: [`Pipe`](#zingg.pipes.Pipe) + +Pipe Class for working with BigQuery pipeline + +* **Parameters:** + **name** (*String*) – name of the pipe. 
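An illustrative construction of a BigQuery pipe; every value below is a placeholder:

```python
from zingg.pipes import BigQueryPipe

bq = BigQueryPipe("customersBQ")
bq.setCredentialFile("/path/to/service-account-key.json")  # same key file referenced by GOOGLE_APPLICATION_CREDENTIALS
bq.setTable("myproject.mydataset.customers")
bq.setViewsEnabled(True)                    # must be true when reading source data
bq.setTemporaryGcsBucket("my-temp-bucket")  # needed when writing output to BigQuery
```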
+ +#### CREDENTIAL_FILE *= 'credentialsFile'* + +#### TABLE *= 'table'* + +#### TEMP_GCS_BUCKET *= 'temporaryGcsBucket'* + +#### VIEWS_ENABLED *= 'viewsEnabled'* + +#### setCredentialFile(file) + +Method to set Credential file to the pipe + +* **Parameters:** + **file** (*String*) – credential file name + +#### setTable(table) + +Method to set Table to the pipe + +* **Parameters:** + **table** (*String*) – provide table parameter + +#### setTemporaryGcsBucket(bucket) + +Method to set TemporaryGcsBucket to the pipe + +* **Parameters:** + **bucket** (*String*) – provide bucket parameter + +#### setViewsEnabled(isEnabled) + +Method to set if viewsEnabled parameter is Enabled or not + +* **Parameters:** + **isEnabled** (*Bool*) – provide boolean parameter which defines if viewsEnabled option is enable or not + +### *class* zingg.pipes.CsvPipe(name, location=None, schema=None) + +Bases: [`Pipe`](#zingg.pipes.Pipe) + +Class CsvPipe: used for working with text files which uses a pipe symbol to separate units of text that belong in different columns. + +* **Parameters:** + * **name** (*String*) – name of the pipe. + * **location** (*String* *or* *None*) – (optional) location from where we read data + * **schema** (*Schema* *or* *None*) – (optional) json schema for the pipe + +#### setDelimiter(delimiter) + +This method is used to define delimiter of CsvPipe + +* **Parameters:** + **delimiter** (*String*) – a sequence of one or more characters for specifying the boundary between separate, independent regions in data streams + +#### setHeader(header) + +Method to set header property of pipe + +* **Parameters:** + **header** (*Boolean*) – true if pipe have header, false otherwise + +#### setLocation(location) + +Method to set location of pipe + +* **Parameters:** + **location** (*String*) – location from where we read data + +### *class* zingg.pipes.InMemoryPipe(name, df=None) + +Bases: [`Pipe`](#zingg.pipes.Pipe) + +Pipe Class for working with InMemory pipeline + +* **Parameters:** + * **name** (*String*) – name of the pipe + * **df** (*Dataset* *or* *None*) – provide dataset for this pipe (optional) + +#### getDataset() + +Method to get Dataset from pipe + +* **Returns:** + dataset of the pipe in the format of spark dataset +* **Return type:** + Dataset + +#### setDataset(df) + +Method to set DataFrame of the pipe + +* **Parameters:** + **df** (*DataFrame*) – pandas or spark dataframe for the pipe + +### *class* zingg.pipes.Pipe(name, format) + +Bases: `object` + +Pipe class for working with different data-pipelines. Actual pipe def in the args. One pipe can be used at multiple places with different tables, locations, queries, etc + +* **Parameters:** + * **name** (*String*) – name of the pipe + * **format** (*Format*) – formate of pipe e.g. bigquery,InMemory, etc. 
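A minimal sketch of assembling a generic pipe by adding properties one at a time with addProperty() (documented just below); the format and property values are placeholders, and the properties are ultimately handed to spark.read / spark.write:

```python
from zingg.pipes import Pipe

p = Pipe("customers", "csv")
p.addProperty("location", "examples/febrl/test.csv")
p.addProperty("delimiter", ",")
p.addProperty("header", "true")
```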
+ +#### addProperty(name, value) + +Method for adding different properties of pipe + +* **Parameters:** + * **name** (*String*) – name of the property + * **value** (*String*) – value you want to set for the property + +#### getPipe() + +Method to get Pipe + +* **Returns:** + pipe parameter values in the format of a list of string +* **Return type:** + [Pipe](#zingg.pipes.Pipe) + +#### setSchema(s) + +Method to set pipe schema value + +* **Parameters:** + **s** (*Schema*) – json schema for the pipe + +#### toString() + +Method to get pipe parameter values + +* **Returns:** + pipe information in list format +* **Return type:** + List[String] + +### *class* zingg.pipes.SnowflakePipe(name) + +Bases: [`Pipe`](#zingg.pipes.Pipe) + +Pipe Class for working with Snowflake pipeline + +* **Parameters:** + **name** (*String*) – name of the pipe + +#### DATABASE *= 'sfDatabase'* + +#### DBTABLE *= 'dbtable'* + +#### PASSWORD *= 'sfPassword'* + +#### SCHEMA *= 'sfSchema'* + +#### URL *= 'sfUrl'* + +#### USER *= 'sfUser'* + +#### WAREHOUSE *= 'sfWarehouse'* + +#### setDatabase(db) + +Method to set Database to the pipe + +* **Parameters:** + **db** (*Database*) – provide Database parameter. + +#### setDbTable(dbtable) + +description + +* **Parameters:** + **dbtable** (*String*) – provide bucket parameter. + +#### setPassword(passwd) + +Method to set Password to the pipe + +* **Parameters:** + **passwd** (*String*) – provide Password parameter. + +#### setSFSchema(schema) + +Method to set Schema to the pipe + +* **Parameters:** + **schema** (*Schema*) – provide schema parameter. + +#### setURL(url) + +Method to set url to the pipe + +* **Parameters:** + **url** (*String*) – provide url for this pipe + +#### setUser(user) + +Method to set User to the pipe + +* **Parameters:** + **user** (*String*) – provide User parameter. + +#### setWarehouse(warehouse) + +Method to set warehouse parameter to the pipe + +* **Parameters:** + **warehouse** (*String*) – provide warehouse parameter. diff --git a/docs/runIncremental.md b/docs/runIncremental.md new file mode 100644 index 000000000..cc7391d0b --- /dev/null +++ b/docs/runIncremental.md @@ -0,0 +1,110 @@ +--- +description: >- + Building a continuosly updated identity graph with new, updated and deleted + records +--- + +# Adding Incremental Data + +[Zingg Enterprise Feature](#user-content-fn-1)[^1] + +Rerunning matching on entire datasets is wasteful, and we lose the lineage of matched records against a persistent identifier. Using the[ incremental flow](https://www.learningfromdata.zingg.ai/p/zingg-incremental-flow) feature in [Zingg Enterprise](https://www.zingg.ai/company/zingg-enterprise), incremental loads can be run to match existing pre-resolved entities. The new and updated records are matched to existing clusters, and new persistent [**ZINGG\_IDs**](https://www.learningfromdata.zingg.ai/p/hello-zingg-id) are generated for records that do not find a match. If a record gets updated and Zingg Enterprise discovers that it is a more suitable match with another cluster, it will be reassigned. Cluster assignment, merge, and unmerge happens automatically in the flow. Zingg Enterprise also takes care of human feedback on previously matched data to ensure that it does not override the approved records. 
+ +### The incremental phase is run as follows: + +`./scripts/zingg.sh --phase runIncremental --conf ` + +### Example incrementalConf.json: + +```json +{ + "config" : "config.json", + "incrementalData": [{ + "name":"customers_incr", + "format":"csv", + "props": { + "location": "test-incr.csv", + "delimiter": ",", + "header":false + }, + "schema": "recId string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string" + } + ] +} +``` + +### runIncremental can also be triggered using Python by invoking: + +`./scripts/zingg.sh --run examples/FebrlExample.py` + +#### Python Code Example: + +```{python} +#import the packages + +from zingg.client import * +from zingg.pipes import * +from zinggEC.enterprise.common.ApproverArguments import * +from zinggEC.enterprise.common.IncrementalArguments import * +from zinggEC.enterprise.common.epipes import * +from zinggEC.enterprise.common.EArguments import * +from zinggEC.enterprise.common.EFieldDefinition import EFieldDefinition +from zinggES.enterprise.spark.ESparkClient import EZingg +import os + +#build the arguments for zingg +args = EArguments() +#set field definitions +recId = EFieldDefinition("recId", "string", MatchType.DONT_USE) +recId.setPrimaryKey(True) +fname = EFieldDefinition("fname", "string", MatchType.FUZZY) +lname = EFieldDefinition("lname", "string", MatchType.FUZZY) +stNo = EFieldDefinition("stNo", "string", MatchType.FUZZY) +add1 = EFieldDefinition("add1","string", MatchType.FUZZY) +add2 = EFieldDefinition("add2", "string", MatchType.FUZZY) +city = EFieldDefinition("city", "string", MatchType.FUZZY) +areacode = EFieldDefinition("areacode", "string", MatchType.FUZZY) +state = EFieldDefinition("state", "string", MatchType.FUZZY) +dob = EFieldDefinition("dob", "string", MatchType.FUZZY) +ssn = EFieldDefinition("ssn", "string", MatchType.FUZZY) + +fieldDefs = [recId, fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn] +args.setFieldDefinition(fieldDefs) +#set the modelid and the zingg dir +args.setModelId("100") +args.setZinggDir("/tmp/models") +args.setNumPartitions(4) +args.setLabelDataSampleSize(0.5) + +#reading dataset into inputPipe and settint it up in 'args' +schema = "recId string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" +inputPipe = ECsvPipe("testFebrl", "examples/febrl/test.csv", schema) +args.setData(inputPipe) + +outputPipe = ECsvPipe("resultFebrl", "/tmp/febrlOutput") +outputPipe.setHeader("true") + +args.setOutput(outputPipe) + +#Run findAndLabel +options = ClientOptions([ClientOptions.PHASE,"findAndLabel"]) +zingg = EZingg(args, options) +zingg.initAndExecute() + +#Run trainMatch after above completes +options = ClientOptions([ClientOptions.PHASE,"trainMatch"]) +zingg = EZingg(args, options) +zingg.initAndExecute() + +#Now run incremental on output generated above +incrArgs = IncrementalArguments() +incrArgs.setParentArgs(args) +incrPipe = ECsvPipe("testFebrlIncr", "examples/febrl/test-incr.csv", schema) +incrArgs.setIncrementalData(incrPipe) + +options = ClientOptions([ClientOptions.PHASE,"runIncremental"]) +zingg = EZingg(incrArgs, options) +zingg.initAndExecute() +``` + +[^1]: Zingg Enterprise is an advance version of Zingg Community with production grade features diff --git a/docs/running/aws.md b/docs/running/aws.md index 55801167a..358a53574 100644 --- a/docs/running/aws.md +++ b/docs/running/aws.md @@ -1,15 +1,19 @@ --- -layout: 
default title: Running on AWS parent: Running Zingg on Cloud nav_order: 5 --- -## Running on AWS Elastic Map Reduce -One option is to use the spark-submit option with the Zingg config and phase. +# Running On AWS - aws emr create-cluster --name "Add Spark Step Cluster" --release-label emr-6.2.0 --applications Name=Zingg \ - --ec2-attributes KeyName=myKey --instance-type --instance-count \ - --steps Type=Spark,Name="Zingg",ActionOnFailure=CONTINUE,Args=[--class,zingg.client.Client,,--phase,,--conf,] --use-default-roles```` +One option is to use the `spark-submit` option with the Zingg config and phase. Please note that the **config.json** should be available locally at the driver for Zingg to use it. + +````` +aws emr create-cluster --name "Add Spark Step Cluster" --release-label emr-6.2.0 --applications Name=Zingg \ +--ec2-attributes KeyName=myKey --instance-type --instance-count \ +--steps Type=Spark,Name="Zingg",ActionOnFailure=CONTINUE,Args=[--class,zingg.spark.client.SparkClient,,--phase,,--conf,] --use-default-roles```` +````` + +A step-by-step is provided [here](https://blog.infostrux.com/identity-resolution-with-zingg-ai-snowflake-and-aws-emr-for-the-canadian-football-league-22cf0850ab53). The guide mentions training locally using Zingg Docker, but the **findTrainingData** and **label** phases can be executed on EMR directly. A second option is to run Zingg Python code in [AWS EMR Notebooks](https://aws.amazon.com/emr/features/notebooks/) diff --git a/docs/scoring.md b/docs/scoring.md index aa52a8764..b3bc8790a 100644 --- a/docs/scoring.md +++ b/docs/scoring.md @@ -2,13 +2,13 @@ nav_order: 6 --- -# Output Scores +# Interpreting Output Scores -For each field(FNAME, NAME, EMAIL..), Zingg computes multiple features and feeds them to a classifier. These features are typically different ways to compare strings. We consider string lengths and their differences, character differences, and which characters actually differed. +For each field(**FNAME, NAME, EMAIL** ..), Zingg computes multiple features and feeds them to a classifier. These features are typically different ways to compare strings. We consider string lengths and their differences, character differences, and which characters _actually_ differed. -* The shorter string pair ABCD and ABCE will be less similar than ABCDEF and ABCEEF. +* The shorter string pair ABCD and ABCE will be _less similar_ than ABCDEF and ABCEEF. * Common typos, for e.g. m instead of n, are penalized less severely and will be scored higher than a word with a replaced by non-obvious character b. -* The more changes you need to make to the strings to make them match, the less similar they will be. +* The _more changes_ you need to make to the strings to make them match, the less similar they will be. * Differences in the middle are penalized more than prefixes and suffixes. No individual feature is perfect, but the whole is greater than the sum of its parts. That's where the strength of Zingg matching and the accuracy comes from. @@ -19,6 +19,6 @@ The threshold is automatically chosen so that you can pick up most of the result A few things to keep in mind while interpreting the scores: -* Matching is transitive, so if record A matches Record B and Record B matches C, we put records A, B, and C in the same cluster. That is how different records in a cluster get matched at various confidence levels to the rest and all these records showed up together. 
-* Our recommendation is to keep a threshold below x% of the max score as suspect records - manual review/ to be visited later/flow through but will less confidence. x will depend on how accurate you find the results and how much you want to control the outcome. -* Keep cluster size above 4 or 5 for inspection. You could keep it irrespective of the score, or look at only those clusters whose z\_minScore is 0. This will depend on what the results look like to you. +* Matching is _transitive_, so if record A matches Record B and Record B matches C, we put records A, B, and C in the same cluster. That is how different records in a cluster get matched at various confidence levels to the rest and all these records showed up together. +* Our recommendation is to keep a threshold below **x**% of the max score as suspect records - manual review/to be visited later/flow through but will lessen confidence. **x** will depend on how accurate you find the results and how much you want to control the outcome. +* Keep cluster size above 4 or 5 for inspection. You could keep it irrespective of the score, or look at only those clusters whose **z\_minScore** is 0. This will depend on what the results look like to you. diff --git a/docs/settingUpZingg.md b/docs/settingUpZingg.md index 9bfef4453..b91a9d454 100644 --- a/docs/settingUpZingg.md +++ b/docs/settingUpZingg.md @@ -1,59 +1,57 @@ -# Setting Zingg Development Environment +# Setting Up Zingg Development Environment -The following steps will help you set up the Zingg Development Environment. While the steps remain the same across different OS, we have provided detailed instructions for Ubuntu OS. Below examples have been created using Ubuntu 22.04.2 LTS +The following steps will help you set up the Zingg Development Environment. While the steps remain the same across different OS, we have provided detailed instructions for Ubuntu OS. \ +\ +The below steps have been created using Ubuntu 22.04.2 LTS -Make sure to update your ubutu installation +Make sure to update your Ubuntu installation: -sudo apt update +`sudo apt update` -**** +**Step 0: Install Ubuntu on WSL2 on Windows** -**Step 0 : Install Ubuntu on WSL2 on Windows** +* Install **wsl**: Type the following command in **Windows PowerShell**. -* Install wsl: Type the following command in **Windows PowerShell**. ``` wsl --install ``` + * Download Ubuntu from **Microsoft Store**, **Ubuntu 20.04 LTS** * Configure Ubuntu with a **username** and **password** * Open **Ubuntu 20.04 LTS** and start working + ``` sudo apt update ``` -* Follow this [tutorial](https://ubuntu.com/tutorials/install-ubuntu-on-wsl2-on-windows-10#1-overview) for more information. -**** +* Follow this [tutorial](https://ubuntu.com/tutorials/install-ubuntu-on-wsl2-on-windows-10#1-overview) for more information. -**Step 1 : Clone the Zingg Repository** +**Step 1: Clone The Zingg Repository** * Install and SetUp Git: **sudo apt install git** * Verify : **git --version** * Set up Git by following the [tutorial](https://www.digitalocean.com/community/tutorials/how-to-install-git-on-ubuntu-20-04). * Clone the Zingg Repository: **git clone https://github.com/zinggAI/zingg.git** -**Note :-** It is suggested to fork the repository to your account and then clone the repository. - -**** - -**Step 2 : Install JDK 11 (Java Development Kit)** +**Note:** It is suggested to fork the repository to your account and then clone the repository. 
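If you do fork first, a typical setup keeps your fork as `origin` and the upstream project as a second remote so you can pull in new changes later. This is a generic Git sketch rather than a Zingg-specific requirement, and `<your-username>` is a placeholder for your GitHub account:

```
git clone https://github.com/<your-username>/zingg.git
cd zingg
git remote add upstream https://github.com/zinggAI/zingg.git
git fetch upstream
```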
-* Follow this [tutorial](https://linuxize.com/post/install-java-on-ubuntu-20-04/) to install Java11 JDK11 in Ubuntu. +**Step 2: Install JDK 1.8 (Java Development Kit)** +* Follow this [tutorial](https://linuxize.com/post/install-java-on-ubuntu-20-04/) to install Java8 JDK1.8 in Ubuntu. * For example: + ``` sudo apt install openjdk-11-jdk openjdk-11-jre javac -version java -version ``` -**** - -**Step 3 : Install Apache Spark -** +**Step 3: Install Apache Spark** * Download Apache Spark - from the [Apache Spark Official Website](https://spark.apache.org/downloads.html). * Install downloaded Apache Spark - on your Ubuntu by following [this tutorial](https://computingforgeeks.com/how-to-install-apache-spark-on-ubuntu-debian/). - * For example for 3.5.0: + ``` wget https://www.apache.org/dyn/closer.lua/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz tar -xvf spark-3.5.0-bin-hadoop3.tgz @@ -61,17 +59,15 @@ rm -rf spark-3.5.0-bin-hadoop3.tgz sudo mv spark-3.5.0-bin-hadoop3 /opt/spark ``` -Make sure that spark version you have installed is compatible with java you have installed, and Zingg is supporting those versions. - -**Note :-** Zingg supports Spark 3.5 and the corresponding Java version. - -**** +Make sure that Spark version you have installed is compatible with Java you have installed, and Zingg is supporting those versions. -**Step 4 : Install Apache Maven** +**Note**: Zingg supports Spark 3.5 and the corresponding Java version. -* Install the latest maven package. +**Step 4: Install Apache Maven** +* Install the latest **maven** package. * For example for 3.8.8: + ``` wget https://dlcdn.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz tar -xvf apache-maven-3.8.8-bin.tar.gz @@ -86,11 +82,10 @@ Maven home: /usr/share/maven Java version: 11.0.23, vendor: Ubuntu, runtime: /usr/lib/jvm/java-11-openjdk-amd64 ``` -**** +**Step 5: Update Environment Variables** -**Step 5 : Update Env Variables** +Open `.bashrc` and add env variables at the end of the file. -* Open .bashrc and add env variables at end of file ``` vim ~/.bashrc export SPARK_HOME=/opt/spark @@ -117,13 +112,12 @@ mvn --version ``` -**Note :-** If you have already set up **JAVA\_HOME** and **SPARK\_HOME** in the steps before you don't need to do this again. +**Note:** If you have already set up **JAVA\_HOME** and **SPARK\_HOME** in the steps before you don't need to do this again. -**** +**Step 6: Compile The Zingg Repository** -**Step 6 : Compile the Zingg Repository** +* Run the following to compile the Zingg Repository - -* Ensure you are on main branch ``` git branch @@ -141,35 +135,18 @@ mvn initialize mvn clean compile package -Dspark=sparkVer -Dmaven.test.skip=true ``` -**Note :-** Replace the **sparkVer** with the version of spark you installed, For example, **-Dspark=3.5** and if still facing error, exclude tests while compiling. - - -**Note :-** substitute 3.3 with profile of the spark version you have installed. This is based on profiles specified in pom.xml -**** - -**Step 7 : If had any issue with 'SPARK\_LOCAL\_IP'** - -* Install **net-tools** -``` -sudo apt-get install -y net-tools -``` - -* Run command in the terminal to get IP address -``` -ifconfig -``` +**Note:** Replace the `sparkVer` with the version of Spark you installed. \ +\ +For example, **-Dspark=3.5** you still face an error, include **-Dmaven.test.skip=true** with the above command. 
-* Paste the IP in **/opt/hosts** IP address of your Pc-Name -**** +**Step 7: If you have any issue with 'SPARK\_LOCAL\_IP'** -**Step 8 : Run Zingg to Find Training Data** +* Install **net-tools** using **sudo apt-get install -y net-tools** +* Run `ifconfig` in the terminal, find the **IP address** and paste the same in **/opt/hosts** IP address of your Pc-Name -* Run this Script in terminal opened in zingg clones directory - -``` -./scripts/zingg.sh --phase findTrainingData --conf examples/febrl/config.json -``` +**Step 8: Run Zingg To Find Training Data** -**** +* Run this script in the terminal opened in Zingg clones directory `./scripts/zingg.sh --phase findTrainingData --conf examples/febrl/config.json` -**If everything is right, it should show Zingg banner.** +**If everything is right, it should show Zingg icon.** diff --git a/docs/setup/hardwareSizing.md b/docs/setup/hardwareSizing.md index 3e69e6244..d13df4d86 100644 --- a/docs/setup/hardwareSizing.md +++ b/docs/setup/hardwareSizing.md @@ -9,15 +9,15 @@ description: Hardware required for different sizes of data Zingg has been built to scale. Performance is dependent on: -* The number of records to be matched. -* The number of fields to be compared against each other. -* The actual number of duplicates. +* The **number of records** to be matched. +* The **number of fields** to be compared against each other. +* The **actual number** of duplicates. Here are some performance numbers you can use to determine the appropriate hardware for your data. -* 120k records of examples/febrl120k/test.csv take 5 minutes to run on a 4 core, 10 GB RAM local Spark cluster. +* 120k records of **examples/febrl120k/test.csv** take 5 minutes to run on a 4 core, 10 GB RAM local Spark cluster. * 5m records of [North Carolina Voters](https://github.com/zinggAI/zingg/tree/main/examples/ncVoters5M) take \~4 hours on a 4 core, 10 GB RAM local Spark cluster. * 9m records with 3 fields - first name, last name, email take 45 minutes to run on AWS m5.24xlarge instance with 96 cores, 384 GB RAM -* 80m records with 8-10 fields took less than 2 hours on 1 driver(128 GB RAM, 32 cores), 8 workers(224 GB RAM, 64 cores). This is a user-reported stat without any optimization. +* 80m records with 8-10 fields took less than 2 hours on 1 driver (128 GB RAM, 32 cores), 8 workers (224 GB RAM, 64 cores). This is a user-reported stat without any optimization. If you have up to a few million records, it may be easier to run Zingg on a single machine in Spark local mode. diff --git a/docs/setup/installation.md b/docs/setup/installation.md index ce5976b77..6fbcd35cc 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -7,7 +7,6 @@ description: From the pre-built release hosted on GitHub # Installation -Zingg runs on [Spark](https://spark.apache.org) and can be used on all major Spark distributions. Zingg can run on all major Linux flavors. - -Zingg can be installed from the [release on github](../stepbystep/installation/installing-from-release/), [built from sources](../stepbystep/installation/compiling-from-source.md) or used [directly through Docker](../stepbystep/installation/docker/). The next sections describe each option in more detail. +Zingg runs on [Spark](https://spark.apache.org) and can be used on all major Spark distributions. Zingg can run on all major Linux flavors. 
+Zingg can be installed from the [release on GitHub](../stepbystep/installation/installing-from-release/), [built from sources](../stepbystep/installation/compiling-from-source.md) or used [directly through Docker](../stepbystep/installation/docker/). The next sections describe each option in more detail. diff --git a/docs/setup/link.md b/docs/setup/link.md index 88140984a..760ca30f6 100644 --- a/docs/setup/link.md +++ b/docs/setup/link.md @@ -1,11 +1,11 @@ -# Linking across datasets +# Linking Across Datasets -In many cases like reference data mastering, enrichment, etc, two individual datasets are duplicates free but they need to be matched against each other. The link phase is used for such scenarios. +In many cases like reference data mastering, enrichment, etc, two individual datasets are duplicate-free but they need to be matched against each other. The link phase is used for such scenarios. `./zingg.sh --phase link --conf config.json` Sample configuration file [configLink.json](../../examples/febrl/configLink.json) is defined at [examples/febrl](https://github.com/zinggAI/zingg/tree/main/examples/febrl). In this option, each record from the first source is matched with all the records from the remaining sources. -The sample output is given in the image below. The linked records are given the same z\_cluster id. The last column (z\_source) in the output tells the source dataset of that record. +The sample output is given in the image below. The linked records are given the same **z\_cluster** id. The last column (**z\_source**) in the output tells the source dataset of that record. -![Link results](../../assets/link.png) +![Link Results](../../assets/link.png) diff --git a/docs/setup/match.md b/docs/setup/match.md index 0e1fd35d7..c4fdc1f93 100644 --- a/docs/setup/match.md +++ b/docs/setup/match.md @@ -1,17 +1,17 @@ --- -layout: default title: Find the matches parent: Step By Step Guide nav_order: 8 --- -### match -Finds the records which match with each other. +# Finding The Matches + +## Finds the records that match with each other. `./zingg.sh --phase match --conf config.json` -As can be seen in the image below, matching records are given the same z_cluster id. Each record also gets a z_minScore and z_maxScore which shows the least/greatest it matched with other records in the same cluster. +As can be seen in the image below, matching records are given the same **z\_cluster** id. Each record also gets a **z\_minScore** and **z\_maxScore** which shows the _least/greatest_ it matched with other records in the same cluster. -![Match results](/assets/match.gif) +![Match results](../../assets/match.gif) -If records across multiple sources have to be matched, the [link phase](./link.md) should be used. +If records across multiple sources have to be matched, the [link phase](link.md) should be used. diff --git a/docs/setup/train.md b/docs/setup/train.md index 413e50aae..f40cc508b 100644 --- a/docs/setup/train.md +++ b/docs/setup/train.md @@ -1,10 +1,14 @@ --- -layout: default title: Build and save the model parent: Step By Step Guide nav_order: 7 --- -### train - training and saving the models -Builds up the Zingg models using the training data from the above phases and writes them to the folder zinggDir/modelId as specified in the config. - ./zingg.sh --phase train --conf config.json +# Building And Saving The Model + +Builds up the Zingg models using the training data from the above phases and writes them to the folder **zinggDir/modelId** as specified in the config. 
+ +``` +./zingg.sh --phase train --conf config.json +``` + diff --git a/docs/setup/training/addOwnTrainingData.md b/docs/setup/training/addOwnTrainingData.md index e05fec60e..9ad57e86f 100644 --- a/docs/setup/training/addOwnTrainingData.md +++ b/docs/setup/training/addOwnTrainingData.md @@ -5,18 +5,18 @@ title: Using preexisting training data grand_parent: Step By Step Guide --- -# Using pre-existing training data +# Using Pre-existing Training Data -## Supplementing Zingg with existing training data +## Supplementing Zingg With Existing Training Data -If you already have some training data that you want to start with, you can use that as well with Zingg. Add an attribute trainingSamples to the config and define the training pairs. +If you already have some training data that you want to start with, you can use that as well with Zingg. Add an attribute **trainingSamples** to the config and define the training pairs. -The training data supplied to Zingg should have a z\_cluster column that groups the records together. The z\_cluster uniquely identies the group. We also need to add the z\_isMatch column which is 1 if the pairs match or 0 if they do not match. The z\_isMatch value has to be same for all the records in the z\_cluster group. They either match with each other or they dont. +The training data supplied to Zingg should have a **z\_cluster** column that groups the records together. The **z\_cluster** uniquely identifies the group. We also need to add the **z\_isMatch** column which is **1** if the pairs _match_ or **0** if they do _not_ match. The **z\_isMatch** value has to be the same for all the records in the **z\_cluster** group. They either match with each other or they don't. -An example is provided in [Github training data](../../../examples/febrl/training.csv). +An example is provided in [GitHub training data](../../../examples/febrl/training.csv). The above training data can be specified using [trainingSamples attribute in the configuration.](../../../examples/febrl/configWithTrainingSamples.json) In addition, labeled data of one model can also be exported and used as training data for another model. For details, check out [exporting labeled data](exportLabeledData.md). -Please note: It is advisable to still run [findTrainingData](findTrainingData.md) and [label](label.md) a few rounds to tune Zingg with the supplied training data as well as patterns it needs to learn independently. +**Note**: It is advisable to still run [findTrainingData](findTrainingData.md) and [label](label.md) a few rounds to tune Zingg with the supplied training data as well as patterns it needs to learn independently. diff --git a/docs/setup/training/createTrainingData.md b/docs/setup/training/createTrainingData.md index 0246ae013..b016c466f 100644 --- a/docs/setup/training/createTrainingData.md +++ b/docs/setup/training/createTrainingData.md @@ -5,6 +5,6 @@ title: Creating training data has_children: true --- -# Training data +# Working With Training Data -Zingg builds models to predict similarity. Training data is needed to build these models. The next sections describe how you can use the Zingg Interactive Labeler to create the training data. +Zingg builds models to predict _similarity_. Training data is needed to build these models. The next sections describe how you can use the **Zingg Interactive Labeler** to create the training data. 
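Whether the labeled pairs come from the interactive labeler or are supplied as pre-existing training data through **trainingSamples**, they share the same pair-wise shape: records in the same **z\_cluster** group carry the same **z\_isMatch** value. The sketch below is purely illustrative: the column set mirrors a few febrl example fields and the values are made up; see the febrl **training.csv** referenced above for a complete file:

```
z_cluster,z_isMatch,fname,lname,city
0,1,thomas,george,brisbane
0,1,tomas,george,brisbane
1,0,jackson,eglinton,perth
1,0,jackson,howell,perth
```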
diff --git a/docs/setup/training/findAndLabel.md b/docs/setup/training/findAndLabel.md index 152a6b66e..152d78fc3 100644 --- a/docs/setup/training/findAndLabel.md +++ b/docs/setup/training/findAndLabel.md @@ -7,8 +7,8 @@ nav_order: 2 # Find And Label -This phase is composed of two phases namely [findTrainingData](findTrainingData.md) and [label](label.md). This will help experienced users to quicken the process of creating Training data. +This phase is composed of two phases namely [findTrainingData](findTrainingData.md) and [label](label.md). This will help experienced users to quicken the process of creating training data. `./zingg.sh --phase findAndLabel --conf config.json` -As this is phase runs findTrainingData and label together, it should be run only for small datasets where findTrainingData takes a short time to run, else the the user will have to wait long for the console for labeling. +As this is phase runs **findTrainingData** and **label** together, it should be run only for small datasets where **findTrainingData** takes a short time to run, else the the user will have to wait long for the console to label. diff --git a/docs/setup/training/findTrainingData.md b/docs/setup/training/findTrainingData.md index 5430b50b9..e3e04437f 100644 --- a/docs/setup/training/findTrainingData.md +++ b/docs/setup/training/findTrainingData.md @@ -7,10 +7,10 @@ description: pairs of records that could be similar to train Zingg # Finding Records For Training Set Creation -The findTrainingData phase prompts Zingg to search for edge cases in the data which can be labeled by the user and used for learning. During this phase, Zingg combs through the data samples and judiciously selects limited representative pairs which can be marked by the user. Zingg is very frugal about the training so that user effort is minimized and models can be built and deployed quickly. +The **findTrainingData** phase prompts Zingg to search for edge cases in the data which can be labeled by the user and used for learning. During this phase, Zingg combs through the data samples and judiciously selects limited representative pairs that can be marked by the user. Zingg is very frugal about the training so that user effort is minimized and models can be built and deployed quickly. -This findTrainingData job writes the edge cases to the folder configured through zinggDir/modelId in the config. +This **findTrainingData** job writes the edge cases to the folder configured through `zinggDir/modelId` in the config: `./zingg.sh --phase findTrainingData --conf config.json` -The findTrainingData phase is run first and then the label phase is run and this cycle is repeated so that the Zingg models get smarter from user feedback. +The **findTrainingData** phase is run first and then the label phase is run and this cycle is repeated so that the Zingg models get smarter from user feedback. diff --git a/docs/setup/training/label.md b/docs/setup/training/label.md index 4081f08b5..77dbf1123 100644 --- a/docs/setup/training/label.md +++ b/docs/setup/training/label.md @@ -7,12 +7,12 @@ description: Providing user feedback on the training pairs # Labeling Records -The label phase opens an interactive learner where the user can mark the pairs found by findTrainingData phase as matches or non-matches. The findTrainingData phase generates edge cases for labeling and the label phase helps the user mark them. 
+The label phase opens an _interactive_ learner where the user can mark the pairs found by **findTrainingData** phase as matches or non-matches. The **findTrainingData** phase generates edge cases for labeling and the label phase helps the user mark them. `./zingg.sh --phase label --conf config.json ` ![Shows records and asks user to mark yes, no, can't say on the cli.](../../../assets/label.gif) -Proceed running findTrainingData followed by label phases till you have at least 30-40 positives, or when you see the predictions by Zingg converging with the output you want. At each stage, the user will get different variations of attributes across the records. Zingg performs pretty well with even a small number of training, as the samples to be labeled are chosen by the algorithm itself. +Proceed running **findTrainingData** followed by label phases till you have at least 30-40 positives, or when you see the predictions by Zingg converging with the output you want. At each stage, the user will get different variations of attributes across the records. Zingg performs pretty well with even a small number of training, as the samples to be labeled are chosen by the algorithm itself. -The showConcise flag when passed to the Zingg command line only shows fields which are NOT DONT\_USE +The **showConcise** flag when passed to the Zingg command line only shows fields which are **NOT DONT\_USE** diff --git a/docs/stepByStep.md b/docs/stepByStep.md index f64352dc2..d11f06d04 100644 --- a/docs/stepByStep.md +++ b/docs/stepByStep.md @@ -8,23 +8,23 @@ description: Instructions on how to install and use Zingg Installation instructions for docker, as well as GitHub release, are [here](setup/installation.md). If you need to build from the sources or compile for a different flavor of Spark, check [compiling](setup/installation.md#compiling-from-sources). -## Step 2: Plan for Hardware +## Step 2: Plan For Hardware Decide your hardware based on the [performance numbers](setup/hardwareSizing.md). -## Step 3: Build the config for your data +## Step 3: Build The Config For Your Data Zingg needs a configuration file that defines the data and what kind of matching is needed. You can create the configuration file by following the instructions [here](stepbystep/configuration/). -## Step 4: Create the training data +## Step 4: Create Training Data Zingg builds a new set of models(blocking and similarity) for every new schema definition(columns and match types). This means running the _findTrainingData_ and _label_ phases multiple times to build the training dataset from which Zingg will learn. You can read more [here](setup/training/createTrainingData.md). -## Step 5: Build and save the model +## Step 5: Build & Save The Model The training data in Step 4 above is used to train Zingg and build and save the models. This is done by running the _train_ phase. Read more [here](setup/train.md). -## Step 6: Voila, let's match! +## Step 6: Voila, Let's Match! It's now time to apply the model to our data. This is done by running the _match_ or the _link_ phases depending on whether you are matching within a single source or linking multiple sources respectively. You can read more about [matching](setup/match.md) and [linking](setup/link.md). 
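Taken together, a typical end-to-end run on the bundled febrl example looks like the sequence below. This is a sketch assuming a local installation and the example configuration shipped with the repository; the first two phases are repeated until you have enough labeled pairs (roughly 30-40 positives):

```
# repeat until enough pairs are labeled
./scripts/zingg.sh --phase findTrainingData --conf examples/febrl/config.json
./scripts/zingg.sh --phase label --conf examples/febrl/config.json

# build and save the models
./scripts/zingg.sh --phase train --conf examples/febrl/config.json

# apply the models within a single source (use --phase link across sources)
./scripts/zingg.sh --phase match --conf examples/febrl/config.json
```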
diff --git a/docs/stepbystep/configuration/README.md b/docs/stepbystep/configuration/README.md index 23417002c..28a1ee262 100644 --- a/docs/stepbystep/configuration/README.md +++ b/docs/stepbystep/configuration/README.md @@ -4,11 +4,8 @@ description: JSON configuration file required to work with Zingg # Configuration -Zingg comes with a command-line script that invokes spark-submit. This script needs a JSON configuration file to define the input data and match types, location of training data, models, and output. +Zingg comes with a command-line script that invokes `spark-submit`. This script needs a **JSON configuration** file to define the input data and match types, location of training data, models, and output. Sample configuration files are defined at [examples/febrl](https://github.com/zinggAI/zingg/tree/main/examples/febrl) and [examples/febrl120k](https://github.com/zinggAI/zingg/tree/main/examples/febrl120k). The following sections describe the different JSON attributes of the configuration to get Zingg to work with your data. - - - diff --git a/docs/stepbystep/configuration/configuring-through-environment-variables.md b/docs/stepbystep/configuration/configuring-through-environment-variables.md index 540f15385..444806dc8 100644 --- a/docs/stepbystep/configuration/configuring-through-environment-variables.md +++ b/docs/stepbystep/configuration/configuring-through-environment-variables.md @@ -4,7 +4,7 @@ description: Passing Configuration value through the system environment variable # Configuring Through Environment Variables -If a user does not want to pass the value of any JSON parameter through the config file for security reasons or otherwise, they can configure that value through the system environment variable. The system variable name needs to be put in the config file in place of its JSON value. At runtime, the config file gets updated with the value of the environment variable. +If a user does not want to pass the value of any JSON parameter through the config file for security reasons or otherwise, they can configure that value through the _system environment variable_. The system variable name needs to be put in the config file in place of its JSON value. At runtime, the config file gets updated with the value of the environment variable. Below is the config file snippet that references a few environment variables. @@ -27,4 +27,4 @@ Below is the config file snippet that references a few environment variables. "collectMetrics": $collectMetrics$ ``` -Environment variables must be enclosed within dollar signs **$var$** to take effect. Also, the config file name must be suffixed with \***.env**. As usual, String variables need to be put within quotes **"$var$"**, Boolean and Numeric values should be put without quotes **$var$**. +Environment variables must be enclosed within dollar signs **$var$** to take effect. Also, the config file name must be _suffixed_ with \***.env**. As usual, `String`variables need to be put _within_ quotes **"$var$"**, `Boolean` and `Numeric` values should be put _without_ quotes **$var$**. diff --git a/docs/stepbystep/configuration/field-definitions.md b/docs/stepbystep/configuration/field-definitions.md index 6c0376983..a7cea17ec 100644 --- a/docs/stepbystep/configuration/field-definitions.md +++ b/docs/stepbystep/configuration/field-definitions.md @@ -10,27 +10,27 @@ description: >- This is a JSON array representing the fields from the source data to be used for matching, and the kind of matching they need. 
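For instance, the attributes described next typically appear together in the configuration roughly as shown below. This sketch is modeled on the febrl example config; the enclosing `fieldDefinition` key and the exact casing or quoting of values may vary slightly between Zingg versions:

```json
"fieldDefinition": [
  {
    "fieldName": "fname",
    "fields": "fname",
    "dataType": "string",
    "matchType": "FUZZY"
  },
  {
    "fieldName": "ssn",
    "fields": "ssn",
    "dataType": "string",
    "matchType": "EXACT"
  }
]
```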
-Each field denotes a column from the input. Fields have the following JSON attributes: +Each field denotes a **column** from the input. Fields have the following JSON attributes: **fieldName** -The name of the field from the input data schema +The **name** of the field from the input data schema **fields** -To be defined later. For now, please keep this as the fieldName +To be defined later. For now, please keep this as the `fieldName` **dataType** -Type of the column - string, integer, double, etc. +Type of the column - `string, integer, double`, etc. **matchType** - The way to match the given field. Multiple match types, separated by commas, can also be used. For example FUZZY,NUMERIC. Here are the different types supported. +The way to match the given field. Multiple match types, separated by commas, can also be used. For example **FUZZY**, **NUMERIC**. Here are the different types supported: #### showConcise -| Match Type | Description | Can be applied to | +| Match Type | Description | Applicable To | | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- | | FUZZY | Broad matches with typos, abbreviations, and other variations. | string, integer, long, double, date | | EXACT | No tolerance with variations, Preferable for country codes, pin codes, and other categorical variables where you expect no variations. | string, integer, long, date | @@ -43,5 +43,4 @@ Type of the column - string, integer, double, etc. | NUMERIC\_WITH\_UNITS | extracts product codes or numbers with units, for example 16gb from strings and compares how many are same across both strings | string | | ONLY\_ALPHABETS\_EXACT | only looks at the alphabetical characters and compares if they are exactly the same. when the numbers inside strings do not matter, for example if you are looking at buildings but want to ignore flat numbers | string | | ONLY\_ALPHABETS\_FUZZY | ignores any numbers in the strings and then does a fuzzy comparison, useful for fields like addresses with typos where you want to look at street number separately using NUMERIC | string | - #### diff --git a/docs/stepbystep/configuration/model-location.md b/docs/stepbystep/configuration/model-location.md index b5a5ae0cf..7aa86e685 100644 --- a/docs/stepbystep/configuration/model-location.md +++ b/docs/stepbystep/configuration/model-location.md @@ -2,8 +2,8 @@ #### zinggDir -The location where trained models will be saved. Defaults to /tmp/zingg +The **location** where trained models will be saved. Defaults to `/tmp/zingg` #### modelId -An identifier for the model. You can train multiple models - say, one for customers matching names, age, and other personal details and one for households matching addresses. Each model gets saved under zinggDir/modelId +An **identifier** for the model. You can train multiple models - say, one for **customers** matching _names_, _age_, and other personal details and one for **households** matching _addresses_. Each model gets saved under `zinggDir/modelId` diff --git a/docs/stepbystep/configuration/telemetry.md b/docs/stepbystep/configuration/telemetry.md index 486714f5a..71d3ac25a 100644 --- a/docs/stepbystep/configuration/telemetry.md +++ b/docs/stepbystep/configuration/telemetry.md @@ -10,4 +10,4 @@ Application captures a few measurements for runtime metrics such as _no. 
of data **Zingg does not capture any user data or input data and will never do so.** -This feature may be disabled by setting this flag to false. The default value is true. For details, refer to [Security And Privacy](../../security.md). +This feature may be disabled by setting this flag to **false**. The default value is **true**. For details, refer to [Security And Privacy](../../security.md). diff --git a/docs/stepbystep/configuration/tuning-label-match-and-link-jobs.md b/docs/stepbystep/configuration/tuning-label-match-and-link-jobs.md index e4553f543..4ca6ea578 100644 --- a/docs/stepbystep/configuration/tuning-label-match-and-link-jobs.md +++ b/docs/stepbystep/configuration/tuning-label-match-and-link-jobs.md @@ -6,4 +6,4 @@ The number of Spark partitions over which the input data is distributed. Keep it #### labelDataSampleSize -Fraction of the data to be used for training the models. Adjust it between 0.0001 and 0.1 to keep the sample size small enough so that it finds enough edge cases fast. If the size is bigger, the findTrainingData job will spend more time combing through samples. If the size is too small, Zingg may not find the right edge cases. +Fraction of the data to be used for training the models. Adjust it between 0.0001 and 0.1 to keep the sample size small enough so that it finds enough edge cases fast. If the size is bigger, the `findTrainingData` job will spend more time combing through samples. If the size is too small, Zingg may not find the right edge cases. diff --git a/docs/stepbystep/installation/compiling-from-source.md b/docs/stepbystep/installation/compiling-from-source.md index d59f15e3b..3f02588b4 100644 --- a/docs/stepbystep/installation/compiling-from-source.md +++ b/docs/stepbystep/installation/compiling-from-source.md @@ -6,9 +6,8 @@ description: For a different Spark version or compiling latest code If you need to compile the latest code or build for a different Spark version, you can clone this repo and -* Install maven -* Install JDK 11 -* Set JAVA\_HOME to JDK base directory -* Run the following: `mvn initialize` and then `mvn clean compile package` - +* Install **maven** +* Install **JDK 1.8** +* Set `JAVA_HOME` to JDK base directory +* Run the following: `mvn initialize` and then `mvn clean compile package` diff --git a/docs/stepbystep/installation/docker/README.md b/docs/stepbystep/installation/docker/README.md index afce0e9ac..1cb0ede45 100644 --- a/docs/stepbystep/installation/docker/README.md +++ b/docs/stepbystep/installation/docker/README.md @@ -4,7 +4,7 @@ description: From pre-built Docker image with all dependencies included # Docker -## Running from Docker image from Docker hub +## Running From Docker Image From Docker Hub The easiest way to get started is to pull the Docker image with the last release of Zingg. @@ -17,6 +17,6 @@ In case of permission denied, try mapping /tmp of docker with user's machine /tm docker run -v /tmp:/tmp -it zingg/zingg:0.4.0 bash ``` -To know more about Docker, please refer to the official [docker documentation](https://docs.docker.com/). +To know more about Docker, please refer to the official [Docker documentation](https://docs.docker.com/). 
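Once inside the container shell, Zingg phases are invoked exactly as on a regular installation. A minimal sketch, assuming the image starts you in the Zingg installation directory and you use the bundled febrl example (with /tmp mounted as shown above so that output survives the container):

```
docker run -v /tmp:/tmp -it zingg/zingg:0.4.0 bash
# inside the container
./scripts/zingg.sh --phase findTrainingData --conf examples/febrl/config.json
```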
## diff --git a/docs/stepbystep/installation/docker/copying-files-to-and-from-the-container.md b/docs/stepbystep/installation/docker/copying-files-to-and-from-the-container.md index 57d5cd355..60f70e0a2 100644 --- a/docs/stepbystep/installation/docker/copying-files-to-and-from-the-container.md +++ b/docs/stepbystep/installation/docker/copying-files-to-and-from-the-container.md @@ -2,9 +2,9 @@ description: Alternative to volume/bind mount --- -# Copying Files To and From the Container +# Copying Files To And From The Container -A quick alternative to **Volume/bind Mount** is to just copy necessary files to and forth between the host and the container. +A quick alternative to **Volume/Bind Mount** is to just copy necessary files to and forth between the host and the container. One specific file/directory can be copied TO and FROM the container. e.g. @@ -13,9 +13,8 @@ $ docker cp foo.txt :/foo.txt $ docker cp :/foo.txt foo.txt ``` -The container id of the running instance can be found using the below command. +The _container id_ of the running instance can be found using the below command: ``` $ docker ps ``` - diff --git a/docs/stepbystep/installation/docker/file-read-write-permissions.md b/docs/stepbystep/installation/docker/file-read-write-permissions.md index a422f3b6a..472b10dcb 100644 --- a/docs/stepbystep/installation/docker/file-read-write-permissions.md +++ b/docs/stepbystep/installation/docker/file-read-write-permissions.md @@ -2,9 +2,9 @@ description: To enable user to have create/read/write files in shared location --- -# File read/write permissions +# File Read/Write Permissions -A docker image is preferred to run with a non-root user. By default, the Zingg container runs with uid '1001'. A valid 'uid' can be passed through the command line in order to run the container with that user id. This will enable the user to have requisite permissions to create/read/write files in the shared location. +A docker image is preferred to run with a non-root user. By default, the Zingg container runs with **uid '1001'**. A valid '_uid_' can be passed through the command line in order to run the container with that user id. This will enable the user to have requisite permissions to create/read/write files in the shared location. ``` $ id diff --git a/docs/stepbystep/installation/docker/shared-locations.md b/docs/stepbystep/installation/docker/shared-locations.md index e580b84de..ecf9f7c4b 100644 --- a/docs/stepbystep/installation/docker/shared-locations.md +++ b/docs/stepbystep/installation/docker/shared-locations.md @@ -2,14 +2,14 @@ description: Shared location used to store Zingg configurations --- -# Shared locations +# Shared Locations -## Zingg configurations using shared location +## Zingg Configurations Using Shared Location -The **zinggDir** location where model information is stored may use a shared location. In fact, any oft-editable file such as config.json should be kept in this location only. +The **zinggDir** location where model information is stored may use a shared location. In fact, any _oft-editable_ file such as **config.json** should be kept in this location only. ``` zingg.sh --phase label --conf config.json --zinggDir /location ``` -Similarly, the output and data dir [configurations](../../../stepbystep/configuration) inside config.json can be made using a shared location. Please ensure that the running user has access permissions for this location. 
+Similarly, the output and data dir [configurations](../../configuration/) inside **config.json** can be made using a shared location. Please ensure that the running user has access permissions for this location. diff --git a/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md b/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md index c81fd3c95..73d609c17 100644 --- a/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md +++ b/docs/stepbystep/installation/docker/sharing-custom-data-and-config-files.md @@ -2,9 +2,9 @@ description: Using custom data to save data files on host machine --- -# Sharing custom data and config files +# Sharing Custom Data And Config Files -However, note that once the docker container is stopped, all the work done in that session is lost. If we want to use custom data or persist the generated model or data files, we have to use **Volumes** or **Bind mount** to share files between the two. +However, note that once the docker container is stopped, all the work done in that session is lost. If we want to use custom data or persist the generated model or data files, we have to use **Volumes** or **Bind Mount** to share files between the two. ``` docker run -v : -it zingg/zingg:0.4.1-SNAPSHOT bash diff --git a/docs/stepbystep/installation/installing-from-release/README.md b/docs/stepbystep/installation/installing-from-release/README.md index 41df9e958..860da3c21 100644 --- a/docs/stepbystep/installation/installing-from-release/README.md +++ b/docs/stepbystep/installation/installing-from-release/README.md @@ -4,7 +4,7 @@ description: From the pre-built release hosted on GitHub # Installing From Release -Zingg is prebuilt for common Spark versions so that you can use those directly. The following document assumes that we are installing Zingg 0.3 on Spark 3.1.2, but you can follow the same process for other versions too. +Zingg is prebuilt for common Spark versions so that you can use those directly. The following document assumes that we are installing **Zingg 0.3** on **Spark 3.1.2**, but you can follow the same process for other versions too. ## Prerequisites @@ -12,9 +12,6 @@ A) Java JDK - version "11.0.23" B) Apache Spark - version spark-3.5.0-bin-hadoop3 -**** - - +*** #### - diff --git a/docs/stepbystep/installation/installing-from-release/installing-zingg.md b/docs/stepbystep/installation/installing-from-release/installing-zingg.md index 3308f6a47..e39f05e70 100644 --- a/docs/stepbystep/installation/installing-from-release/installing-zingg.md +++ b/docs/stepbystep/installation/installing-from-release/installing-zingg.md @@ -6,14 +6,14 @@ description: Downloading and setting things up Download the tar zingg-version.tar.gz from the [Zingg releases page](https://github.com/zinggAI/zingg/releases) to a folder of your choice and run the following: -> gzip -d zingg-0.4.1-SNAPSHOT.tar.gz ; tar xvf zingg-0.4.1-SNAPSHOT.tar +> `gzip -d zingg-0.4.0.tar.gz ; tar xvf zingg-0.4.0.tar` -This will create a folder zingg-0.4.1-SNAPSHOT under the chosen folder. +This will create a folder _zingg-0.4.0_ under the chosen folder. Move the above folder to zingg. 
-> mv zingg-0.4.1-SNAPSHOT \~/zingg +> `mv zingg-0.4.0 ~/zingg` -> export ZINGG\_HOME=path to zingg +> `export ZINGG_HOME=path to zingg` -> export PATH=$PATH:$JAVA\_HOME/bin:$SPARK\_HOME/bin:$SPARK\_HOME/sbin:ZINGG\_HOME/scripts +> `export PATH=$PATH:$JAVA_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin:ZINGG_HOME/scripts` diff --git a/docs/stepbystep/installation/installing-from-release/single-machine-setup.md b/docs/stepbystep/installation/installing-from-release/single-machine-setup.md index 629dfbfb1..d0503db6b 100644 --- a/docs/stepbystep/installation/installing-from-release/single-machine-setup.md +++ b/docs/stepbystep/installation/installing-from-release/single-machine-setup.md @@ -8,18 +8,18 @@ Zingg can be easily run on a single machine to process up to a few million recor To prepare your machine, please do the following steps: -A) Install the specified JDK version +A) Install the specified **JDK** version B) Apache Spark - Download the specified version from spark.apache.org and unzip it in a folder under home -Please add the following entries to \~/.bash\_aliases +Please add the following entries to `~/.bash_aliases` -> export JAVA\_HOME=path to jdk +> `export JAVA_HOME=path to jdk` -> export SPARK\_HOME=path to location of Apache Spark +> `export SPARK_HOME=path to location of Apache Spark` -> export SPARK\_MASTER=local\[\*] +> `export SPARK_MASTER=local[*]` -C) Correct entry of host under /etc/hosts +C) Correct entry of host under `/etc/hosts` -Run ifconfig to find the IP of the machine and make sure it is added to the /etc/hosts for localhost. +Run `ifconfig` to find the IP of the machine and make sure it is added to the `/etc/hosts` for localhost. diff --git a/docs/stepbystep/installation/installing-from-release/spark-cluster-checklist.md b/docs/stepbystep/installation/installing-from-release/spark-cluster-checklist.md index 555a6f44c..af2e8b3d9 100644 --- a/docs/stepbystep/installation/installing-from-release/spark-cluster-checklist.md +++ b/docs/stepbystep/installation/installing-from-release/spark-cluster-checklist.md @@ -6,8 +6,8 @@ description: To configure Zingg in a Spark cluster If you have a ready Spark cluster, you can run Zingg by configuring the following environment on your driver machine: -> export JAVA\_HOME=path to jdk +> `export JAVA_HOME=path to jdk` -> export SPARK\_HOME=path to Apache Spark +> `export SPARK_HOME=path to Apache Spark` -> export SPARK\_MASTER=spark://master-host:master-port +> `export SPARK_MASTER=spark://master-host:master-port` diff --git a/docs/stepbystep/installation/installing-from-release/verification.md b/docs/stepbystep/installation/installing-from-release/verification.md index 8b94c9ac1..5b1adc941 100644 --- a/docs/stepbystep/installation/installing-from-release/verification.md +++ b/docs/stepbystep/installation/installing-from-release/verification.md @@ -22,6 +22,6 @@ Let us now run a sample program to ensure that our installation is correct. > `./scripts/zingg.sh --phase trainMatch --conf examples/febrl/config.json` -The above will build Zingg models and use that to find duplicates in the examples/febl/test.csv file. You will see Zingg logs on the console and once the job finishes, you will see some files under /tmp/zinggOutput with matching records sharing the same cluster id. +The above will build Zingg models and use that to find duplicates in the **examples/febl/test.csv** file. 
You will see Zingg logs on the console and once the job finishes, you will see some files under **/tmp/zinggOutput** with matching records sharing the same _cluster id_. Congratulations, Zingg has been installed! diff --git a/docs/stepbystep/zingg-command-line.md b/docs/stepbystep/zingg-command-line.md index 59dfc70d3..54bdd9c61 100644 --- a/docs/stepbystep/zingg-command-line.md +++ b/docs/stepbystep/zingg-command-line.md @@ -4,12 +4,12 @@ description: Using Zingg command line to interact with Zingg # Zingg Command Line -The Zingg command line interface is the way to interact with Zingg to fire labeling, training, and matching jobs and perform other Zingg functions. +The Zingg command line interface is the way to interact with Zingg to fire _labeling, training,_ and _matching_ jobs and perform other Zingg functions. #### To invoke the command line and pass a JSON configuration : `./scripts/zingg.sh --phase --conf ` -#### To invoke the command line and run python programs : +#### To invoke the command line and run Python programs : `./scripts/zingg.sh --run ` diff --git a/docs/stepbystep/zingg-runtime-properties.md b/docs/stepbystep/zingg-runtime-properties.md index 20b04a062..f686fb35d 100644 --- a/docs/stepbystep/zingg-runtime-properties.md +++ b/docs/stepbystep/zingg-runtime-properties.md @@ -4,8 +4,8 @@ description: Memory, external jars and other runtime properties # Zingg Runtime Properties -Zingg jobs can be passed JVM and other settings through a properties file. A sample file exists at [config/zingg.conf](https://github.com/zinggAI/zingg/blob/main/config/zingg.conf). The properties can be passed by invoking +Zingg jobs can be passed JVM and other settings through a _properties_ file. A sample file exists at [config/zingg.conf](https://github.com/zinggAI/zingg/blob/main/config/zingg.conf). The properties can be passed by invoking -`./scripts/zingg.sh --properties-file --conf conf.json` +`./scripts/zingg.sh --properties-file --conf conf.json` -To include jars for Snowflake/BigQuery/MySQL etc, please download them and add them to the spark.jars property. +To include jars for Snowflake/BigQuery/MySQL etc, please download them and add them to the `spark.jars` property. diff --git a/docs/updatingLabels.md b/docs/updatingLabels.md index af81222dd..0bb1a8695 100644 --- a/docs/updatingLabels.md +++ b/docs/updatingLabels.md @@ -1,12 +1,12 @@ # Updating Labeled Pairs -**Please note: This is an experimental feature. Please keep a backup copy of your model folder in a separate place before running this** +N**ote: This is an experimental feature. Please keep a backup copy of your model folder in a separate place before running this** As our understanding of our data changes, we may need to revisit the previously marked pairs and update them. To do this, please [generate the documentation of the model.](generatingdocumentation.md) You can then invoke the updater by invoking\ `./scripts/zingg.sh --phase updateLabel --conf ` -This brings up the console labeler which accepts the cluster id of the pairs you want to update. +This brings up the console labeler which accepts the **cluster id** of the pairs you want to update. 
-![Shows records and asks user to update yes, no, cant say on the cli.](../assets/update.gif) +![Shows records and asks user to update yes, no, can't say on the CLI.](../assets/update.gif) diff --git a/docs/working-with-python.md b/docs/working-with-python.md index 5c6f6e2fc..b19857fbe 100644 --- a/docs/working-with-python.md +++ b/docs/working-with-python.md @@ -4,17 +4,15 @@ description: A whole new way to work with Zingg! # Working With Python -Instead of configuring Zingg using the JSON, we can now use Python to build and run Zingg entity and identity resolution programs. This is handy when you want to run Zingg on an existing Spark cluster. To run on local machine, please do the installation of the release before running Zingg python programs. +Instead of configuring Zingg using JSON, we can now use Python to build and run Zingg entity and identity resolution programs. This is handy when you want to run Zingg on an existing Spark cluster. To run on a local machine, please install from the release before running Zingg Python programs. -The Zingg Python package can be installed by invoking +The Zingg Python package can be installed by invoking: `python -m pip install zingg` -Detailed documentation of the python api is available at [https://readthedocs.org/projects/zingg/](https://readthedocs.org/projects/zingg/) +Detailed documentation of the Python API is available at: [https://readthedocs.org/projects/zingg/](https://readthedocs.org/projects/zingg/) -Example programs for python exist under examples. Please check examples/febrl/FebrlExample.py to get started. +Example programs for Python exist under [examples](https://github.com/zinggAI/zingg/tree/main/examples/febrl). Please check [examples/febrl/FebrlExample.py](https://github.com/zinggAI/zingg/blob/main/examples/febrl/FebrlExample.py) to get started. -Please refer to the [command line guide](stepbystep/zingg-command-line.md) for running python programs. Please note that Zingg Python programs are PySpark programs and hence need the Zingg cli to execute. - -`` +Please refer to the [command line guide](stepbystep/zingg-command-line.md) for running Python programs. Please note that Zingg Python programs are PySpark programs and hence need the Zingg CLI to execute. diff --git a/docs/zModels.md b/docs/zModels.md index f377583d0..5f0d66764 100644 --- a/docs/zModels.md +++ b/docs/zModels.md @@ -10,11 +10,11 @@ Zingg learns two models from the data. ## 1. Blocking Model -One fundamental problem with scaling data mastering is that the number of comparisons increases quadratically as the number of input records increases. +One fundamental problem with scaling data mastering is that the number of comparisons increases **quadratically** as the number of input records increases. ![Data Mastering At Scale](../assets/fuzzymatchingcomparisons.jpg) -Zingg learns a clustering/blocking model which indexes near similar records. This means that Zingg does not compare every record with every other record. Typical Zingg comparisons are 0.05-1% of the possible problem space. +Zingg learns a clustering/blocking model which indexes near similar records. This means that Zingg does not compare every record with every other record. Typical Zingg comparisons are **0.05-1%** of the possible problem space. ## 2.
Similarity Model diff --git a/python/docs/conf.py b/python/docs/conf.py index 0b6880647..c55d69a8c 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -18,12 +18,13 @@ print(os.path.abspath('../zingg/')) # -- Project information ----------------------------------------------------- -project = 'Zingg' +project = 'Zingg Entity Resolution' copyright = '2024, Zingg.AI' author = 'Zingg.AI' # The full version, including alpha/beta/rc tags release = '0.4.1-SNAPSHOT' +version = '0.4.0' # -- General configuration --------------------------------------------------- @@ -39,6 +40,8 @@ 'sphinx.ext.coverage' ] + + # Add any paths that contain templates here, relative to this directory. #templates_path = ['_templates'] @@ -53,7 +56,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'classic' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, diff --git a/python/docs/index.rst b/python/docs/index.rst index 26777a748..0a34058ae 100644 --- a/python/docs/index.rst +++ b/python/docs/index.rst @@ -3,50 +3,76 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to the documentation of Zingg Entity Resolution With Python package! -================================= +Zingg Entity Resolution Python Package +============================================================================ -Contents: + +Zingg Python APIs for entity resolution, identity resolution, record linkage, data mastering and deduplication using ML +(https://www.zingg.ai) + + +.. note:: + Requires python 3.6+; spark 3.5.0 + Otherwise, :py:func:`zingg.client.Zingg` cannot be executed + .. toctree:: :maxdepth: 3 - + zingg - -Zingg Python APIs for entity resolution, record linkage, data mastering and deduplication using ML -(https://www.zingg.ai) +API Reference +================== +* :ref:`modindex` +* :ref:`genindex` +* :ref:`search` -requires python 3.6+; spark 3.5.0 -Otherwise, :py:func:`zingg.client.Zingg` cannot be executed +Example API Usage +================= -.. toctree:: - :maxdepth: 2 - +.. code:: python + :number-lines: -.. automodule:: zingg - :members: - :undoc-members: - :show-inheritance: + from zingg.client import * + from zingg.pipes import * -.. automodule:: zingg.client - :members: - :undoc-members: - :show-inheritance: + #build the arguments for zingg + args = Arguments() + #set field definitions + fname = FieldDefinition("fname", "string", MatchType.FUZZY) + lname = FieldDefinition("lname", "string", MatchType.FUZZY) + stNo = FieldDefinition("stNo", "string", MatchType.FUZZY) + add1 = FieldDefinition("add1","string", MatchType.FUZZY) + add2 = FieldDefinition("add2", "string", MatchType.FUZZY) + city = FieldDefinition("city", "string", MatchType.FUZZY) + areacode = FieldDefinition("areacode", "string", MatchType.FUZZY) + state = FieldDefinition("state", "string", MatchType.FUZZY) + dob = FieldDefinition("dob", "string", MatchType.FUZZY) + ssn = FieldDefinition("ssn", "string", MatchType.FUZZY) -.. 
automodule:: zingg.pipes - :members: - :undoc-members: - :show-inheritance: + fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn] -Indices and tables -================== + args.setFieldDefinition(fieldDefs) + #set the modelid and the zingg dir + args.setModelId("100") + args.setZinggDir("models") + args.setNumPartitions(4) + args.setLabelDataSampleSize(0.5) -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` + #reading dataset into inputPipe and setting it up in 'args' + #below line should not be required if you are reading from in memory dataset + #in that case, replace df with input df + schema = "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" + inputPipe = CsvPipe("testFebrl", "examples/febrl/test.csv", schema) + args.setData(inputPipe) + outputPipe = CsvPipe("resultFebrl", "/tmp/febrlOutput") -.. note:: + args.setOutput(outputPipe) + + options = ClientOptions([ClientOptions.PHASE,"match"]) + + #Zingg execution for the given phase + zingg = Zingg(args, options) + zingg.initAndExecute() - This project is used by Zingg.AI diff --git a/python/docs/zingg.rst b/python/docs/zingg.rst index 2e5369c2b..6955d428b 100644 --- a/python/docs/zingg.rst +++ b/python/docs/zingg.rst @@ -1,9 +1,5 @@ Zingg Entity Resolution Package -============= - - -Contents ---------------- +================================ Zingg Python APIs for entity resolution, record linkage, data mastering and deduplication using ML (https://www.zingg.ai) @@ -12,7 +8,7 @@ requires python 3.6+; spark 3.5.0 Otherwise, :py:func:`zingg.client.Zingg` cannot be executed .. toctree:: - :maxdepth: 2 + :maxdepth: 3 .. automodule:: zingg @@ -28,7 +24,4 @@ Otherwise, :py:func:`zingg.client.Zingg` cannot be executed .. automodule:: zingg.pipes :members: :undoc-members: - :show-inheritance: - - - + :show-inheritance: \ No newline at end of file diff --git a/python/zingg/client.py b/python/zingg/client.py index 7061002fd..0ede096ce 100644 --- a/python/zingg/client.py +++ b/python/zingg/client.py @@ -20,8 +20,9 @@ _spark_ctxt = None _sqlContext = None _spark = None _zingg_jar = 'zingg-0.4.1-SNAPSHOT.jar' + def initSparkClient(): global _spark_ctxt global _sqlContext @@ -468,7 +469,6 @@ class ZinggWithSpark(Zingg): :type options: ClientOptions """ - def __init__(self, args, options): self.client = getJVM().zingg.spark.client.SparkClient(args.getArgs(), options.getClientOptions(), getSparkSession()._jsparkSession)
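For context on the `ZinggWithSpark` class touched in the last hunk: below is a minimal sketch of how it might be driven from Python. It reuses the Febrl example data and the `Arguments`/`ClientOptions` calls shown in the `python/docs/index.rst` example added by this patch, and assumes `ZinggWithSpark` is importable from `zingg.client` (it is defined in `python/zingg/client.py`). Because Zingg Python programs are PySpark programs, the sketch also presumes an already-active SparkSession, for example when the script is submitted through the Zingg CLI (`./scripts/zingg.sh --run`).

```python
from zingg.client import *
from zingg.pipes import *

# field definitions for the Febrl sample, all matched fuzzily
fieldDefs = [FieldDefinition(name, "string", MatchType.FUZZY)
             for name in ["fname", "lname", "stNo", "add1", "add2",
                          "city", "areacode", "state", "dob", "ssn"]]

args = Arguments()
args.setFieldDefinition(fieldDefs)
args.setModelId("100")
args.setZinggDir("models")
args.setNumPartitions(4)
args.setLabelDataSampleSize(0.5)

# CSV input and output, as in the Febrl example
schema = ("id string, fname string, lname string, stNo string, add1 string, "
          "add2 string, city string, areacode string, state string, "
          "dob string, ssn string")
args.setData(CsvPipe("testFebrl", "examples/febrl/test.csv", schema))
args.setOutput(CsvPipe("resultFebrl", "/tmp/febrlOutput"))

options = ClientOptions([ClientOptions.PHASE, "match"])

# ZinggWithSpark hands the already-running SparkSession to the Java client
# (see the SparkClient constructor call in the hunk above)
zingg = ZinggWithSpark(args, options)
zingg.initAndExecute()
```

Relative to the `index.rst` example, swapping `Zingg` for `ZinggWithSpark` is the only change; the argument setup stays the same.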