From 6ec3719472272b3382f933c33695f02a114a2957 Mon Sep 17 00:00:00 2001
From: Athul <89829560+athul-rs@users.noreply.github.com>
Date: Tue, 26 Mar 2024 15:16:17 +0530
Subject: [PATCH] fix/Removing unused tools (#186)
* fix/Removing unused tools
* fix/tools options removal
---
.../workflows/docker-tools-build-push.yaml | 16 --
tools/doc_pii_redactor/.dockerignore | 3 -
tools/doc_pii_redactor/Dockerfile | 20 --
tools/doc_pii_redactor/README.md | 140 ------------
tools/doc_pii_redactor/__init__.py | 0
tools/doc_pii_redactor/requirements.txt | 8 -
tools/doc_pii_redactor/sample.env | 13 --
tools/doc_pii_redactor/src/config/icon.svg | 53 -----
.../src/config/properties.json | 45 ----
.../src/config/runtime_variables.json | 40 ----
tools/doc_pii_redactor/src/config/spec.json | 72 -------
.../src/doc_pii_redactor/__init__.py | 0
.../src/doc_pii_redactor/constants.py | 7 -
.../src/doc_pii_redactor/enums.py | 6 -
.../src/doc_pii_redactor/helper.py | 157 --------------
tools/doc_pii_redactor/src/main.py | 200 ------------------
tools/indexer/.dockerignore | 3 -
tools/indexer/Dockerfile | 25 ---
tools/indexer/README.md | 127 -----------
tools/indexer/__init__.py | 0
tools/indexer/requirements.txt | 4 -
tools/indexer/sample.env | 7 -
tools/indexer/src/config/icon.svg | 49 -----
tools/indexer/src/config/properties.json | 79 -------
.../indexer/src/config/runtime_variables.json | 7 -
tools/indexer/src/config/spec.json | 26 ---
tools/indexer/src/constants.py | 5 -
tools/indexer/src/main.py | 70 ------
tools/ocr/.dockerignore | 3 -
tools/ocr/Dockerfile | 21 --
tools/ocr/README.md | 112 ----------
tools/ocr/requirements.txt | 9 -
tools/ocr/sample.env | 4 -
tools/ocr/src/config/properties.json | 59 ------
tools/ocr/src/config/spec.json | 14 --
tools/ocr/src/constants.py | 18 --
tools/ocr/src/enums.py | 7 -
tools/ocr/src/helper.py | 129 -----------
tools/ocr/src/main.py | 105 ---------
tools/translate/.dockerignore | 3 -
tools/translate/Dockerfile | 19 --
tools/translate/README.md | 132 ------------
tools/translate/__init__.py | 0
tools/translate/requirements.txt | 6 -
tools/translate/sample.env | 4 -
tools/translate/src/config/icon.svg | 21 --
tools/translate/src/config/properties.json | 41 ----
.../src/config/runtime_variables.json | 15 --
tools/translate/src/config/spec.json | 82 -------
tools/translate/src/constants.py | 72 -------
tools/translate/src/main.py | 160 --------------
51 files changed, 2218 deletions(-)
delete mode 100644 tools/doc_pii_redactor/.dockerignore
delete mode 100644 tools/doc_pii_redactor/Dockerfile
delete mode 100644 tools/doc_pii_redactor/README.md
delete mode 100644 tools/doc_pii_redactor/__init__.py
delete mode 100644 tools/doc_pii_redactor/requirements.txt
delete mode 100644 tools/doc_pii_redactor/sample.env
delete mode 100644 tools/doc_pii_redactor/src/config/icon.svg
delete mode 100644 tools/doc_pii_redactor/src/config/properties.json
delete mode 100644 tools/doc_pii_redactor/src/config/runtime_variables.json
delete mode 100644 tools/doc_pii_redactor/src/config/spec.json
delete mode 100644 tools/doc_pii_redactor/src/doc_pii_redactor/__init__.py
delete mode 100644 tools/doc_pii_redactor/src/doc_pii_redactor/constants.py
delete mode 100644 tools/doc_pii_redactor/src/doc_pii_redactor/enums.py
delete mode 100644 tools/doc_pii_redactor/src/doc_pii_redactor/helper.py
delete mode 100644 tools/doc_pii_redactor/src/main.py
delete mode 100644 tools/indexer/.dockerignore
delete mode 100644 tools/indexer/Dockerfile
delete mode 100644 tools/indexer/README.md
delete mode 100644 tools/indexer/__init__.py
delete mode 100644 tools/indexer/requirements.txt
delete mode 100644 tools/indexer/sample.env
delete mode 100644 tools/indexer/src/config/icon.svg
delete mode 100644 tools/indexer/src/config/properties.json
delete mode 100644 tools/indexer/src/config/runtime_variables.json
delete mode 100644 tools/indexer/src/config/spec.json
delete mode 100644 tools/indexer/src/constants.py
delete mode 100644 tools/indexer/src/main.py
delete mode 100644 tools/ocr/.dockerignore
delete mode 100644 tools/ocr/Dockerfile
delete mode 100644 tools/ocr/README.md
delete mode 100644 tools/ocr/requirements.txt
delete mode 100644 tools/ocr/sample.env
delete mode 100644 tools/ocr/src/config/properties.json
delete mode 100644 tools/ocr/src/config/spec.json
delete mode 100644 tools/ocr/src/constants.py
delete mode 100644 tools/ocr/src/enums.py
delete mode 100644 tools/ocr/src/helper.py
delete mode 100644 tools/ocr/src/main.py
delete mode 100644 tools/translate/.dockerignore
delete mode 100644 tools/translate/Dockerfile
delete mode 100644 tools/translate/README.md
delete mode 100644 tools/translate/__init__.py
delete mode 100644 tools/translate/requirements.txt
delete mode 100644 tools/translate/sample.env
delete mode 100644 tools/translate/src/config/icon.svg
delete mode 100644 tools/translate/src/config/properties.json
delete mode 100644 tools/translate/src/config/runtime_variables.json
delete mode 100644 tools/translate/src/config/spec.json
delete mode 100644 tools/translate/src/constants.py
delete mode 100644 tools/translate/src/main.py
diff --git a/.github/workflows/docker-tools-build-push.yaml b/.github/workflows/docker-tools-build-push.yaml
index bc28f4c34..38592887b 100644
--- a/.github/workflows/docker-tools-build-push.yaml
+++ b/.github/workflows/docker-tools-build-push.yaml
@@ -14,10 +14,6 @@ on:
type: choice
options: # Define available options
- tool-classifier
- - tool-doc-pii-redactor
- - tool-indexer
- - tool-ocr
- - tool-translate
- tool-structure
- tool-text-extractor
@@ -42,18 +38,6 @@ jobs:
- name: Build tool-classifier
if: github.event.inputs.service_name=='tool-classifier'
run: docker build -t unstract/${{github.event.inputs.service_name}}:${{ github.event.inputs.tag }} ./tools/classifier
- - name: Build tool-doc-pii-redactor
- if: github.event.inputs.service_name=='tool-doc-pii-redactor'
- run: docker build -t unstract/${{github.event.inputs.service_name}}:${{ github.event.inputs.tag }} ./tools/doc_pii_redactor
- - name: Build tool-indexer
- if: github.event.inputs.service_name=='tool-indexer'
- run: docker build -t unstract/${{github.event.inputs.service_name}}:${{ github.event.inputs.tag }} ./tools/indexer
- - name: Build tool-ocr
- if: github.event.inputs.service_name=='tool-ocr'
- run: docker build -t unstract/${{github.event.inputs.service_name}}:${{ github.event.inputs.tag }} ./tools/ocr
- - name: Build tool-translate
- if: github.event.inputs.service_name=='tool-translate'
- run: docker build -t unstract/${{github.event.inputs.service_name}}:${{ github.event.inputs.tag }} ./tools/translate
- name: Build tool-structure
if: github.event.inputs.service_name=='tool-structure'
run: docker build -t unstract/${{github.event.inputs.service_name}}:${{ github.event.inputs.tag }} ./tools/structure
diff --git a/tools/doc_pii_redactor/.dockerignore b/tools/doc_pii_redactor/.dockerignore
deleted file mode 100644
index c26352afc..000000000
--- a/tools/doc_pii_redactor/.dockerignore
+++ /dev/null
@@ -1,3 +0,0 @@
-venv/
-.venv/
-.env
diff --git a/tools/doc_pii_redactor/Dockerfile b/tools/doc_pii_redactor/Dockerfile
deleted file mode 100644
index d6c0786a2..000000000
--- a/tools/doc_pii_redactor/Dockerfile
+++ /dev/null
@@ -1,20 +0,0 @@
-FROM python:3.9-slim
-
-LABEL maintainer="Zipstack Inc."
-ENV UNSTRACT_ENTRYPOINT "python /app/src/main.py"
-
-# Install dependencies for unstructured library's partition
-RUN apt-get update && apt-get --no-install-recommends -y install libmagic-dev poppler-utils tesseract-ocr libreoffice pandoc \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
-RUN pip install --no-cache-dir -U pip
-# Set the working directory in the container
-WORKDIR /app
-COPY requirements.txt /app/
-RUN pip install --no-cache-dir -r requirements.txt
-# Copy the contents of your project directory into the container at /app
-COPY src /app/src/
-WORKDIR /app/src
-
-
-ENTRYPOINT ["python", "main.py"]
diff --git a/tools/doc_pii_redactor/README.md b/tools/doc_pii_redactor/README.md
deleted file mode 100644
index 84079367e..000000000
--- a/tools/doc_pii_redactor/README.md
+++ /dev/null
@@ -1,140 +0,0 @@
-# Document PII Redactor
-
-This tool redacts PII (Personally Identifiable Information) from a given set of files.
-It makes use of the `document_service` (found in the repository) to upload, edit and download documents.
-
-## Supported processors
-
-The following processors are supported for PII detection
-
-### Amazon Comprehend
-
-Follow the steps from the [Amazon Comprehend docs](https://aws.amazon.com/comprehend/) to set it up.
-
-## Required environment variables
-
-| Variable | Description |
-| -------------------------- | --------------------------------------------------------------------- |
-| `PLATFORM_SERVICE_HOST`    | The host on which the platform service is running                       |
-| `PLATFORM_SERVICE_PORT`    | The port on which the service is listening                               |
-| `PLATFORM_SERVICE_API_KEY` | The API key for the platform                                             |
-| `TOOL_DATA_DIR`            | The directory in the filesystem which holds contents for tool execution  |
-
-Set the following envs based on the processor you wish to use
-
-| Processor | Variable | Description |
-| --- | --- | --- |
-| Amazon Comprehend | `PII_REDACT_AWS_REGION` | The AWS region for the Amazon Comprehend service |
-| Amazon Comprehend | `PII_REDACT_AWS_ACCESS_KEY_ID` | The access key ID for the Amazon Comprehend service |
-| Amazon Comprehend | `PII_REDACT_AWS_SECRET_ACCESS_KEY` | The secret access key for the Amazon Comprehend service |
-
-## Testing the tool locally
-
-### Setting up a dev environment
-
-Set up a virtual environment and activate it
-
-```commandline
-python -m venv .venv
-source .venv/bin/activate
-```
-
-Install the dependencies for the tool
-
-```commandline
-pip install -r requirements.txt
-```
-
-To use the local development version of the [unstract-sdk](https://pypi.org/project/unstract-sdk/), install it from the local repository.
-Replace the path below with the path to your local repository
-
-```commandline
-pip install -e ~/path_to_repo/sdks/.
-```
-
-### Tool execution preparation
-
-Load the environment variables for the tool.
-Make a copy of the `sample.env` file and name it `.env`. Fill in the required values.
-They get loaded with [python-dotenv](https://pypi.org/project/python-dotenv/) through the SDK.
-
-Update the tool's `data_dir` marked by the `TOOL_DATA_DIR` env. This has to be done before each tool execution since
-the tool updates the `INFILE` and `METADATA.json`.
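-
-A minimal sketch of this preparation, assuming `TOOL_DATA_DIR` points to `../data_dir` (adjust the paths to your setup):
-
-```commandline
-cp sample.env .env
-# Restore the inputs before each run since the tool overwrites them
-cp /path/to/input.pdf ../data_dir/INFILE
-cp /path/to/metadata_backup.json ../data_dir/METADATA.json
-```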
-
-### Run SPEC command
-
-Represents the JSON schema for the runtime-configurable `settings` of a tool
-
-```commandline
-python main.py --command SPEC
-```
-
-### Run PROPERTIES command
-
-Describes some metadata for the tool such as its `version`, `description`, `inputs` and `outputs`
-
-```commandline
-python main.py --command PROPERTIES
-```
-
-### Run ICON command
-
-Returns the SVG icon for the tool, used by Unstract's frontend
-
-```commandline
-python main.py --command ICON
-```
-
-### Run VARIABLES command
-
-Represents the runtime variables or envs that will be used by the tool
-
-```commandline
-python main.py --command VARIABLES
-```
-
-### Run RUN command
-
-The schema of the JSON required for settings can be found by running the [SPEC](#run-spec-command) command. Alternatively, if you have access to the code base, it is located in the `config` folder as `spec.json`.
-
-```commandline
-python main.py \
- --command RUN \
- --settings '{
- "processor": "Amazon Comprehend",
- "redactionElements": [],
- "useCache": true,
- "scoreThreshold": 0.8
- }' \
- --log-level DEBUG
-
-```
-
-## Testing the tool from its docker image
-
-Build the tool docker image from the folder containing the `Dockerfile` with
-
-```commandline
-docker build -t unstract/tool-doc-pii-redactor:0.0.1 .
-```
-
-Make sure the directory pointed to by `TOOL_DATA_DIR` has the required information for the tool to run and that
-necessary services like the `platform-service` are up.
-To test the tool from its docker image, run the following command
-
-```commandline
-docker run -it \
- --network unstract-network \
- --env-file .env \
- -v "$(pwd)"/data_dir:/app/data_dir \
- unstract/tool-doc-pii-redactor:0.0.1 \
- --command RUN \
- --settings '{
- "processor": "Amazon Comprehend",
- "redactionElements": [],
- "useCache": true,
- "scoreThreshold": 0.8
- }' \
- --log-level DEBUG
-
-```
diff --git a/tools/doc_pii_redactor/__init__.py b/tools/doc_pii_redactor/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tools/doc_pii_redactor/requirements.txt b/tools/doc_pii_redactor/requirements.txt
deleted file mode 100644
index 4c4763fe1..000000000
--- a/tools/doc_pii_redactor/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-# Add your dependencies here
-boto3==1.26.161
-nanoid==2.0.0
-requests==2.31.0
-unstructured[all-docs]==0.10.10
-
-# Required for all unstract tools
-unstract-sdk~=0.10.0
diff --git a/tools/doc_pii_redactor/sample.env b/tools/doc_pii_redactor/sample.env
deleted file mode 100644
index 5ff61d3bc..000000000
--- a/tools/doc_pii_redactor/sample.env
+++ /dev/null
@@ -1,13 +0,0 @@
-# Keys for Platform service
-PLATFORM_SERVICE_HOST=
-PLATFORM_SERVICE_PORT=
-PLATFORM_SERVICE_API_KEY=
-
-# Keys for Amazon Comprehend (to redact PII)
-PII_REDACT_AWS_REGION=
-PII_REDACT_AWS_ACCESS_KEY_ID=
-PII_REDACT_AWS_SECRET_ACCESS_KEY=
-
-# Keys for Document service
-DOCUMENT_PROCESSOR_URL=
-DOCUMENT_PROCESSOR_API_KEY=
diff --git a/tools/doc_pii_redactor/src/config/icon.svg b/tools/doc_pii_redactor/src/config/icon.svg
deleted file mode 100644
index 6c5bac223..000000000
--- a/tools/doc_pii_redactor/src/config/icon.svg
+++ /dev/null
@@ -1,53 +0,0 @@
-
-
diff --git a/tools/doc_pii_redactor/src/config/properties.json b/tools/doc_pii_redactor/src/config/properties.json
deleted file mode 100644
index 62043879f..000000000
--- a/tools/doc_pii_redactor/src/config/properties.json
+++ /dev/null
@@ -1,45 +0,0 @@
-{
- "schemaVersion": "0.0.1",
- "displayName": "PII Redactor",
- "functionName": "pii_redactor",
- "toolVersion": "0.0.1",
- "description": "This is a tool which can redact the PII (Personally Identifiable Information) data or information from a file.",
- "input": {
- "description": "File whose PII needs to be redacted"
- },
- "output": {
- "description": "Creates a PDF file with PII redacted from it"
- },
- "result": {
- "type": "TXT",
- "description": "Response containing the text with redacted PII"
- },
- "ioCompatibility": {
- "api": {
- "sourceSupport": true,
- "destinationSupport": true,
- "additionalArgs": {
- "sync": false
- }
- },
- "file": {
- "sourceSupport": true,
- "destinationSupport": true,
- "additionalArgs": {}
- },
- "db": {
- "destinationSupport": false,
- "additionalArgs": {}
- }
- },
- "restrictions": {
- "maxFileSize": "10MB",
- "allowedFileTypes": [
- "txt",
- "pdf",
- "doc",
- "docx",
- "odt"
- ]
- }
-}
diff --git a/tools/doc_pii_redactor/src/config/runtime_variables.json b/tools/doc_pii_redactor/src/config/runtime_variables.json
deleted file mode 100644
index a7cf1b562..000000000
--- a/tools/doc_pii_redactor/src/config/runtime_variables.json
+++ /dev/null
@@ -1,40 +0,0 @@
-{
- "title": "Runtime Variables",
- "description": "Runtime Variables for doc pii redactor",
- "type": "object",
- "required": [
- "PII_REDACT_AWS_REGION",
- "PII_REDACT_AWS_ACCESS_KEY_ID",
- "PII_REDACT_AWS_SECRET_ACCESS_KEY",
- "DOCUMENT_PROCESSOR_URL",
- "DOCUMENT_PROCESSOR_API_KEY"
- ],
- "properties": {
- "PII_REDACT_AWS_REGION": {
- "type": "string",
- "title": "AWS Comprehend's Region",
- "description": "AWS Comprehend's region used in Document PII Redactor"
- },
- "PII_REDACT_AWS_ACCESS_KEY_ID": {
- "type": "string",
- "title": "AWS Comprehend's Access Key ID",
- "description": "AWS Comprehend's access key ID used in Document PII Redactor"
- },
- "PII_REDACT_AWS_SECRET_ACCESS_KEY": {
- "type": "string",
- "title": "AWS Comprehend's Secret Access Key",
- "format": "password",
- "description": "AWS Comprehend's secret access key used in Document PII Redactor"
- },
- "DOCUMENT_PROCESSOR_URL": {
- "type": "string",
- "title": "Document Processor URL",
- "description": "Document processor service URL"
- },
- "DOCUMENT_PROCESSOR_API_KEY": {
- "type": "string",
- "title": "Document Processor API key",
- "description": "Document processor service API key"
- }
- }
-}
diff --git a/tools/doc_pii_redactor/src/config/spec.json b/tools/doc_pii_redactor/src/config/spec.json
deleted file mode 100644
index 54490fe4d..000000000
--- a/tools/doc_pii_redactor/src/config/spec.json
+++ /dev/null
@@ -1,72 +0,0 @@
-{
- "title": "PII Redactor tool settings",
- "description": "Setup the document PII redactor tool",
- "type": "object",
- "required": [
- "processor"
- ],
- "properties": {
- "processor": {
- "type": "string",
- "title": "Processor to use",
- "default": "Amazon Comprehend",
- "enum": [
- "Amazon Comprehend"
- ],
- "description": "Service used to detect PII entities"
- },
- "redactionElements": {
- "type": "array",
- "title": "PII elements to redact",
- "description": "PII elements to redact. Leave it empty to process all elements",
- "items": {
- "type": "string",
- "enum": [
- "Address",
- "Age",
- "Bank Account Number",
- "Bank Routing Number",
- "Canada Health Number",
- "Canada Social Insurance Number",
- "Credit Card Number",
- "Credit Card CVV",
- "Credit Card Expiration Date",
- "Date & Time",
- "Email",
- "Indian Aadhaar Number",
- "Indian PAN Number",
- "Indian NREGA Number",
- "Indian Voter ID Number",
- "IP Address",
- "MAC Address",
- "Name",
- "Passport Number",
- "Password",
- "Phone Number",
- "PIN",
- "SSN",
- "URL",
- "UK National Insurance Number",
- "UK NHS Number",
- "UK Tax ID Number",
- "US Individual Taxpayer Identification Number",
- "Username",
- "VIN"
- ]
- },
- "default": []
- },
- "useCache": {
- "type": "boolean",
- "title": "Cache and use cached results",
- "default": true,
- "description": "Use cached results"
- },
- "scoreThreshold": {
- "type": "number",
- "title": "Score Threshold",
- "default": 0.8,
- "description": "Minimum confidence needed to treat an entity as PII (from 0.0 to 1.0)"
- }
- }
-}
diff --git a/tools/doc_pii_redactor/src/doc_pii_redactor/__init__.py b/tools/doc_pii_redactor/src/doc_pii_redactor/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tools/doc_pii_redactor/src/doc_pii_redactor/constants.py b/tools/doc_pii_redactor/src/doc_pii_redactor/constants.py
deleted file mode 100644
index 7ee274ba1..000000000
--- a/tools/doc_pii_redactor/src/doc_pii_redactor/constants.py
+++ /dev/null
@@ -1,7 +0,0 @@
-class EnvKey:
- DOC_PROCESSOR_URL = "DOCUMENT_PROCESSOR_URL"
- DOC_PROCESSOR_API_KEY = "DOCUMENT_PROCESSOR_API_KEY"
-
-
-class DocProcessorConstants:
- REQUEST_TIMEOUT = 600
diff --git a/tools/doc_pii_redactor/src/doc_pii_redactor/enums.py b/tools/doc_pii_redactor/src/doc_pii_redactor/enums.py
deleted file mode 100644
index f1c69f84a..000000000
--- a/tools/doc_pii_redactor/src/doc_pii_redactor/enums.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from enum import Enum
-
-
-class Processor(Enum):
- AMAZON_COMPREHEND = "Amazon Comprehend"
- UNSTRACT_PII = "Unstract PII Redactor"
diff --git a/tools/doc_pii_redactor/src/doc_pii_redactor/helper.py b/tools/doc_pii_redactor/src/doc_pii_redactor/helper.py
deleted file mode 100644
index 53da033df..000000000
--- a/tools/doc_pii_redactor/src/doc_pii_redactor/helper.py
+++ /dev/null
@@ -1,157 +0,0 @@
-import json
-import re
-from typing import Any
-
-import boto3
-from unstract.sdk.constants import LogLevel
-from unstract.sdk.tool.base import BaseTool
-from unstract.sdk.utils import ToolUtils
-
-from .enums import Processor
-
-
-class PIIRedactHelper:
- AMAZON_COMPREHEND_TYPE_MAPPING = {
- "Address": "ADDRESS",
- "Age": "AGE",
- "Bank Account Number": "BANK_ACCOUNT_NUMBER",
- "Bank Routing Number": "BANK_ROUTING",
- "Canada Health Number": "CA_HEALTH_NUMBER",
- "Canada Social Insurance Number": "CA_SOCIAL_INSURANCE_NUMBER",
- "Credit Card Number": "CREDIT_DEBIT_NUMBER",
- "Credit Card CVV": "CREDIT_DEBIT_CVV",
- "Credit Card Expiration Date": "CREDIT_DEBIT_EXPIRY",
- "Date & Time": "DATE_TIME",
- "Email": "EMAIL",
- "Indian Aadhaar Number": "IN_AADHAAR",
- "Indian PAN Number": "IN_PERMANENT_ACCOUNT_NUMBER",
- "Indian NREGA Number": "IN_NREGA",
- "Indian Voter ID Number": "IN_VOTER_NUMBER",
- "IP Address": "IP_ADDRESS",
- "MAC Address": "MAC_ADDRESS",
- "Name": "NAME",
- "Passport Number": "PASSPORT_NUMBER",
- "Password": "PASSWORD",
- "Phone Number": "PHONE",
- "PIN": "PIN",
- "SSN": "SSN",
- "URL": "URL",
- "UK National Insurance Number": "UK_NATIONAL_INSURANCE_NUMBER",
- "UK NHS Number": "UK_NATIONAL_HEALTH_SERVICE_NUMBER",
- "UK Tax ID Number": "UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER",
- "US Individual Taxpayer Identification Number": "US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER", # noqa
- "Username": "USERNAME",
- "VIN": "VEHICLE_IDENTIFICATION_NUMBER",
- }
-
- def __init__(self, tool: BaseTool) -> None:
- self.tool = tool
-
- def get_cache_key(
- self, workflow_id: str, settings: dict[str, Any], input_text: str
- ) -> str:
- """Returns a unique cache key for an input.
-
- Args:
- workflow_id (str): UUID for a project
- settings (dict): Tool settings
- input_text (str): Text from the file read
-
- Returns:
- str: Unique key to set or get from cache
- """
- input_text_hash = ToolUtils.hash_str(input_text)
- settings_hash = ToolUtils.hash_str(json.dumps(settings))
- return f"cache:{workflow_id}:{settings_hash}:{input_text_hash}"
-
- @staticmethod
- def create_redaction_overlay(text: str) -> str:
- return "x" * len(text)
-
- def detect_pii_entities(
- self,
- text: str,
- processor: str,
- redact_items: list[str],
- score_threshold: float,
- ) -> list[str]:
- """Detects PII entites to be redacted.
-
- Types of PII entities: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/comprehend/client/detect_pii_entities.html # noqa
- 'BANK_ACCOUNT_NUMBER | 'BANK_ROUTING' | 'CREDIT_DEBIT_NUMBER' |
- 'CREDIT_DEBIT_CVV' | 'CREDIT_DEBIT_EXPIRY' | 'PIN' | 'EMAIL' | 'ADDRESS' | 'NAME' | 'PHONE' | 'SSN' | # noqa
- 'DATE_TIME' | 'PASSPORT_NUMBER' | 'DRIVER_ID' | 'URL' | 'AGE' | 'USERNAME' | 'PASSWORD' | 'AWS_ACCESS_KEY' | # noqa
- 'AWS_SECRET_KEY' | 'IP_ADDRESS' | 'MAC_ADDRESS' | 'ALL' | 'LICENSE_PLATE' | 'VEHICLE_IDENTIFICATION_NUMBER' | # noqa
- 'UK_NATIONAL_INSURANCE_NUMBER' | 'CA_SOCIAL_INSURANCE_NUMBER' | 'US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER' | # noqa
- 'UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER' | 'IN_PERMANENT_ACCOUNT_NUMBER' | 'IN_NREGA' | # noqa
- 'INTERNATIONAL_BANK_ACCOUNT_NUMBER' | 'SWIFT_CODE' | 'UK_NATIONAL_HEALTH_SERVICE_NUMBER' | 'CA_HEALTH_NUMBER' | # noqa
- 'IN_AADHAAR' | 'IN_VOTER_NUMBER'
-
- Args:
- text (str): Input text to check
- processor (str): The processor that has to be used
- redact_items (list[str]): Elements to be redacted
-            score_threshold (float): The minimum confidence needed to treat an entity as PII
-
- Returns:
- list[str]: List of entities to be redacted
- """
- if processor == Processor.AMAZON_COMPREHEND.value:
- aws_region_name = self.tool.get_env_or_die("PII_REDACT_AWS_REGION")
- access_key = self.tool.get_env_or_die(
- "PII_REDACT_AWS_ACCESS_KEY_ID"
- )
- secret_key = self.tool.get_env_or_die(
- "PII_REDACT_AWS_SECRET_ACCESS_KEY"
- )
-
- try:
- comprehend = boto3.client(
- "comprehend",
- aws_access_key_id=access_key,
- aws_secret_access_key=secret_key,
- region_name=aws_region_name,
- )
- response = comprehend.detect_pii_entities(
- Text=text, LanguageCode="en"
- )
- except Exception as e:
- self.tool.stream_log(
- f"Error detecting PII elements: {e}", LogLevel.ERROR
- )
- exit(1)
- else:
- mapped_redact_items = []
- for item in redact_items:
- if item in self.AMAZON_COMPREHEND_TYPE_MAPPING:
- mapped_redact_items.append(
- self.AMAZON_COMPREHEND_TYPE_MAPPING[item]
- )
- else:
- mapped_redact_items.append(item)
- self.tool.stream_log(f"Redact items: {mapped_redact_items}")
-
- pii_entities = response["Entities"]
- pii_texts = []
- for entity in pii_entities:
- if entity["Score"] < score_threshold:
- continue
- if len(mapped_redact_items) > 0:
- if entity["Type"] not in mapped_redact_items:
- continue
- start = entity["BeginOffset"]
- end = entity["EndOffset"]
- redact_text = text[start:end]
- redact_text = redact_text.replace("\n", "")
- redact_text = redact_text.strip()
- if redact_text not in pii_texts:
- pii_texts.append(redact_text)
-
-            # Replace runs of multiple spaces in the strings with a single space
- items_to_redact = [
- re.sub(" +", " ", item) for item in pii_texts
- ]
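-            # Sort longest-first so that replacing a shorter entity cannot break a longer match containing it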
- items_to_redact = sorted(items_to_redact, key=len, reverse=True)
- return items_to_redact
- else:
- return []
diff --git a/tools/doc_pii_redactor/src/main.py b/tools/doc_pii_redactor/src/main.py
deleted file mode 100644
index 020bd117e..000000000
--- a/tools/doc_pii_redactor/src/main.py
+++ /dev/null
@@ -1,200 +0,0 @@
-import io
-import json
-import sys
-from pathlib import Path
-from typing import Any
-
-import nanoid
-import requests
-from doc_pii_redactor.constants import DocProcessorConstants, EnvKey
-from doc_pii_redactor.enums import Processor
-from doc_pii_redactor.helper import PIIRedactHelper
-from unstract.sdk.cache import ToolCache
-from unstract.sdk.constants import LogState, MetadataKey, ToolEnv
-from unstract.sdk.platform import PlatformHelper
-from unstract.sdk.tool.base import BaseTool
-from unstract.sdk.tool.entrypoint import ToolEntrypoint
-from unstructured.partition.auto import partition
-
-
-class UnstractDocPIIRedactor(BaseTool):
- def validate(self, input_file: str, settings: dict[str, Any]) -> None:
- processor = settings["processor"]
- allowed_processors = [Processor.AMAZON_COMPREHEND.value]
- if processor not in allowed_processors:
- self.stream_error_and_exit(
- f"Invalid processor. Only {allowed_processors} is allowed"
- )
-
- def run(
- self,
- settings: dict[str, Any],
- input_file: str,
- output_dir: str,
- ) -> None:
- processor = settings["processor"]
- redact_items = settings["redactionElements"]
- use_cache = settings["useCache"]
- score_threshold = settings["scoreThreshold"]
- # Timeout set to 10mins, configure as necessary
- doc_processor_timeout = DocProcessorConstants.REQUEST_TIMEOUT
-
- pii_redact_helper = PIIRedactHelper(self)
-
- self.stream_log("Reading file...")
- text = self._extract_text(input_file)
- self.stream_log(f"Text length: {len(text)}")
-
- # Update GUI
- input_text_for_log = text
- if len(input_text_for_log) > 500:
- input_text_for_log = input_text_for_log[:500] + "...(truncated)"
- input_log = (
- f"Items to redact: `{redact_items}`\n\nInput text:\n\n"
- f"```text\n{input_text_for_log}\n```\n\n"
- )
- output_log = ""
- self.stream_update(input_log, state=LogState.INPUT_UPDATE)
- self.stream_update(output_log, state=LogState.OUTPUT_UPDATE)
-
- entities_to_redact = None
- if use_cache:
- self.stream_log("Trying to retrieve from cache")
- cache_key = pii_redact_helper.get_cache_key(
- workflow_id=self.workflow_id, settings=settings, input_text=text
- )
- cache = ToolCache(
- tool=self,
- platform_host=self.get_env_or_die(ToolEnv.PLATFORM_HOST),
- platform_port=int(self.get_env_or_die(ToolEnv.PLATFORM_PORT)),
- )
- items_to_redact_str = cache.get(cache_key)
- if items_to_redact_str:
- entities_to_redact = json.loads(items_to_redact_str)
- self.stream_cost(cost=0.0, cost_units="cache")
-
- if not entities_to_redact:
- entities_to_redact = pii_redact_helper.detect_pii_entities(
- text, processor, redact_items, score_threshold
- )
- cost_units = 0.0
- cost_type = "free"
- if processor == Processor.AMAZON_COMPREHEND.value:
- cost_type = "amazon_comprehend_units"
- cost_units = len(text) / 100.0
- if cost_units < 3.0:
- cost_units = 3.0
- self.stream_cost(cost_units, cost_type)
-
- if use_cache and cache:
- cache.set(cache_key, json.dumps(entities_to_redact))
-
- self.stream_log(f"Entities to redact: {entities_to_redact}")
-
- output_log = (
- f"### Entities to redact\n\n```text\n{entities_to_redact}\n```\n\n"
- )
- self.stream_update(output_log, state=LogState.OUTPUT_UPDATE)
-
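-        # Map each detected entity to an equal-length mask of "x" characters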
- find_and_replace = {}
- for item in entities_to_redact:
- find_and_replace[item] = PIIRedactHelper.create_redaction_overlay(
- item
- )
-
- # Upload the file to the document processor
- document_processor_url = self.get_env_or_die(EnvKey.DOC_PROCESSOR_URL)
- document_processor_api_key = self.get_env_or_die(
- EnvKey.DOC_PROCESSOR_API_KEY
- )
- self.stream_log(f"Document processor URL: {document_processor_url}")
- self.stream_log("Uploading file to document processor")
- with open(input_file, "rb") as file:
- files = {"file": file}
- platform_helper = PlatformHelper(
- tool=self,
- platform_host=self.get_env_or_die(ToolEnv.PLATFORM_HOST),
- platform_port=int(self.get_env_or_die(ToolEnv.PLATFORM_PORT)),
- )
- platform_details = platform_helper.get_platform_details()
- if not platform_details:
- # Errors are logged by the SDK itself
- exit(1)
- account_id = platform_details.get("organization_id")
-
- file_name = nanoid.generate()
- url = (
- f"{document_processor_url}/upload?account_id={account_id}"
- f"&file_name={file_name}"
- )
- response = requests.post(
- url,
- files=files,
- headers={"Authorization": f"{document_processor_api_key}"},
- timeout=doc_processor_timeout,
- )
- if response.status_code != 200:
- self.stream_error_and_exit(
- "Error uploading file to document "
- f"processor: {response.status_code}"
- )
- self.stream_log("File uploaded to document processor")
-
- # Now perform the find and replace
- self.stream_log("Performing find and replace")
- url = (
- f"{document_processor_url}/find_and_replace?account_id={account_id}"
- f"&file_name={file_name}&output_format=pdf"
- )
- # The returned value from the document processor is a file
- self.stream_log(f"Find and replace: {find_and_replace}")
- response = requests.post(
- url,
- headers={
- "Authorization": f"{document_processor_api_key}",
- "Content-Type": "application/json",
- },
- json=find_and_replace,
- timeout=doc_processor_timeout,
- )
- if response.status_code != 200:
- self.stream_error_and_exit(
- f"Error performing find and replace: {response.status_code}"
- )
-
- redacted_text = response.content
-        # Write the redacted document returned by the processor to the output file
- try:
- self.stream_log("Writing tool output")
- source_name = self.get_exec_metadata.get(MetadataKey.SOURCE_NAME)
- output_path = Path(output_dir) / source_name
- with open(output_path, "wb") as f:
- f.write(redacted_text)
- except Exception as e:
- self.stream_error_and_exit(f"Error creating output file: {e}")
-
- self.write_tool_result(data="PII redacted successfully")
-
- def _extract_text(self, file: str) -> str:
- """Extract text from file.
-
- Args:
- file (str): The path to the input file
-
- Returns:
- str: page content
- """
- try:
- with open(file, mode="rb") as input_file_obj:
- bytes_io = io.BytesIO(input_file_obj.read())
- elements = partition(file=bytes_io)
- except Exception as e:
- self.stream_error_and_exit(f"Error partitioning file: {e}")
- text = "\n\n".join([str(el) for el in elements])
- return text
-
-
-if __name__ == "__main__":
- args = sys.argv[1:]
- tool = UnstractDocPIIRedactor.from_tool_args(args=args)
- ToolEntrypoint.launch(tool=tool, args=args)
diff --git a/tools/indexer/.dockerignore b/tools/indexer/.dockerignore
deleted file mode 100644
index c26352afc..000000000
--- a/tools/indexer/.dockerignore
+++ /dev/null
@@ -1,3 +0,0 @@
-venv/
-.venv/
-.env
diff --git a/tools/indexer/Dockerfile b/tools/indexer/Dockerfile
deleted file mode 100644
index 669050cac..000000000
--- a/tools/indexer/Dockerfile
+++ /dev/null
@@ -1,25 +0,0 @@
-FROM python:3.9-slim
-
-LABEL maintainer="Zipstack Inc."
-ENV UNSTRACT_ENTRYPOINT "python /app/src/main.py"
-
-# Install dependencies for SDK
-RUN apt-get update \
- && apt-get install -y --no-install-recommends \
- ffmpeg libsm6 libxext6 libmagic-dev poppler-utils \
- tesseract-ocr libreoffice pandoc \
- freetds-dev freetds-bin \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
-
-RUN pip install --no-cache-dir -U pip
-# Set the working directory in the container
-WORKDIR /app
-COPY requirements.txt /app/
-RUN pip install --no-cache-dir -r requirements.txt
-# Copy the contents of your project directory into the container at /app
-COPY src /app/src/
-WORKDIR /app/src
-
-
-ENTRYPOINT ["python", "main.py"]
diff --git a/tools/indexer/README.md b/tools/indexer/README.md
deleted file mode 100644
index 7fcaa3ef1..000000000
--- a/tools/indexer/README.md
+++ /dev/null
@@ -1,127 +0,0 @@
-# Document Indexer
-
-The document indexer tool creates embeddings for documents in the platform. The embeddings are stored in a vector store.
-Both the embeddings generator and the vector store are configurable.
-
-## Required environment variables
-
-| Variable | Description |
-| -------------------------- | --------------------------------------------------------------------- |
-| `PLATFORM_SERVICE_HOST`    | The host on which the platform service is running                       |
-| `PLATFORM_SERVICE_PORT`    | The port on which the service is listening                               |
-| `PLATFORM_SERVICE_API_KEY` | The API key for the platform                                             |
-| `TOOL_DATA_DIR`            | The directory in the filesystem which holds contents for tool execution  |
-| `X2TEXT_HOST` | The host where the x2text service is running |
-| `X2TEXT_PORT` | The port where the x2text service is listening |
-
-## Testing the tool locally
-
-### Setting up a dev environment
-
-Set up a virtual environment and activate it
-
-```commandline
-python -m venv .venv
-source .venv/bin/activate
-```
-
-Install the dependencies for the tool
-
-```commandline
-pip install -r requirements.txt
-```
-
-To use the local development version of the [unstract-sdk](https://pypi.org/project/unstract-sdk/), install it from the local repository.
-Replace the path below with the path to your local repository
-
-```commandline
-pip install -e ~/path_to_repo/sdks/.
-```
-
-### Tool execution preparation
-
-Load the environment variables for the tool.
-Make a copy of the `sample.env` file and name it `.env`. Fill in the required values.
-They get loaded with [python-dotenv](https://pypi.org/project/python-dotenv/) through the SDK.
-
-Update the tool's `data_dir` marked by the `TOOL_DATA_DIR` env. This has to be done before each tool execution since the tool updates the `INFILE` and `METADATA.json`.
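-
-A minimal sketch of this preparation, with `TOOL_DATA_DIR=../data_dir` as set in `sample.env` (adjust the paths to your setup):
-
-```commandline
-cp sample.env .env
-# Restore the inputs before each run since the tool overwrites them
-cp /path/to/document.pdf ../data_dir/INFILE
-cp /path/to/metadata_backup.json ../data_dir/METADATA.json
-```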
-
-### Run SPEC command
-
-Represents the JSON schema for the runtime-configurable `settings` of a tool
-
-```commandline
-python main.py --command SPEC
-```
-
-### Run PROPERTIES command
-
-Describes some metadata for the tool such as its `version`, `description`, `inputs` and `outputs`
-
-```commandline
-python main.py --command PROPERTIES
-```
-
-### Run ICON command
-
-Returns the SVG icon for the tool, used by Unstract's frontend
-
-```commandline
-python main.py --command ICON
-```
-
-### Run VARIABLES command
-
-Represents the runtime variables or envs that will be used by the tool
-
-```commandline
-python main.py --command VARIABLES
-```
-
-### Run RUN command
-
-The schema of the JSON required for settings can be found by running the [SPEC](#run-spec-command) command. Alternatively, if you have access to the code base, it is located in the `config` folder as `spec.json`.
-
-```commandline
-python main.py \
- --command RUN \
- --settings '{
- "chunkSize": 1024,
- "chunkOverlap": 64,
- "embeddingSuffix": "azureopenai",
- "reIndex": true
- "embeddingAdapterId": "53eb2bac-c29e-4ce2-89b4-5a1c22a523c7",
- "vectorDbAdapterId": "66aa4d9c-1703-4126-b0da-d0c78b56ff9c"
- }' \
- --log-level DEBUG
-
-```
-
-## Testing the tool from its docker image
-
-Build the tool docker image from the folder containing the `Dockerfile` with
-
-```commandline
-docker build -t unstract/tool-indexer:0.0.1 .
-```
-
-Make sure the directory pointed to by `TOOL_DATA_DIR` has the required information for the tool to run and that
-necessary services like the `platform-service` are up.
-To test the tool from its docker image, run the following command
-
-```commandline
-docker run -it \
- --network unstract-network \
- --env-file .env \
- -v "$(pwd)"/data_dir:/app/data_dir \
- unstract/tool-indexer:0.0.1 \
- --command RUN \
- --settings '{
- "chunkSize": 1024,
- "chunkOverlap": 64,
- "embeddingSuffix": "azureopenai",
- "reIndex": true
- }' \
- --log-level DEBUG
-
-```
diff --git a/tools/indexer/__init__.py b/tools/indexer/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tools/indexer/requirements.txt b/tools/indexer/requirements.txt
deleted file mode 100644
index 9faca9b40..000000000
--- a/tools/indexer/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-# Add your dependencies here
-
-# Required for all unstract tools
-unstract-sdk~=0.13.0
diff --git a/tools/indexer/sample.env b/tools/indexer/sample.env
deleted file mode 100644
index 677efa2dc..000000000
--- a/tools/indexer/sample.env
+++ /dev/null
@@ -1,7 +0,0 @@
-PLATFORM_SERVICE_HOST=http://unstract-platform-service
-PLATFORM_SERVICE_PORT=3001
-PLATFORM_SERVICE_API_KEY=
-TOOL_DATA_DIR=../data_dir
-
-X2TEXT_HOST=http://unstract-x2text-service
-X2TEXT_PORT=3004
diff --git a/tools/indexer/src/config/icon.svg b/tools/indexer/src/config/icon.svg
deleted file mode 100644
index 2054618a2..000000000
--- a/tools/indexer/src/config/icon.svg
+++ /dev/null
@@ -1,49 +0,0 @@
-
-
diff --git a/tools/indexer/src/config/properties.json b/tools/indexer/src/config/properties.json
deleted file mode 100644
index d2376b378..000000000
--- a/tools/indexer/src/config/properties.json
+++ /dev/null
@@ -1,79 +0,0 @@
-{
- "schemaVersion": "0.0.1",
- "displayName": "Document Indexer",
- "functionName": "document_indexer",
- "toolVersion": "0.0.2",
- "description": "Used to index documents in the specified vector DB / embedding.",
- "input": {
- "description": "Document that needs to be indexed"
- },
- "output": {
- "description": "No files written"
- },
- "result": {
- "type": "TXT",
- "description": "Result of the indexing process"
- },
- "adapter": {
- "languageModels": [
- {
- "isEnabled": false
- }
- ],
- "embeddingServices": [
- {
- "adapterId": "embeddingAdapterId",
- "isEnabled": true,
- "title": "Embedding to use",
- "isRequired": true,
- "description": "Embedding service to use for indexing"
- }
- ],
- "vectorStores": [
- {
- "adapterId": "vectorDbAdapterId",
- "isEnabled": true,
- "title": "Vector DB to use",
- "isRequired": true,
- "description": "Vector DB to use for indexing"
- }
- ],
- "textExtractors": [
- {
- "adapterId": "x2TextAdapterId",
- "isEnabled": true,
- "title": "Text extractor to use",
- "isRequired": true,
- "description": "Text extractor to use for indexing"
- }
- ]
- },
- "ioCompatibility": {
- "api": {
- "sourceSupport": true,
- "destinationSupport": false,
- "additionalArgs": {
- "sync": true
- }
- },
- "file": {
- "sourceSupport": true,
- "destinationSupport": false,
- "additionalArgs": {}
- },
- "db": {
- "destinationSupport": false,
- "additionalArgs": {}
- }
- },
- "restrictions": {
- "maxFileSize": "50 MB",
- "allowedFileTypes": [
- "txt",
- "pdf",
- "doc",
- "docx",
- "odt"
- ]
- }
-}
diff --git a/tools/indexer/src/config/runtime_variables.json b/tools/indexer/src/config/runtime_variables.json
deleted file mode 100644
index 5bf0fc56b..000000000
--- a/tools/indexer/src/config/runtime_variables.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
- "title": "Runtime Variables",
- "description": "Runtime Variables for Document Indexer",
- "type": "object",
- "required": [],
- "properties": {}
-}
diff --git a/tools/indexer/src/config/spec.json b/tools/indexer/src/config/spec.json
deleted file mode 100644
index 71e8ebff2..000000000
--- a/tools/indexer/src/config/spec.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "title": "Document Indexer",
- "description": "Index documents based on their semantic content",
- "type": "object",
- "required": [],
- "properties": {
- "chunkSize": {
- "type": "number",
- "title": "Chunk size",
- "default": 1024,
- "description": "Size of chunks to be considered for the embedding. Smaller chunk size means embeddings are more precise, while larger chunk size means that the embeddings may be more general, but can miss fine-grained details."
- },
- "chunkOverlap": {
- "type": "number",
- "title": "Chunk overlap",
- "default": 128,
- "description": "Overlap to be considered for the embedding."
- },
- "reIndex": {
- "type": "boolean",
- "title": "Re-Index",
- "default": false,
- "description": "Re-index files every time (useful during development)"
- }
- }
-}
diff --git a/tools/indexer/src/constants.py b/tools/indexer/src/constants.py
deleted file mode 100644
index 52cb7ab8d..000000000
--- a/tools/indexer/src/constants.py
+++ /dev/null
@@ -1,5 +0,0 @@
-class SettingsKeys:
- CHUNK_SIZE = "chunkSize"
- CHUNK_OVERLAP = "chunkSize"
- REINDEX = "reIndex"
- EMBEDDING_SUFFIX = "embeddingSuffix"
diff --git a/tools/indexer/src/main.py b/tools/indexer/src/main.py
deleted file mode 100644
index 573879319..000000000
--- a/tools/indexer/src/main.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import sys
-from typing import Any
-
-from unstract.sdk.constants import LogState, MetadataKey, ToolSettingsKey
-from unstract.sdk.index import ToolIndex
-from unstract.sdk.tool.base import BaseTool
-from unstract.sdk.tool.entrypoint import ToolEntrypoint
-
-from constants import SettingsKeys
-
-
-class DocumentIndexer(BaseTool):
- def run(
- self,
- settings: dict[str, Any],
- input_file: str,
- output_dir: str,
- ) -> None:
- # Update GUI
- input_log = (
- "### Indexing file\n"
- "```text\n"
- f"- Chunk Size: {settings[SettingsKeys.CHUNK_SIZE]}\n"
- f"- Chunk Overlap: {settings[SettingsKeys.CHUNK_OVERLAP]}\n"
- f"- Re-index: {settings[SettingsKeys.REINDEX]}\n"
- "```\n\n"
- )
- output_log = ""
- self.stream_update(input_log, state=LogState.INPUT_UPDATE)
- self.stream_update(output_log, state=LogState.OUTPUT_UPDATE)
-
- file_hash = self.get_exec_metadata.get(MetadataKey.SOURCE_HASH)
- if not file_hash:
- raise RuntimeError("Source hash missing in metadata")
- tool_index = ToolIndex(tool=self)
- self.stream_log("Indexing document...")
- try:
- index_key = tool_index.index_file(
- tool_id=self.workflow_id,
- embedding_type=settings[ToolSettingsKey.EMBEDDING_ADAPTER_ID],
- vector_db=settings[ToolSettingsKey.VECTOR_DB_ADAPTER_ID],
- x2text_adapter=settings[ToolSettingsKey.X2TEXT_ADAPTER_ID],
- file_path=input_file,
- file_hash=file_hash,
- chunk_size=settings[SettingsKeys.CHUNK_SIZE],
- chunk_overlap=settings[SettingsKeys.CHUNK_OVERLAP],
- reindex=settings[SettingsKeys.REINDEX],
- )
- except Exception as e:
- self.stream_error_and_exit(f"Error fetching data and indexing: {e}")
- # Update GUI
- input_log = (
- "### Indexing file\n"
- "```text\n"
- f"- Chunk Size: {settings[SettingsKeys.CHUNK_SIZE]}\n"
- f"- Chunk Overlap: {settings[SettingsKeys.CHUNK_OVERLAP]}\n"
- f"- Re-index: {settings[SettingsKeys.REINDEX]}\n"
- "```\n\n"
- )
- output_log = f"### Index results\n File indexed against key {index_key}"
- self.stream_update(input_log, state=LogState.INPUT_UPDATE)
- self.stream_update(output_log, state=LogState.OUTPUT_UPDATE)
-
- self.write_tool_result(data=f"File indexed successfully at {index_key}")
-
-
-if __name__ == "__main__":
- args = sys.argv[1:]
- tool = DocumentIndexer.from_tool_args(args=args)
- ToolEntrypoint.launch(tool=tool, args=args)
diff --git a/tools/ocr/.dockerignore b/tools/ocr/.dockerignore
deleted file mode 100644
index c26352afc..000000000
--- a/tools/ocr/.dockerignore
+++ /dev/null
@@ -1,3 +0,0 @@
-venv/
-.venv/
-.env
diff --git a/tools/ocr/Dockerfile b/tools/ocr/Dockerfile
deleted file mode 100644
index 4c95f3d4d..000000000
--- a/tools/ocr/Dockerfile
+++ /dev/null
@@ -1,21 +0,0 @@
-FROM python:3.9-slim
-
-LABEL maintainer="Zipstack Inc."
-ENV UNSTRACT_ENTRYPOINT "python /app/src/main.py"
-
-# Install dependencies for unstructured library's partition
-RUN apt-get update && apt-get --no-install-recommends -y install libmagic-dev poppler-utils tesseract-ocr libreoffice pandoc \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
-
-RUN pip install --no-cache-dir -U pip
-# Set the working directory in the container
-WORKDIR /app
-COPY requirements.txt /app/
-RUN pip install --no-cache-dir -r requirements.txt
-# Copy the contents of your project directory into the container at /app
-COPY src /app/src/
-WORKDIR /app/src
-
-
-ENTRYPOINT ["python", "main.py"]
diff --git a/tools/ocr/README.md b/tools/ocr/README.md
deleted file mode 100644
index 5b0d49411..000000000
--- a/tools/ocr/README.md
+++ /dev/null
@@ -1,112 +0,0 @@
-## Document OCR processor
-
-This tool extracts text from images by performing OCR operations.
-
-### Required environment variables
-
-| Variable | Description |
-| -------------------------- | --------------------------------------------------------------------- |
-| `PLATFORM_SERVICE_HOST`    | The host on which the platform service is running                       |
-| `PLATFORM_SERVICE_PORT`    | The port on which the service is listening                               |
-| `PLATFORM_SERVICE_API_KEY` | The API key for the platform                                             |
-| `TOOL_DATA_DIR`            | The directory in the filesystem which holds contents for tool execution  |
-
-### Testing the tool locally
-
-#### Setting up a dev environment
-Set up a virtual environment and activate it
-
-```commandline
-python -m venv .venv
-source .venv/bin/activate
-```
-
-Install the dependencies for the tool
-
-```commandline
-pip install -r requirements.txt
-```
-
-To use the local development version of the [unstract-sdk](https://pypi.org/project/unstract-sdk/), install it from the local repository.
-Replace the path below with the path to your local repository
-
-```commandline
-pip install -e ~/path_to_repo/sdks/.
-```
-
-#### Tool execution preparation
-
-Load the environment variables for the tool.
-Make a copy of the `sample.env` file and name it `.env`. Fill in the required values.
-They get loaded with [python-dotenv](https://pypi.org/project/python-dotenv/) through the SDK.
-
-Update the tool's `data_dir` marked by the `TOOL_DATA_DIR` env. This has to be done before each tool execution since the tool updates the `INFILE` and `METADATA.json`.
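-
-A minimal sketch of this preparation, with `TOOL_DATA_DIR=../data_dir` as set in `sample.env` (adjust the paths to your setup):
-
-```commandline
-cp sample.env .env
-# Restore the inputs before each run since the tool overwrites them
-cp /path/to/scanned_image.png ../data_dir/INFILE
-cp /path/to/metadata_backup.json ../data_dir/METADATA.json
-```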
-
-#### Run SPEC command
-
-Represents the JSON schema for the runtime-configurable `settings` of a tool
-```commandline
-python main.py --command SPEC
-```
-
-#### Run PROPERTIES command
-
-Describes some metadata for the tool such as its `version`, `description`, `inputs` and `outputs`
-```commandline
-python main.py --command PROPERTIES
-```
-
-#### Run ICON command
-
-Returns the SVG icon for the tool, used by Unstract's frontend
-```commandline
-python main.py --command ICON
-```
-
-#### Run VARIABLES command
-
-Represents the runtime variables or envs that will be used by the tool
-```commandline
-python main.py --command VARIABLES
-```
-
-#### Run RUN command
-
-The schema of the JSON required for settings can be found by running the [SPEC](#run-spec-command) command. Alternatively, if you have access to the code base, it is located in the `config` folder as `spec.json`.
-
-
-```commandline
-python main.py \
- --command RUN \
- --settings '{
- "ocrAdapterId": ""
- }' \
- --workflow-id '00000000-0000-0000-0000-000000000000' \
- --log-level DEBUG
-
-```
-### Testing the tool from its docker image
-
-Build the tool docker image from the folder containing the `Dockerfile` with
-```commandline
-docker build -t unstract/tool-ocr:0.0.1 .
-```
-
-Make sure the directory pointed to by `TOOL_DATA_DIR` has the required information for the tool to run and that
-necessary services like the `unstract-platform-service` are up.
-To test the tool from its docker image, run the following command
-
-```commandline
-docker run -it \
- --network unstract-network \
- --env-file .env \
- -v "$(pwd)"/data_dir:/app/data_dir \
- unstract/tool-ocr:0.0.1 \
- --command RUN \
- --settings '{
- "ocrAdapterId": ""
- }' \
- --workflow-id '00000000-0000-0000-0000-000000000000' \
- --log-level DEBUG
-
-```
diff --git a/tools/ocr/requirements.txt b/tools/ocr/requirements.txt
deleted file mode 100644
index 1d381f5ee..000000000
--- a/tools/ocr/requirements.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# Add your dependencies here
-pypdf~=3.11.1
-argparse==1.4.0
-filetype==1.2.0
-requests==2.31.0
-pillow~=10.2.0
-
-# Required for all unstract tools
-unstract-sdk~=0.12.1
diff --git a/tools/ocr/sample.env b/tools/ocr/sample.env
deleted file mode 100644
index a0caeece0..000000000
--- a/tools/ocr/sample.env
+++ /dev/null
@@ -1,4 +0,0 @@
-PLATFORM_SERVICE_HOST=http://unstract-platform-service
-PLATFORM_SERVICE_PORT=3001
-PLATFORM_SERVICE_API_KEY=
-TOOL_DATA_DIR=../data_dir
diff --git a/tools/ocr/src/config/properties.json b/tools/ocr/src/config/properties.json
deleted file mode 100644
index 4c8c7554e..000000000
--- a/tools/ocr/src/config/properties.json
+++ /dev/null
@@ -1,59 +0,0 @@
-{
- "schemaVersion": "0.0.1",
- "displayName": "OCR",
- "functionName": "ocr",
- "toolVersion": "0.0.1",
- "description": "This is tool which can be used to extract text from images. Perform OCR operations on images.",
- "input": {
- "description": "The file on which OCR operation is to be performed"
- },
- "output": {
- "description": "Places the file into a folder (bin) that its OCR operation performed into."
- },
- "result": {
- "type": "JSON",
- "description": "JSON response containing the bin to which the file was classified into",
- "schema": {}
- },
- "adapter": {
- "ocrs": [
- {
- "isEnabled": true,
- "adapterId": "ocrAdapterId",
- "title": "OCR to use",
- "isRequired": true,
- "description": "OCR used to extract text from images"
- }
- ]
- },
- "ioCompatibility": {
- "api": {
- "sourceSupport": true,
- "destinationSupport": true,
- "additionalArgs": {
- "sync": true
- }
- },
- "file": {
- "sourceSupport": true,
- "destinationSupport": true,
- "additionalArgs": {}
- },
- "db": {
- "destinationSupport": true,
- "additionalArgs": {}
- }
- },
- "restrictions": {
- "maxFileSize": "10MB",
- "allowedFileTypes": [
- "jpeg",
- "png",
- "tiff",
- "bmp",
- "gif",
- "webp",
- "pdf"
- ]
- }
-}
diff --git a/tools/ocr/src/config/spec.json b/tools/ocr/src/config/spec.json
deleted file mode 100644
index 918d03237..000000000
--- a/tools/ocr/src/config/spec.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
- "title": "OCR Tool",
- "description": "Setup the OCR tool",
- "type": "object",
- "required": [],
- "properties": {
- "useCache": {
- "type": "boolean",
- "title": "Cache and use cached results",
- "default": true,
- "description": "Use cached results"
- }
- }
-}
\ No newline at end of file
diff --git a/tools/ocr/src/constants.py b/tools/ocr/src/constants.py
deleted file mode 100644
index 87e75f366..000000000
--- a/tools/ocr/src/constants.py
+++ /dev/null
@@ -1,18 +0,0 @@
-class FileType:
- TEXT_PLAIN = "text/plain"
- IMAGE_JPEG = "image/jpeg"
- IMAGE_PNG = "image/png"
- IMAGE_TIFF = "image/tiff"
- IMAGE_BMP = "image/bmp"
- IMAGE_GIF = "image/gif"
- IMAGE_WEBP = "image/webp"
- APPLICATION_PDF = "application/pdf"
- ALLOWED_TYPES = [
- IMAGE_JPEG,
- IMAGE_PNG,
- IMAGE_TIFF,
- IMAGE_BMP,
- IMAGE_GIF,
- IMAGE_WEBP,
- APPLICATION_PDF,
- ]
diff --git a/tools/ocr/src/enums.py b/tools/ocr/src/enums.py
deleted file mode 100644
index da748ef17..000000000
--- a/tools/ocr/src/enums.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from enum import Enum
-
-
-class CostUnits(Enum):
- CACHE = "Cache"
- GOOGLE_PAGES = "google/pages"
-
diff --git a/tools/ocr/src/helper.py b/tools/ocr/src/helper.py
deleted file mode 100644
index bb8a5b3b4..000000000
--- a/tools/ocr/src/helper.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import io
-from typing import Optional
-
-import pypdf
-from constants import FileType
-from enums import CostUnits
-from unstract.sdk.cache import ToolCache
-from unstract.sdk.constants import LogLevel, ToolEnv
-from unstract.sdk.tool.base import BaseTool
-
-
-class OcrHelper:
- """Helper functions for Ocr tool."""
-
- def __init__(self, tool: BaseTool, use_cache: bool = False) -> None:
-        self.cache: Optional[ToolCache] = None
- self.use_cache = False
- self.tool = tool
-
- if use_cache:
- platform_host = self.tool.get_env_or_die(ToolEnv.PLATFORM_HOST)
- platform_port = self.tool.get_env_or_die(ToolEnv.PLATFORM_PORT)
- self.tool.stream_log("Check result in cache")
-
- self.cache = ToolCache(
- tool=self.tool,
- platform_host=platform_host,
- platform_port=int(platform_port),
- )
- self.use_cache = True
-
- def stream_error_and_exit(self, message: str) -> None:
- """Stream error log and exit.
-
- Args:
- message (str): Error message
- """
- self.tool.stream_log(message, level=LogLevel.ERROR)
- exit(1)
-
- def get_page_count(self, file: bytes, file_type_mime: str) -> int:
- """Count pages for billing purposes.
-
- Args:
-            file (bytes): Contents of the input file
- file_type_mime (str): The MIME type of the file
-
- Returns:
- int: page count
- """
- page_count = 1
- # Count pages in case of PDF for billing purposes
- if file_type_mime == FileType.APPLICATION_PDF:
-            # Read the PDF from the in-memory bytes
-            pdf_reader = pypdf.PdfReader(io.BytesIO(file))
-            pdf_page_count = len(pdf_reader.pages)
- self.tool.stream_log(f"PDF page count: {pdf_page_count}")
- page_count = pdf_page_count
- return page_count
-
- def calculate_cost(
- self,
- file: bytes,
- file_type_mime: str,
- cached_result: bool = False,
- ) -> None:
- """Get cost and stream cost.
-
- Args:
- file (bytes): _description_
- file_type_mime (str): _description_
- """
- if cached_result:
- self.tool.stream_cost(
- cost=0.0,
- cost_units=CostUnits.CACHE.value,
- )
- else:
- page_count = self.get_page_count(
- file=file, file_type_mime=file_type_mime
- )
- self.tool.stream_cost(
- cost=float(page_count),
- cost_units=CostUnits.GOOGLE_PAGES.value,
- )
-
- def set_result_in_cache(
- self,
- key: str,
- result: str,
- cached_result: bool = False,
- ) -> None:
- """Get result from cache by the help of unstract Cache tool.
-
- Args:
- key (str): Cache key
-
- Required env variables:
- PLATFORM_HOST: Host of platform service
- PLATFORM_PORT: Port of platform service
- Returns:
- Optional[str]: result
- """
-
- if not self.use_cache:
- return None
- if not cached_result:
- self.cache.set(key, result)
-
- def stream_output_text_log(self, data: str) -> None:
- """Stream document text.
-
- Args:
- sql (str): _description_
- """
- data_text_for_log = "### OCR Output\n\n"
- if len(data) > 500:
- data_text_for_log = data[:500] + "...(truncated)"
- self.tool.stream_single_step_message(
- f"```json\n{data_text_for_log}\n```"
- )
-
- def time_taken(self, start_time: float, end_time: float) -> None:
- """Calculate Time difference.
-
- Args:
- start_time (float): _description_
- end_time (float): _description_
- """
- time_taken = end_time - start_time
- self.tool.stream_log(f"Time taken: {time_taken}")
diff --git a/tools/ocr/src/main.py b/tools/ocr/src/main.py
deleted file mode 100644
index 2f4fef7a2..000000000
--- a/tools/ocr/src/main.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import sys
-import time
-from pathlib import Path
-from typing import Any
-
-from helper import OcrHelper
-from unstract.sdk.cache import ToolCache
-from unstract.sdk.constants import LogLevel, LogState, MetadataKey, ToolEnv
-from unstract.sdk.ocr import OCR
-from unstract.sdk.tool.base import BaseTool
-from unstract.sdk.tool.entrypoint import ToolEntrypoint
-from unstract.sdk.tool.validator import ToolValidator
-from unstract.sdk.utils.tool_utils import ToolUtils
-
-
-class UnstractOCR(BaseTool):
- def __init__(self, log_level: str = LogLevel.INFO) -> None:
- super().__init__(log_level)
- self.helper = OcrHelper(tool=self)
- self.validator = ToolValidator(tool=self)
-
- def run(
- self,
- settings: dict[str, Any],
- input_file: str,
- output_dir: str,
- ) -> None:
- # Initializing Function Arguments
- use_cache = settings["useCache"]
- ocr_adapter_id = settings["ocrAdapterId"]
-
- # Set adapter
- tool_ocr = OCR(tool=self)
- ocr_adapter = tool_ocr.get_ocr(adapter_instance_id=ocr_adapter_id)
-
- # Read image file into memory
- with open(input_file, "rb") as image_file:
- image_content = image_file.read()
-
- input_file_type_mime = ToolUtils.get_file_mime_type(Path(input_file))
-
- # Construct an image object
- output_log = ""
- input_log = f"Input file: `{input_file}`\n\n"
- self.stream_update(input_log, state=LogState.INPUT_UPDATE)
-
- # Check cache
- content_hash = ToolUtils.get_hash_from_file(file_path=input_file)
- cache_key = (
- f"cache:{self.workflow_id}:{ocr_adapter.get_name()}:{content_hash}"
- )
-
- result_text = None
- if use_cache:
- cache = ToolCache(
- tool=self,
- platform_host=self.get_env_or_die(ToolEnv.PLATFORM_HOST),
- platform_port=int(self.get_env_or_die(ToolEnv.PLATFORM_PORT)),
- )
- cached_response = cache.get(cache_key)
- if cached_response is not None:
- result_text = cached_response
- self.stream_cost(cost=0.0, cost_units="cache")
- cached_result = True if result_text else False
- # Process if not cached
- if not result_text:
- result_text = ""
- t1 = time.time()
- result_text = ocr_adapter.process(input_file_path=input_file)
- t2 = time.time()
- self.helper.time_taken(start_time=t1, end_time=t2)
-
- # Pre-process and cache result
- self.helper.set_result_in_cache(
- key=cache_key, result=result_text, cached_result=cached_result
- )
- self.helper.calculate_cost(
- file=image_content,
- file_type_mime=input_file_type_mime,
- cached_result=cached_result,
- )
- self.helper.stream_output_text_log(result_text)
-
- # Write result to file; reuse the (possibly cached) text instead of
- # invoking the OCR adapter a second time
- self.stream_log("Writing tool output")
- source_name = self.get_exec_metadata.get(MetadataKey.SOURCE_NAME)
- output_path = Path(output_dir) / f"{Path(source_name).stem}.txt"
- output_path.write_text(result_text, encoding="utf-8")
-
- # Log output
- if len(result_text) > 1000:
- output_log = (
- f"```text\n{result_text[:1000]}... (truncated)\n```\n\n"
- )
- else:
- output_log = f"```text\n{result_text}\n```\n\n"
- self.stream_update(output_log, state=LogState.OUTPUT_UPDATE)
- self.stream_single_step_message(output_log)
- self.write_tool_result(data=result_text)
-
-
-if __name__ == "__main__":
- args = sys.argv[1:]
- tool = UnstractOCR.from_tool_args(args=args)
- ToolEntrypoint.launch(tool=tool, args=args)
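The cache key built in `run()` ties a result to the workflow, the OCR adapter, and the exact bytes of the input, so any change to the file invalidates the entry. A minimal sketch of that derivation, using the same `ToolUtils` helpers imported above (`workflow_id` and `adapter_name` are placeholders for values the tool run provides):

```python
from pathlib import Path

from unstract.sdk.utils.tool_utils import ToolUtils


def build_cache_key(workflow_id: str, adapter_name: str, input_file: str) -> str:
    # Hash the file contents so a changed input never hits a stale entry.
    content_hash = ToolUtils.get_hash_from_file(file_path=input_file)
    return f"cache:{workflow_id}:{adapter_name}:{content_hash}"


def input_mime_type(input_file: str) -> str:
    # The helper uses the MIME type to decide the page-based OCR cost.
    return ToolUtils.get_file_mime_type(Path(input_file))
```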
diff --git a/tools/translate/.dockerignore b/tools/translate/.dockerignore
deleted file mode 100644
index c26352afc..000000000
--- a/tools/translate/.dockerignore
+++ /dev/null
@@ -1,3 +0,0 @@
-venv/
-.venv/
-.env
diff --git a/tools/translate/Dockerfile b/tools/translate/Dockerfile
deleted file mode 100644
index e0c4f5ac9..000000000
--- a/tools/translate/Dockerfile
+++ /dev/null
@@ -1,19 +0,0 @@
-FROM python:3.9-slim
-LABEL maintainer="Zipstack Inc."
-ENV UNSTRACT_ENTRYPOINT="python /app/src/main.py"
-
-# Install dependencies for unstructured library's partition
-RUN apt-get update && apt-get --no-install-recommends -y install libmagic-dev poppler-utils tesseract-ocr libreoffice pandoc \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
-
-RUN pip install --no-cache-dir -U pip
-# Set the working directory in the container
-WORKDIR /app
-COPY requirements.txt /app/
-RUN pip install --no-cache-dir -r requirements.txt
-# Copy the contents of your project directory into the container at /app
-COPY src /app/src/
-WORKDIR /app/src
-
-ENTRYPOINT ["python", "main.py"]
diff --git a/tools/translate/README.md b/tools/translate/README.md
deleted file mode 100644
index 15eabc94c..000000000
--- a/tools/translate/README.md
+++ /dev/null
@@ -1,132 +0,0 @@
-# Translate Tool
-
-The translate tool can process input text from either TXT or PDF files, seamlessly converting it into the desired target language.
-
-## Supported processors
-
-### Google Translate
-
-Follow [the setup](https://cloud.google.com/translate/docs/setup) to enable Google Cloud's translation service.
-
-### Required environment variables
-
-| Variable | Description |
-| -------------------------- | --------------------------------------------------------------------- |
-| `PLATFORM_SERVICE_HOST`    | The host on which the platform service is running                      |
-| `PLATFORM_SERVICE_PORT`    | The port on which the service listens                                  |
-| `PLATFORM_SERVICE_API_KEY` | The API key for the platform |
-| `TOOL_DATA_DIR` | The directory in the filesystem which has contents for tool execution |
-
-Set the following envs based on the processor you wish to use
-
-| Processor | Variable | Description |
-| --- | --- | --- |
-| Google Translate | `GOOGLE_SERVICE_ACCOUNT` | Service account JSON for the Google Translate service|
-
-## Testing the tool locally
-
-### Setting up a dev environment
-
-Set up a virtual environment and activate it
-
-```commandline
-python -m venv .venv
-source .venv/bin/activate
-```
-
-Install the dependencies for the tool
-
-```commandline
-pip install -r requirements.txt
-```
-
-To use the local development version of the [unstract-sdk](https://pypi.org/project/unstract-sdk/), install it from the local repository.
-Replace the path below with the path to your local repository.
-
-```commandline
-pip install -e ~/path_to_repo/sdks/.
-```
-
-### Tool execution preparation
-
-Load the environment variables for the tool.
-Make a copy of the `sample.env` file and name it `.env`. Fill in the required values.
-They get loaded with [python-dotenv](https://pypi.org/project/python-dotenv/) through the SDK.
-
-Update the tool's `data_dir` pointed to by the `TOOL_DATA_DIR` env. This has to be done before each tool execution, since the tool updates the `INFILE` and `METADATA.json`.
-
-### Run SPEC command
-
-Represents the JSON schema for the runtime configurable `settings` of a tool
-
-```commandline
-python main.py --command SPEC
-```
-
-### Run PROPERTIES command
-
-Describes some metadata for the tool such as its `version`, `description`, `inputs` and `outputs`
-
-```commandline
-python main.py --command PROPERTIES
-```
-
-### Run ICON command
-
-Returns the SVG icon for the tool, used by Unstract's frontend
-
-```commandline
-python main.py --command ICON
-```
-
-### Run VARIABLES command
-
-Represents the runtime variables or envs that will be used by the tool
-
-```commandline
-python main.py --command VARIABLES
-```
-
-### Run RUN command
-
-The schema of the JSON required for settings can be found by running the [SPEC](#run-spec-command) command. Alternatively, if you have access to the code base, it is located in the `config` folder as `spec.json`.
-
-```commandline
-python main.py \
- --command RUN \
- --settings '{
- "processor": "Google Translate",
- "tragetLanguage": "Spanish",
- "useCache": true
- }' \
- --log-level DEBUG
-
-```
-
-## Testing the tool from its docker image
-
-Build the tool docker image from the folder containing the `Dockerfile` with
-
-```commandline
-docker build -t unstract/tool-translate:0.0.1 .
-```
-
-Make sure the directory pointed to by `TOOL_DATA_DIR` has the required information for the tool to run and
-that necessary services like the `platform-service` are up.
-To test the tool from its docker image, run the following command
-
-```commandline
-docker run -it \
- --network unstract-network \
- --env-file .env \
- -v "$(pwd)"/data_dir:/app/data_dir \
- unstract/tool-translate:0.0.1 \
- --command RUN \
- --settings '{
- "processor": "Google Translate",
- "tragetLanguage": "Spanish",
- "useCache": true
- }' \
- --log-level DEBUG
-
-```
diff --git a/tools/translate/__init__.py b/tools/translate/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tools/translate/requirements.txt b/tools/translate/requirements.txt
deleted file mode 100644
index e5009ca38..000000000
--- a/tools/translate/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# Add your dependencies here
-google-cloud-translate~=3.14.0
-unstructured[all-docs]==0.10.10
-
-# Required for all unstract tools
-unstract-sdk~=0.10.0
diff --git a/tools/translate/sample.env b/tools/translate/sample.env
deleted file mode 100644
index 4c99059eb..000000000
--- a/tools/translate/sample.env
+++ /dev/null
@@ -1,4 +0,0 @@
-PLATFORM_SERVICE_HOST=
-PLATFORM_SERVICE_PORT=
-PLATFORM_SERVICE_API_KEY=
-GOOGLE_SERVICE_ACCOUNT=
diff --git a/tools/translate/src/config/icon.svg b/tools/translate/src/config/icon.svg
deleted file mode 100644
index 3e85c9c86..000000000
--- a/tools/translate/src/config/icon.svg
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
diff --git a/tools/translate/src/config/properties.json b/tools/translate/src/config/properties.json
deleted file mode 100644
index f82839d38..000000000
--- a/tools/translate/src/config/properties.json
+++ /dev/null
@@ -1,41 +0,0 @@
-{
- "schemaVersion": "0.0.1",
- "displayName": "Translate",
- "functionName": "translate",
- "toolVersion": "0.0.1",
- "description": "This is a tool which can translate the given text into a target language.",
- "input": {
- "description": "File to be translated"
- },
- "output": {
- "description": "Creates a file with the translated text"
- },
- "result": {
- "type": "TXT",
- "description": "Response containing the translated text"
- },
- "ioCompatibility": {
- "api": {
- "sourceSupport": true,
- "destinationSupport": true,
- "additionalArgs": {
- "sync": true
- }
- },
- "file": {
- "sourceSupport": true,
- "destinationSupport": true,
- "additionalArgs": {}
- },
- "db": {
- "destinationSupport": false,
- "additionalArgs": {}
- }
- },
- "restrictions": {
- "maxFileSize": "10MB",
- "allowedFileTypes": [
- "*"
- ]
- }
-}
diff --git a/tools/translate/src/config/runtime_variables.json b/tools/translate/src/config/runtime_variables.json
deleted file mode 100644
index ad3aacce3..000000000
--- a/tools/translate/src/config/runtime_variables.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
- "title": "Runtime Variables",
- "description": "Runtime Variables for translate",
- "type": "object",
- "required": [
- "GOOGLE_SERVICE_ACCOUNT"
- ],
- "properties": {
- "GOOGLE_SERVICE_ACCOUNT": {
- "type": "string",
- "title": "Google Service Account",
- "description": "Google Service account"
- }
- }
-}
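Runtime variables declared here surface as environment variables during execution; `main.py` reads the single required entry with `get_env_or_die` and parses it as JSON. A sketch of that consumption, assuming unstract-sdk's `BaseTool` as used throughout this diff:

```python
import json
from typing import Any

from unstract.sdk.tool.base import BaseTool


def load_service_account(tool: BaseTool) -> dict[str, Any]:
    # Mirrors how the removed main.py consumes the declared runtime variable.
    return json.loads(tool.get_env_or_die("GOOGLE_SERVICE_ACCOUNT"))
```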
diff --git a/tools/translate/src/config/spec.json b/tools/translate/src/config/spec.json
deleted file mode 100644
index 1fc157758..000000000
--- a/tools/translate/src/config/spec.json
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "title": "Language translation tool settings",
- "description": "Setup the translation tool",
- "type": "object",
- "required": [],
- "properties": {
- "processor": {
- "type": "string",
- "title": "Processor to use",
- "default": "Google Translate",
- "enum": [
- "Google Translate"
- ],
- "description": "Service used to translate text"
- },
- "targetLanguage": {
- "type": "string",
- "title": "Target language",
- "default": "English",
- "enum": [
- "Albanian",
- "Arabic",
- "Azerbaijani",
- "Bengali",
- "Bulgarian",
- "Chinese",
- "Croatian",
- "Czech",
- "Danish",
- "Dutch",
- "English",
- "Estonian",
- "Finnish",
- "French",
- "Georgian",
- "German",
- "Greek",
- "Hebrew",
- "Hindi",
- "Hungarian",
- "Icelandic",
- "Indonesian",
- "Italian",
- "Japanese",
- "Kazakh",
- "Korean",
- "Latvian",
- "Lithuanian",
- "Malay",
- "Malayalam",
- "Mongolian",
- "Norwegian",
- "Northern_sami",
- "Pashto",
- "Persian",
- "Polish",
- "Portuguese",
- "Romanian",
- "Russian",
- "Serbian",
- "Slovak",
- "Slovenian",
- "Spanish",
- "Swedish",
- "Tagalog",
- "Tamil",
- "Thai",
- "Turkish",
- "Ukrainian",
- "Urdu",
- "Vietnamese"
- ],
- "description": "The language to translate to"
- },
- "useCache": {
- "type": "boolean",
- "title": "Cache and use cached results",
- "default": true,
- "description": "Use cached results"
- }
- }
-}
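Since the spec is a standard JSON Schema object, a `--settings` payload can be checked against it before a run. A sketch using the third-party `jsonschema` package — an assumption here, as the platform's own validation path is not part of this diff:

```python
import json

from jsonschema import validate  # assumption: pip install jsonschema

with open("config/spec.json") as f:
    spec = json.load(f)

settings = {
    "processor": "Google Translate",
    "targetLanguage": "Spanish",
    "useCache": True,
}
# Raises jsonschema.ValidationError if a value falls outside the enums above.
validate(instance=settings, schema=spec)
```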
diff --git a/tools/translate/src/constants.py b/tools/translate/src/constants.py
deleted file mode 100644
index d342a2ee8..000000000
--- a/tools/translate/src/constants.py
+++ /dev/null
@@ -1,72 +0,0 @@
-class EnvKey:
- GOOGLE_SERVICE_ACCOUNT = "GOOGLE_SERVICE_ACCOUNT"
-
-
-class GoogleTranslateKey:
- PROCESSOR = "Google Translate"
- CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
-
-
-class StaticData:
- LANGUAGE_CODES = {
- "english": "en",
- "chinese": "zh",
- "spanish": "es",
- "arabic": "ar",
- "portuguese": "pt",
- "russian": "ru",
- "japanese": "ja",
- "german": "de",
- "french": "fr",
- "korean": "ko",
- "turkish": "tr",
- "italian": "it",
- "polish": "pl",
- "dutch": "nl",
- "swedish": "sv",
- "indonesian": "id",
- "danish": "da",
- "norwegian": "no",
- "finnish": "fi",
- "greek": "el",
- "hebrew": "he",
- "hungarian": "hu",
- "czech": "cs",
- "thai": "th",
- "vietnamese": "vi",
- "hindi": "hi",
- "ukrainian": "uk",
- "malay": "ms",
- "malayalam": "ml",
- "romanian": "ro",
- "northern_sami": "se",
- "slovak": "sk",
- "bulgarian": "bg",
- "croatian": "hr",
- "serbian": "sr",
- "bengali": "bn",
- "tamil": "ta",
- "persian": "fa",
- "slovenian": "sl",
- "lithuanian": "lt",
- "latvian": "lv",
- "estonian": "et",
- "icelandic": "is",
- "georgian": "ka",
- "albanian": "sq",
- "tagalog": "tl",
- "mongolian": "mn",
- "azerbaijani": "az",
- "kazakh": "kk",
- }
-
- ALLOWED_PROCESSORS = [
- GoogleTranslateKey.PROCESSOR,
- "Microsoft Translate",
- "Amazon Translate",
- "IBM Watson Translate",
- "DeepL Translate",
- "Zipstack/Unstract translate",
- ]
-
- SUPPORTED_PROCESSORS = [GoogleTranslateKey.PROCESSOR]
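`LANGUAGE_CODES` is keyed by lowercased language names, which is why `main.py` lowercases `targetLanguage` before looking it up. The lookup-with-validation it performs reduces to this sketch, importing the module shown above:

```python
from constants import StaticData


def resolve_language_code(target_language: str) -> str:
    """Map a display name like "Spanish" to its ISO 639-1 code, e.g. "es"."""
    key = target_language.lower()
    if key not in StaticData.LANGUAGE_CODES:
        raise ValueError(f"Target language not found: {target_language}")
    return StaticData.LANGUAGE_CODES[key]
```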
diff --git a/tools/translate/src/main.py b/tools/translate/src/main.py
deleted file mode 100644
index a2965eeba..000000000
--- a/tools/translate/src/main.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import io
-import json
-import sys
-from pathlib import Path
-from typing import Any
-
-from google.auth.transport import requests as google_requests
-from google.cloud import translate_v2 as translate
-from google.oauth2.service_account import Credentials
-from unstract.sdk.cache import ToolCache
-from unstract.sdk.constants import LogState, MetadataKey, ToolEnv
-from unstract.sdk.tool.base import BaseTool
-from unstract.sdk.tool.entrypoint import ToolEntrypoint
-from unstract.sdk.utils import ToolUtils
-from unstructured.partition.auto import partition
-
-from constants import EnvKey, GoogleTranslateKey, StaticData
-
-
-class UnstractTranslate(BaseTool):
- def validate(self, input_file: str, settings: dict[str, Any]) -> None:
- target_language = settings["targetLanguage"].lower()
- processor = settings["processor"]
-
- if target_language not in StaticData.LANGUAGE_CODES:
- self.stream_error_and_exit(
- f"Target language not found: {target_language}"
- )
-
- if processor not in StaticData.SUPPORTED_PROCESSORS:
- self.stream_error_and_exit(
- f"Processor not supported yet: {processor}"
- )
-
- def run(
- self,
- settings: dict[str, Any],
- input_file: str,
- output_dir: str,
- ) -> None:
- language_codes = StaticData.LANGUAGE_CODES
- target_language = settings["targetLanguage"].lower()
- processor = settings["processor"]
- use_cache = settings["useCache"]
-
- self.stream_log("Reading file...")
- text = self._extract_text(input_file)
- self.stream_log(f"Text length: {len(text)}")
-
- # Update GUI
- input_text_for_log = text
- if len(input_text_for_log) > 500:
- input_text_for_log = input_text_for_log[:500] + "...(truncated)"
- input_log = (
- f"Target language: `{target_language}`\n\nInput text:\n\n"
- f"```text\n{input_text_for_log}\n```\n\n"
- )
- output_log = ""
- self.stream_update(input_log, state=LogState.INPUT_UPDATE)
- self.stream_update(output_log, state=LogState.OUTPUT_UPDATE)
-
- cache_key = (
- f"cache:{self.workflow_id}:{processor}:"
- f"{language_codes[target_language]}:"
- f"{ToolUtils.hash_str(text)}"
- )
- translated_text = ""
- cost_value = 0.0
- cost_unit = ""
- cache = None
- is_cache_data_available = False
- if use_cache: # Get the data from cache
- self.stream_log("Trying to retrieve cached data")
- cache = ToolCache(
- tool=self,
- platform_host=self.get_env_or_die(ToolEnv.PLATFORM_HOST),
- platform_port=int(self.get_env_or_die(ToolEnv.PLATFORM_PORT)),
- )
- cached_response = cache.get(cache_key)
- if cached_response is not None:
- translated_text = cached_response
- cost_unit = "cache"
- is_cache_data_available = True
- else:
- self.stream_log("Cache data not available")
-
- # Process normally on a cache miss or when caching is disabled
- if not translated_text:
- if processor == GoogleTranslateKey.PROCESSOR:
- google_service_account: str = self.get_env_or_die(
- EnvKey.GOOGLE_SERVICE_ACCOUNT
- )
- credentials = Credentials.from_service_account_info(
- json.loads(google_service_account),
- scopes=GoogleTranslateKey.CREDENTIAL_SCOPES,
- )
- credentials.refresh(google_requests.Request())
- translate_client = translate.Client(credentials=credentials)
-
- # Text can also be a sequence of strings, in which case
- # this method will return a sequence of results for each text.
- self.stream_log("Sending text to Google Translate")
- result = translate_client.translate(
- text, target_language=language_codes[target_language]
- )
- self.stream_log("Received text from Google Translate")
-
- if result is not None and "translatedText" in result:
- translated_text = result["translatedText"]
- cost_value = len(text)
- cost_unit = "google_translate"
- else:
- self.stream_error_and_exit(
- f"Unsupported processor: {processor}"
- )
-
- if use_cache and cache is not None and not is_cache_data_available:
- cache.set(cache_key, translated_text)
-
- output_log = (
- f"### Translated text\n\n```text\n{translated_text}\n```\n\n"
- )
- self.stream_update(output_log, state=LogState.OUTPUT_UPDATE)
-
- # Write the translated text to output file
- try:
- self.stream_log("Writing tool output")
- source_name = self.get_exec_metadata.get(MetadataKey.SOURCE_NAME)
- output_path = Path(output_dir) / f"{Path(source_name).stem}.txt"
- with open(output_path, "w", encoding="utf-8") as f:
- f.write(translated_text)
- except Exception as e:
- self.stream_error_and_exit(f"Error creating output file: {e}")
-
- self.stream_cost(cost_value, cost_unit)
- self.write_tool_result(data=translated_text)
-
- def _extract_text(self, file: str) -> str:
- """Extract text from file.
-
- Args:
- file (str): The path to the input file
-
- Returns:
- str: Extracted text content of the file
- """
- try:
- with open(file, mode="rb") as input_file_obj:
- bytes_io = io.BytesIO(input_file_obj.read())
- elements = partition(file=bytes_io)
- except Exception as e:
- self.stream_error_and_exit(f"Error partitioning file: {e}")
- text = "\n\n".join([str(el) for el in elements])
- return text
-
-
-if __name__ == "__main__":
- args = sys.argv[1:]
- tool = UnstractTranslate.from_tool_args(args=args)
- ToolEntrypoint.launch(tool=tool, args=args)
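Stripped of the tool scaffolding, the translation call itself is a few lines against the google-cloud-translate v2 client, as the removed `run()` shows. A standalone sketch, assuming `GOOGLE_SERVICE_ACCOUNT` holds the service-account JSON described in `sample.env`:

```python
import json
import os

from google.cloud import translate_v2 as translate
from google.oauth2.service_account import Credentials

credentials = Credentials.from_service_account_info(
    json.loads(os.environ["GOOGLE_SERVICE_ACCOUNT"]),
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
client = translate.Client(credentials=credentials)

# The client accepts a string or a sequence of strings; a single string
# yields a single result dict.
result = client.translate("Hello, world!", target_language="es")
print(result["translatedText"])
```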