feat: JSON schema changes to support URLs and description boxes (#133)

JSON schema changes to support URLs and description boxes Signed-off-by: Chandrasekharan M <[email protected]>
Zipstack · Dec 4, 2024 · e2cf928 · e2cf928
1 parent 6fda9d3
commit e2cf928
Show file tree

Hide file tree

Showing 14 changed files with 27 additions and 23 deletions.
diff --git a/src/unstract/sdk/adapters/embedding/no_op/src/static/json_schema.json b/src/unstract/sdk/adapters/embedding/no_op/src/static/json_schema.json
@@ -1,10 +1,11 @@
 {
-  "title": "No-op",
+  "title": "No Op Embedding",
   "type": "object",
   "required": [
     "adapter_name",
     "wait_time"
   ],
+  "description": "No Op Embedding does not perform any operation. It's used to test the performance of the system in the absence of 3rd party induced latencies",
   "properties": {
     "adapter_name": {
       "type": "string",

diff --git a/src/unstract/sdk/adapters/embedding/ollama/src/static/json_schema.json b/src/unstract/sdk/adapters/embedding/ollama/src/static/json_schema.json
@@ -23,7 +23,7 @@
       "type": "string",
       "title": "Base URL",
       "default": "",
-      "description": "Provide the base URL where Ollama server is running. Example: http://docker.host.internal:11434 or http://localhost:11434"
+      "description": "Provide the base URL where Ollama server is running. Example: `http://docker.host.internal:11434` or `http://localhost:11434`"
     },
     "embed_batch_size": {
       "type": "number",

diff --git a/src/unstract/sdk/adapters/embedding/palm/src/static/json_schema.json b/src/unstract/sdk/adapters/embedding/palm/src/static/json_schema.json
@@ -17,7 +17,7 @@
       "type": "string",
       "title": "Model Name",
       "default": "models/embedding-gecko-001",
-      "description": "Provide the name of the model to use for embedding. Example: models/embedding-gecko-001"
+      "description": "Provide the name of the model to use for embedding. Example: `models/embedding-gecko-001`"
     },
     "api_key": {
       "type": "string",

diff --git a/src/unstract/sdk/adapters/embedding/qdrant_fast_embed/src/static/json_schema.json b/src/unstract/sdk/adapters/embedding/qdrant_fast_embed/src/static/json_schema.json
@@ -16,7 +16,7 @@
       "type": "string",
       "title": "Model",
       "default": "BAAI/bge-small-en-v1.5",
-      "description": "The name of the model to use. Example: BAAI/bge-small-en-v1.5"
+      "description": "The name of the model to use. Example: `BAAI/bge-small-en-v1.5`"
     }
   }
 }
diff --git a/src/unstract/sdk/adapters/llm/mistral/src/static/json_schema.json b/src/unstract/sdk/adapters/llm/mistral/src/static/json_schema.json
@@ -1,5 +1,5 @@
 {
-  "title": "MistralAI LLM",
+  "title": "Mistral AI LLM",
   "type": "object",
   "required": [
     "adapter_name",

diff --git a/src/unstract/sdk/adapters/llm/no_op/src/static/json_schema.json b/src/unstract/sdk/adapters/llm/no_op/src/static/json_schema.json
@@ -1,10 +1,11 @@
 {
-  "title": "No-op",
+  "title": "No Op LLM",
   "type": "object",
   "required": [
     "adapter_name",
     "wait_time"
   ],
+  "description": "No Op LLM does not perform any operation. It's used to test the performance of the system in the absence of 3rd party induced latencies",
   "properties": {
     "adapter_name": {
       "type": "string",

diff --git a/src/unstract/sdk/adapters/vectordb/milvus/src/static/json_schema.json b/src/unstract/sdk/adapters/vectordb/milvus/src/static/json_schema.json
@@ -17,7 +17,7 @@
       "title": "URI",
       "format": "uri",
       "default": "localhost:19530",
-      "description": "Provide the URI of the Milvus server. Example: https://<instance-id>.api.gcp-us-west1.zillizcloud.com"
+      "description": "Provide the URI of the Milvus server. Example: `https://<instance-id>.api.gcp-us-west1.zillizcloud.com`"
     },
     "token": {
       "type": "string",

diff --git a/src/unstract/sdk/adapters/vectordb/no_op/src/static/json_schema.json b/src/unstract/sdk/adapters/vectordb/no_op/src/static/json_schema.json
@@ -1,10 +1,11 @@
 {
-  "title": "No-op",
+  "title": "No Op Vector DB",
   "type": "object",
   "required": [
     "adapter_name",
     "wait_time"
   ],
+  "description": "No Op Vector DB does not perform any operation. It's used to test the performance of the system in the absence of 3rd party induced latencies",
   "properties": {
     "adapter_name": {
       "type": "string",

diff --git a/src/unstract/sdk/adapters/x2text/llama_parse/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llama_parse/src/static/json_schema.json
@@ -1,5 +1,5 @@
 {
-    "title": "Llama Parse X2Text",
+    "title": "Llama Parse Text Extractor",
     "type": "object",
     "required": [
       "api_key"
@@ -9,7 +9,7 @@
         "type": "string",
         "title": "Name",
         "default": "",
-        "description": "Provide a unique name for this adapter instance. Example: Llama parse1"
+        "description": "Provide a unique name for this adapter instance. Example: llama-parse-1"
       },
         "api_key": {
             "type": "string",
@@ -33,7 +33,7 @@
           "markdown"
         ],
         "default": "text",
-        "description": "Choose the type of result. Markdown or text."
+        "description": "Choose the type of result - `markdown` or `text`."
       },
       "verbose": {
         "type": "boolean",
@@ -43,4 +43,3 @@
       }
     }
   }
-
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
@@ -1,11 +1,12 @@
 {
-  "title": "LLMWhisperer X2Text",
+  "title": "LLMWhisperer v1 Text Extractor",
   "type": "object",
   "required": [
     "adapter_name",
     "unstract_key",
     "url"
   ],
+  "description": "LLMWhisperer v1 is deprecated; use the cheaper and faster [LLMWhisperer v2](https://docs.unstract.com/llmwhisperer/llm_whisperer/faqs/v1_to_v2/) instead.",
   "properties": {
     "adapter_name": {
       "type": "string",
@@ -24,7 +25,7 @@
       "type": "string",
       "title": "Unstract Key",
       "format": "password",
-      "description": "API key obtained from the Unstract developer portal (https://unstract-api-resource.developer.azure-api.net)"
+      "description": "API key obtained from the [Unstract developer portal](https://unstract-api-resource.developer.azure-api.net)"
     },
     "mode": {
       "type": "string",
@@ -36,7 +37,7 @@
         "form"
       ],
       "default": "form",
-      "description": "Native text : Extracts text from PDF without OCR. This is very fast and cost effective. Use this mode if you are sure all your PDFs are native text pdfs (not scanned documents). Note that some scanned PDFs are \"searchable\" PDFs. Use the OCR modes for these PDFs as the quality of text in these documents are often poor. \n Low cost : Extracts text from scanned and native PDFs, images and office documents. This OCR mode cannot handle handwriting and low quality scanned pdfs and images. \n High quality : Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. \n Form:  Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. Can also extract information about checkboxes and radio button"
+      "description": "Processing mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#processing-modes)"
     },
     "output_mode": {
       "type": "string",
@@ -47,7 +48,7 @@
         "text"
       ],
       "default": "line-printer",
-      "description": "The output format. Valid options are line-printer, dump-text and text. The line-printer mode tries to maintain the layout of the original text and works very well as inputs to LLMs. dump-text just dumps each page as paragraphs. text extracts text into groups as it sees in the original page. text and dump-text are treated as same in ocr processing mode."
+      "description": "Output mode to use, described in the [LLMWhisperer v1 documentation](https://docs.unstract.com/llmwhisperer/1.0.0/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)"
     },
 
     "line_splitter_tolerance": {

diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json
@@ -1,5 +1,5 @@
 {
-  "title": "LLMWhisperer X2Text v2",
+  "title": "LLMWhisperer v2 Text Extractor",
   "type": "object",
   "required": [
     "adapter_name",
@@ -18,7 +18,7 @@
       "title": "URL",
       "format": "uri",
       "default": "https://llmwhisperer-api.us-central.unstract.com",
-      "description": "Provide the base URL of the LLM Whisperer service based on your region."
+      "description": "Provide the base URL of the LLMWhisperer service based on your region. It can be obtained from the [Unstract developer portal](https://us-central.unstract.com/landing?selectedProduct=llm-whisperer)."
     },
     "unstract_key": {
       "type": "string",
@@ -36,7 +36,7 @@
         "form"
       ],
       "default": "form",
-      "description": "Processing mode to use, described in the [LLM Whisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)."
+      "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)."
     },
     "output_mode": {
       "type": "string",
@@ -46,7 +46,7 @@
         "text"
       ],
       "default": "layout_preserving",
-      "description": "Output format, described in the [LLM Whisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)"
+      "description": "Output format, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#output-modes)"
     },
     "line_splitter_tolerance": {
       "type": "number",

diff --git a/src/unstract/sdk/adapters/x2text/no_op/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/no_op/src/static/json_schema.json
@@ -1,10 +1,11 @@
 {
-  "title": "No-op",
+  "title": "No Op Text Extractor",
   "type": "object",
   "required": [
     "adapter_name",
     "wait_time"
   ],
+  "description": "No Op Text Extractor does not perform any operation. It's used to test the performance of the system in the absence of 3rd party induced latencies",
   "properties": {
     "adapter_name": {
       "type": "string",

diff --git a/src/unstract/sdk/adapters/x2text/unstructured_community/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/unstructured_community/src/static/json_schema.json
@@ -1,5 +1,5 @@
 {
-  "title": "Unstructured IO Community X2Text",
+  "title": "Unstructured IO Community Text Extractor",
   "type": "object",
   "required": [
     "adapter_name",

diff --git a/src/unstract/sdk/adapters/x2text/unstructured_enterprise/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/unstructured_enterprise/src/static/json_schema.json
@@ -1,5 +1,5 @@
 {
-  "title": "Unstructured IO Enterprise X2Text",
+  "title": "Unstructured IO Enterprise Text Extractor",
   "type": "object",
   "required": [
     "adapter_name",