Skip to content

Commit

Permalink
SDK & adapters merge (#73)
Browse files Browse the repository at this point in the history
* SDK & adapters merge - First cut

* Refactor unstract.sdk.core.* back to unstract.sdk.*

* Pick up latest changes

* Remove LICENSE

* Refactoring adapters.py to adapter.py to resolve name conflict

* Remove dependency on adapters

* Update lock file

* Fix regex pattern

* pin llama-index-core version

---------

Signed-off-by: Gayathri <[email protected]>
  • Loading branch information
gaya3-zipstack authored Jul 25, 2024
1 parent 623807c commit a424e5b
Show file tree
Hide file tree
Showing 179 changed files with 7,009 additions and 390 deletions.
792 changes: 461 additions & 331 deletions pdm.lock

Large diffs are not rendered by default.

43 changes: 39 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,48 @@ dependencies = [
"jsonschema~=4.18.2",
"python-magic~=0.4.27",
"python-dotenv==1.0.0",
# LLM Triad
"unstract-adapters~=0.22.0",
# Adapter changes
"llama-index==0.10.38",
# Temporary hack to get out of llama-index issues
# To be removed once llama-index version is upgraded
"llama-index-core==0.10.56",
"tiktoken~=0.4.0",
"transformers==4.37.0",
# Error handling, remove after moving it to adapters
"openai~=1.21.2",
"llama-index-embeddings-google==0.1.5",
"llama-index-embeddings-azure-openai==0.1.6",
# Disabling Hugging Face & FastEmbed to
# keep the image size under check
# "llama-index-embeddings-huggingface==0.2.0",
# Disabling fast embed due to high processing power
# "llama-index-embeddings-fastembed==0.1.4",
"llama-index-embeddings-openai==0.1.11",
"llama-index-embeddings-azure-openai==0.1.6",
"llama-index-embeddings-ollama==0.1.2",
"llama-index-vector-stores-postgres==0.1.3",
# Including Supabase conflicts with postgres on pg-vector.
# Hence, commenting it out at the moment
# "llama-index-vector-stores-supabase==0.1.3",
"llama-index-vector-stores-milvus==0.1.18",
"llama-index-vector-stores-weaviate==0.1.4",
"llama-index-vector-stores-pinecone==0.1.4",
"llama-index-vector-stores-qdrant==0.2.8",
"llama-index-llms-openai==0.1.26",
"llama-index-llms-palm==0.1.5",
"llama-index-llms-mistralai==0.1.10",
"llama-index-llms-anyscale==0.1.3",
"llama-index-llms-anthropic==0.1.11",
"llama-index-llms-azure-openai==0.1.5",
"llama-index-llms-vertex==0.1.8",
"llama-index-llms-replicate==0.1.3",
"llama-index-llms-ollama==0.1.3",
# OCR
"filetype~=1.2.0",
# Others
# For singleton classes
"singleton-decorator~=1.0.0",
# For Llama Parse X2Text
"llama-parse==0.4.1",
"httpx>=0.25.2",
]
readme = "README.md"
urls = { Homepage = "https://unstract.com", "Release notes" = "https://github.com/Zipstack/unstract-sdk/releases", Source = "https://github.com/Zipstack/unstract-sdk" }
Expand Down
21 changes: 0 additions & 21 deletions src/unstract/sdk/LICENSE

This file was deleted.

3 changes: 2 additions & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
__version__ = "0.38.1"
__version__ = "0.39.0"



def get_sdk_version():
Expand Down
6 changes: 2 additions & 4 deletions src/unstract/sdk/adapters.py → src/unstract/sdk/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ def get_adapter_configuration(

elif response.status_code == 404:
self.tool.stream_log(
f"adapter not found for: for adapter instance"
f"{adapter_instance_id}",
f"adapter not found for: for adapter instance" f"{adapter_instance_id}",
level=LogLevel.ERROR,
)
return None
Expand Down Expand Up @@ -127,7 +126,6 @@ def get_adapter_config(
] = tool_adapter.get_adapter_configuration(adapter_instance_id)
if not adapter_metadata:
tool.stream_error_and_exit(
f"Adapter not found for "
f"adapter instance: {adapter_instance_id}"
f"Adapter not found for " f"adapter instance: {adapter_instance_id}"
)
return adapter_metadata
8 changes: 8 additions & 0 deletions src/unstract/sdk/adapters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import logging
from logging import NullHandler
from typing import Any

# Attach a no-op handler to this package's logger so the package itself
# never configures logging output; applications decide how records are handled.
logging.getLogger(__name__).addHandler(NullHandler())

# Type alias for an adapter registry: string keys mapped to a
# ``dict[str, Any]`` entry per adapter.
AdapterDict = dict[str, dict[str, Any]]

86 changes: 86 additions & 0 deletions src/unstract/sdk/adapters/adapterkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import logging
from typing import Any

from singleton_decorator import singleton

from unstract.sdk.adapters import AdapterDict
from unstract.sdk.adapters.base import Adapter
from unstract.sdk.adapters.constants import Common
from unstract.sdk.adapters.embedding import adapters as embedding_adapters
from unstract.sdk.adapters.llm import adapters as llm_adapters
from unstract.sdk.adapters.ocr import adapters as ocr_adapters
from unstract.sdk.adapters.vectordb import adapters as vectordb_adapters
from unstract.sdk.adapters.x2text import adapters as x2text_adapters

logger = logging.getLogger(__name__)


# Declared as a singleton to avoid re-initialising the
# adapters list every time the class is instantiated
@singleton
class Adapterkit:
    """Single lookup point for every adapter registered with the SDK.

    Merges the embedding, LLM, vector DB, X2Text and OCR adapter registries
    into one mapping keyed by adapter ID.
    """

    def __init__(self) -> None:
        # With ``|`` the right-hand operand wins on duplicate keys, so the
        # order below decides which entry survives if two registries ever
        # share an adapter ID.
        self._adapters: AdapterDict = (
            embedding_adapters
            | llm_adapters
            | vectordb_adapters
            | x2text_adapters
            | ocr_adapters
        )

    @property
    def adapters(self) -> AdapterDict:
        """The merged adapter registry."""
        return self._adapters

    def get_adapter_class_by_adapter_id(self, adapter_id: str) -> type[Adapter]:
        """Look up the adapter class registered under ``adapter_id``.

        Args:
            adapter_id (str): Identifies the adapter to look up

        Raises:
            RuntimeError: If no adapter is registered under ``adapter_id``

        Returns:
            type[Adapter]: The adapter class itself (not an instance)
        """
        if adapter_id not in self._adapters:
            raise RuntimeError(f"Couldn't obtain adapter for {adapter_id}")
        metadata = self._adapters[adapter_id][Common.METADATA]
        adapter_class: type[Adapter] = metadata[Common.ADAPTER]
        return adapter_class

    def get_adapter_by_id(
        self, adapter_id: str, *args: Any, **kwargs: Any
    ) -> Adapter:
        """Instantiate and return an adapter.

        Args:
            adapter_id (str): Identifies adapter to create
            *args: Positional arguments forwarded to the adapter constructor
            **kwargs: Keyword arguments forwarded to the adapter constructor

        Raises:
            RuntimeError: If the ID is invalid/adapter is missing

        Returns:
            Adapter: Concrete impl of the `Adapter` base
        """
        adapter_class = self.get_adapter_class_by_adapter_id(adapter_id)
        return adapter_class(*args, **kwargs)

    def get_adapters_list(self) -> list[dict[str, Any]]:
        """Describe every registered adapter.

        Returns:
            list[dict[str, Any]]: One dict per adapter with the keys ``id``,
                ``name``, ``class_name``, ``description``, ``icon``,
                ``adapter_type`` and ``json_schema``
        """
        adapters = []
        # Only the registry entries are needed; the keys are not used here.
        for adapter_registry_metadata in self._adapters.values():
            m: type[Adapter] = adapter_registry_metadata[Common.METADATA][
                Common.ADAPTER
            ]
            _id = m.get_id()
            name = m.get_name()
            adapter_type = m.get_adapter_type().name
            json_schema = m.get_json_schema()
            desc = m.get_description()
            icon = m.get_icon()
            adapters.append(
                {
                    "id": _id,
                    "name": name,
                    "class_name": m.__name__,
                    "description": desc,
                    "icon": icon,
                    "adapter_type": adapter_type,
                    "json_schema": json_schema,
                }
            )
        return adapters
50 changes: 50 additions & 0 deletions src/unstract/sdk/adapters/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import logging
from abc import ABC, abstractmethod

from unstract.sdk.adapters.enums import AdapterTypes

logger = logging.getLogger(__name__)


class Adapter(ABC):
    """Abstract base class that concrete adapters derive from.

    Subclasses must implement the static descriptor methods below as well
    as ``test_connection``.
    """

    def __init__(self, name: str):
        """Store the adapter's name on the instance."""
        self.name = name

    @staticmethod
    @abstractmethod
    def get_id() -> str:
        """Return the adapter's identifier."""
        return ""

    @staticmethod
    @abstractmethod
    def get_name() -> str:
        """Return the adapter's name."""
        return ""

    @staticmethod
    @abstractmethod
    def get_description() -> str:
        """Return a description of the adapter."""
        return ""

    @staticmethod
    @abstractmethod
    def get_icon() -> str:
        """Return the adapter's icon as a string."""
        return ""

    @staticmethod
    @abstractmethod
    def get_json_schema() -> str:
        """Return the adapter's JSON schema as a string."""
        return ""

    @staticmethod
    @abstractmethod
    def get_adapter_type() -> AdapterTypes:
        """Return the ``AdapterTypes`` value this adapter belongs to."""
        # NOTE(review): this placeholder is a str, not an AdapterTypes
        # member; subclasses are expected to override it.
        return ""

    @abstractmethod
    def test_connection(self) -> bool:
        """Override to test the connection for an adapter.

        Returns:
            bool: Flag indicating if the credentials are valid or not
        """
10 changes: 10 additions & 0 deletions src/unstract/sdk/adapters/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
class Common:
    """String constants shared by the adapter modules."""

    # Keys of an adapter's registry entry
    METADATA = "metadata"
    MODULE = "module"
    ADAPTER = "adapter"

    SRC_FOLDER = "src"

    # Adapter descriptor field names
    ADAPTER_METADATA = "adapter_metadata"
    ICON = "icon"
    ADAPTER_ID = "adapter_id"
    ADAPTER_TYPE = "adapter_type"

    # Fallback error text
    DEFAULT_ERR_MESSAGE = "Something went wrong"
5 changes: 5 additions & 0 deletions src/unstract/sdk/adapters/embedding/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from unstract.sdk.adapters import AdapterDict
from unstract.sdk.adapters.embedding.register import EmbeddingRegistry

# Module-level registry of embedding adapters. It starts empty and is handed
# to EmbeddingRegistry at import time (presumably filled in place;
# register_adapters is defined in the register module).
adapters: AdapterDict = {}
EmbeddingRegistry.register_adapters(adapters)
4 changes: 4 additions & 0 deletions src/unstract/sdk/adapters/embedding/azure_open_ai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Unstract Azure Open AI Embedding Adapter

This package provides the functionality required to integrate with Azure OpenAI Embedding.
Version supported
25 changes: 25 additions & 0 deletions src/unstract/sdk/adapters/embedding/azure_open_ai/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"


[project]
name = "unstract-azure-open-ai-embedding"
version = "0.0.1"
description = "Azure Open AI Embedding"
authors = [
{name = "Zipstack Inc.", email = "[email protected]"},
]
dependencies = [
]
requires-python = ">=3.9"
readme = "README.md"
classifiers = [
"Programming Language :: Python"
]
license = {text = "MIT"}

[tool.pdm.build]
includes = ["src"]
package-dir = "src"
# source-includes = ["tests"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .azure_open_ai import AzureOpenAI

# Descriptor for the AzureOpenAI embedding adapter: class name, version,
# the adapter class itself, a short description and an active flag.
metadata = {
"name": AzureOpenAI.__name__,
"version": "1.0.0",
"adapter": AzureOpenAI,
"description": "AzureOpenAI embedding adapter",
"is_active": True,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os
from typing import Any

from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

from unstract.sdk.adapters.embedding.embedding_adapter import EmbeddingAdapter
from unstract.sdk.adapters.embedding.helper import EmbeddingHelper
from unstract.sdk.adapters.exceptions import AdapterError


class Constants:
    """Setting keys and fixed values used by the Azure OpenAI embedding adapter."""

    ADAPTER_NAME = "adapter_name"

    # Keys looked up in the adapter's settings dict
    MODEL = "model"
    API_KEY = "api_key"
    API_VERSION = "api_version"
    AZURE_ENDPOINT = "azure_endpoint"
    DEPLOYMENT_NAME = "deployment_name"

    # Unlike the entries above this is a value, not a settings key:
    # it is passed as ``api_type`` to the embedding client.
    API_TYPE = "azure"


class AzureOpenAI(EmbeddingAdapter):
    """Embedding adapter backed by llama-index's ``AzureOpenAIEmbedding``."""

    def __init__(self, settings: dict[str, Any]):
        """Store the adapter settings.

        Args:
            settings: Adapter configuration; read with the keys declared in
                ``Constants`` when the embedding instance is built.
        """
        super().__init__("AzureOpenAIEmbedding")
        self.config = settings

    @staticmethod
    def get_id() -> str:
        """Return the adapter's identifier."""
        return "azureopenai|9770f3f6-f8ba-4fa0-bb3a-bef48a00e66f"

    @staticmethod
    def get_name() -> str:
        """Return the adapter's name."""
        return "AzureOpenAIEmbedding"

    @staticmethod
    def get_description() -> str:
        """Return a description of the adapter."""
        return "AzureOpenAI Embedding"

    @staticmethod
    def get_provider() -> str:
        """Return the provider name."""
        return "azure"

    @staticmethod
    def get_icon() -> str:
        """Return the path of the adapter's icon."""
        return "/icons/adapter-icons/AzureopenAI.png"

    @staticmethod
    def get_json_schema() -> str:
        """Return the contents of ``static/json_schema.json`` next to this module.

        Raises:
            OSError: If the schema file cannot be opened or read
        """
        # The context manager closes the handle even if ``read()`` raises;
        # the encoding is pinned so the result does not depend on the
        # platform's default locale.
        with open(
            f"{os.path.dirname(__file__)}/static/json_schema.json",
            encoding="utf-8",
        ) as schema_file:
            return schema_file.read()

    def get_embedding_instance(self) -> BaseEmbedding:
        """Build an ``AzureOpenAIEmbedding`` from the stored settings.

        Raises:
            AdapterError: If resolving the batch size or constructing the
                embedding fails; the original exception is chained as the
                cause.

        Returns:
            BaseEmbedding: The configured embedding instance
        """
        try:
            embedding_batch_size = EmbeddingHelper.get_embedding_batch_size(
                config=self.config
            )
            # NOTE(review): ``str()`` turns a missing setting (``None``) into
            # the literal string "None" rather than failing here.
            embedding: BaseEmbedding = AzureOpenAIEmbedding(
                model=str(self.config.get(Constants.MODEL)),
                deployment_name=str(self.config.get(Constants.DEPLOYMENT_NAME)),
                api_key=str(self.config.get(Constants.API_KEY)),
                api_version=str(self.config.get(Constants.API_VERSION)),
                azure_endpoint=str(self.config.get(Constants.AZURE_ENDPOINT)),
                embed_batch_size=embedding_batch_size,
                api_type=Constants.API_TYPE,
            )
            return embedding
        except Exception as e:
            # Translate any failure into the adapter-level error while
            # keeping the original traceback attached as ``__cause__``.
            raise AdapterError(str(e)) from e

    def test_connection(self) -> bool:
        """Build the embedding instance and run the helper's connection test.

        Returns:
            bool: Result of ``EmbeddingHelper.test_embedding_instance``
        """
        embedding = self.get_embedding_instance()
        test_result: bool = EmbeddingHelper.test_embedding_instance(embedding)
        return test_result
Loading

0 comments on commit a424e5b

Please sign in to comment.