Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Remote File Storage for Prompt Studio #827

Merged
merged 45 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
9374a79
Add more tests
gaya3-zipstack Oct 22, 2024
5a2dbd0
Commit pdm.lock changes
gaya3-zipstack Oct 22, 2024
3a1b94b
Check in with improvements
gaya3-zipstack Oct 23, 2024
945b215
Merge branch 'feature/remote_storage' of https://github.com/Zipstack/…
gaya3-zipstack Oct 24, 2024
a90e2ed
Commit pdm.lock changes
gaya3-zipstack Oct 24, 2024
83e32b6
Add permanent storage here
gaya3-zipstack Oct 24, 2024
ffdaf65
Merge branch 'feature/remote_storage' of https://github.com/Zipstack/…
gaya3-zipstack Oct 24, 2024
f6e2bb8
Commit pdm.lock changes
gaya3-zipstack Oct 24, 2024
c64cb0a
Add skeleton for temporary fs
gaya3-zipstack Oct 25, 2024
d070bb1
Merge branch 'feature/remote_storage' of https://github.com/Zipstack/…
gaya3-zipstack Oct 25, 2024
107476b
Initial commit for Prompt studio file storage changes
harini-venkataraman Oct 29, 2024
d4ee2a8
Prompt studio changes
harini-venkataraman Nov 5, 2024
d146896
Merge branch 'main' into feat/remote-storage-prompt-studio
kirtimanmishrazipstack Nov 12, 2024
7381cf2
Commit pdm.lock changes
kirtimanmishrazipstack Nov 12, 2024
eb0e1aa
Merge branch 'main' into feat/remote-storage-prompt-studio
kirtimanmishrazipstack Nov 13, 2024
6eb9ef0
Commit pdm.lock changes
kirtimanmishrazipstack Nov 13, 2024
e852d70
Merge branch 'main' into feat/remote-storage-prompt-studio
kirtimanmishrazipstack Nov 19, 2024
1007977
Commit pdm.lock changes
kirtimanmishrazipstack Nov 19, 2024
ebf5412
Handling legacy file path
harini-venkataraman Nov 20, 2024
b068cea
Remove redundant tests
harini-venkataraman Nov 20, 2024
7af6b78
Remove redundant tests
harini-venkataraman Nov 20, 2024
c4c4afa
PDM Lock files
harini-venkataraman Nov 20, 2024
984411d
Encoding fixes for prompt studio k8 changes
harini-venkataraman Nov 26, 2024
6aee781
Reverting changes for v2
harini-venkataraman Nov 26, 2024
fe493cd
Merge branch 'main' into feat/remote-storage-prompt-studio
harini-venkataraman Nov 26, 2024
96d331c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
8bf1357
Commit pdm.lock changes
pre-commit-ci[bot] Nov 26, 2024
2590ac1
Revert unused utils
harini-venkataraman Nov 26, 2024
b19fc6d
Bump version to use rc2
harini-venkataraman Nov 26, 2024
be05eb6
Change to legacy envs
harini-venkataraman Nov 26, 2024
15aa1f9
Change to legacy envs
harini-venkataraman Nov 26, 2024
2b716cc
Remove unused exceptions
harini-venkataraman Nov 26, 2024
dd2c671
Revert unused changes
harini-venkataraman Nov 26, 2024
76f4d21
Enable feature flag for indexing
harini-venkataraman Nov 26, 2024
59479a1
Commit pdm.lock changes
harini-venkataraman Nov 26, 2024
a8341bf
Adding feature flags to dynamic indexer
harini-venkataraman Nov 26, 2024
c9c83db
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
e9b7111
Addition of doc strings
harini-venkataraman Nov 26, 2024
8ad905c
Changing imports for Feature Flags
harini-venkataraman Nov 26, 2024
eb412f4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
e12caae
Reverting SDK to rc1
harini-venkataraman Dec 2, 2024
85a2df6
Commit pdm.lock changes
harini-venkataraman Dec 2, 2024
f2f1620
Merge branch 'main' into feat/remote-storage-prompt-studio
gaya3-zipstack Dec 2, 2024
1e362d8
Roll back pdm lock file changes
harini-venkataraman Dec 2, 2024
9781f5e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/backend/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,4 @@ class FeatureFlag:
"""Temporary feature flags."""

APP_DEPLOYMENT = "app_deployment"
REMOTE_FILE_STORAGE = "remote_file_storage"
2 changes: 2 additions & 0 deletions backend/file_management/file_management_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import magic
from connector_v2.models import ConnectorInstance
from deprecated import deprecated
from django.conf import settings
from django.http import StreamingHttpResponse
from file_management.exceptions import (
Expand Down Expand Up @@ -141,6 +142,7 @@ def upload_file(
remote_file.write(file.read())

@staticmethod
@deprecated(reason="Use remote FS APIs from SDK")
harini-venkataraman marked this conversation as resolved.
Show resolved Hide resolved
def fetch_file_contents(file_system: UnstractFileSystem, file_path: str) -> Any:
fs = file_system.get_fsspec_fs()
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from adapter_processor.models import AdapterInstance
from django.conf import settings
from django.db.models.manager import BaseManager
from file_management.file_management_helper import FileManagerHelper
from prompt_studio.modifier_loader import ModifierConfig
from prompt_studio.modifier_loader import load_plugins as load_modifier_plugins
from prompt_studio.prompt_profile_manager.models import ProfileManager
Expand Down Expand Up @@ -51,6 +50,7 @@
from unstract.sdk.index import Index
from unstract.sdk.prompt import PromptTool
from unstract.sdk.utils.tool_utils import ToolUtils
from utils.file_storage.helpers.prompt_studio_file_helper import PromptStudioFileHelper
from utils.local_context import StateStore

from unstract.core.pubsub_helper import LogPublisher
Expand Down Expand Up @@ -331,7 +331,7 @@ def index_document(
file_path = file_name
else:
default_profile = ProfileManager.get_default_llm_profile(tool)
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
file_path = PromptStudioFileHelper.handle_sub_directory_for_prompt_studio(
org_id,
is_create=False,
user_id=user_id,
Expand Down Expand Up @@ -609,7 +609,7 @@ def _execute_prompts_in_single_pass(

@staticmethod
def _get_document_path(org_id, user_id, tool_id, doc_name):
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
doc_path = PromptStudioFileHelper.handle_sub_directory_for_prompt_studio(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
Expand All @@ -621,7 +621,7 @@ def _get_document_path(org_id, user_id, tool_id, doc_name):
def _get_extract_or_summary_document_path(
org_id, user_id, tool_id, doc_name, doc_type
) -> str:
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
doc_path = PromptStudioFileHelper.handle_sub_directory_for_prompt_studio(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
Expand Down
70 changes: 37 additions & 33 deletions backend/prompt_studio/prompt_studio_core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,12 @@
from rest_framework.versioning import URLPathVersioning
from tool_instance.models import ToolInstance
from unstract.sdk.utils.common_utils import CommonUtils
from utils.file_storage.helpers.prompt_studio_file_helper import PromptStudioFileHelper
from utils.user_session import UserSessionUtils

from backend.constants import FeatureFlag
from unstract.connectors.filesystems.local_storage.local_storage import LocalStorageFS
from unstract.flags.feature_flag import check_feature_flag_status

from .models import CustomTool
from .serializers import (
Expand Down Expand Up @@ -403,31 +406,41 @@ def fetch_contents_ide(self, request: HttpRequest, pk: Any = None) -> Response:
f"{FileViewTypes.SUMMARIZE.lower()}/"
f"{filename_without_extension}.txt"
)

file_path = file_path = FileManagerHelper.handle_sub_directory_for_tenants(
UserSessionUtils.get_organization_id(request),
is_create=True,
user_id=custom_tool.created_by.user_id,
tool_id=str(custom_tool.tool_id),
)
file_system = LocalStorageFS(settings={"path": file_path})
if not file_path.endswith("/"):
file_path += "/"
file_path += file_name
# Temporary Hack for frictionless onboarding as the user id will be empty
try:
contents = FileManagerHelper.fetch_file_contents(file_system, file_path)
except FileNotFound:
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
file_path = file_path = FileManagerHelper.handle_sub_directory_for_tenants(
UserSessionUtils.get_organization_id(request),
is_create=True,
user_id="",
user_id=custom_tool.created_by.user_id,
tool_id=str(custom_tool.tool_id),
)
file_system = LocalStorageFS(settings={"path": file_path})
if not file_path.endswith("/"):
file_path += "/"
file_path += file_name
contents = FileManagerHelper.fetch_file_contents(file_system, file_path)
file_path += file_name
# TODO : Handle this with proper fix
# Temporary Hack for frictionless onboarding as the user id will be empty
try:
contents = FileManagerHelper.fetch_file_contents(file_system, file_path)
except FileNotFound:
file_path = file_path = (
FileManagerHelper.handle_sub_directory_for_tenants(
UserSessionUtils.get_organization_id(request),
is_create=True,
user_id="",
tool_id=str(custom_tool.tool_id),
)
)
if not file_path.endswith("/"):
file_path += "/"
file_path += file_name
contents = FileManagerHelper.fetch_file_contents(file_system, file_path)
else:
PromptStudioFileHelper.fetch_file_contents(
file_name=file_name,
org_id=UserSessionUtils.get_organization_id(request),
user_id=custom_tool.created_by.user_id,
tool_id=str(custom_tool.tool_id),
)

return Response({"data": contents}, status=status.HTTP_200_OK)

Expand All @@ -437,15 +450,6 @@ def upload_for_ide(self, request: HttpRequest, pk: Any = None) -> Response:
serializer = FileUploadIdeSerializer(data=request.data)
serializer.is_valid(raise_exception=True)
uploaded_files: Any = serializer.validated_data.get("file")

file_path = FileManagerHelper.handle_sub_directory_for_tenants(
UserSessionUtils.get_organization_id(request),
is_create=True,
user_id=custom_tool.created_by.user_id,
tool_id=str(custom_tool.tool_id),
)
file_system = LocalStorageFS(settings={"path": file_path})

documents = []
for uploaded_file in uploaded_files:
file_name = uploaded_file.name
Expand All @@ -464,11 +468,11 @@ def upload_for_ide(self, request: HttpRequest, pk: Any = None) -> Response:
logger.info(
f"Uploading file: {file_name}" if file_name else "Uploading file"
)
FileManagerHelper.upload_file(
file_system,
file_path,
uploaded_file,
file_name,
PromptStudioFileHelper.upload_for_ide(
org_id=UserSessionUtils.get_organization_id(request),
user_id=custom_tool.created_by.user_id,
tool_id=str(custom_tool.tool_id),
uploaded_file=uploaded_file,
)
documents.append(doc)
return Response({"data": documents})
Expand All @@ -485,7 +489,7 @@ def delete_for_ide(self, request: HttpRequest, pk: uuid) -> Response:
user_id = custom_tool.created_by.user_id
document: DocumentManager = DocumentManager.objects.get(pk=document_id)
file_name: str = document.document_name
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
file_path = PromptStudioFileHelper.handle_sub_directory_for_prompt_studio(
org_id=org_id,
is_create=False,
user_id=user_id,
Expand Down
17 changes: 17 additions & 0 deletions backend/utils/file_storage/common_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os


class FileStorageUtil:
    """Small helpers shared by the file-storage utilities."""

    @staticmethod
    def get_env_or_die(env_key: str) -> str:
        """Fetch a mandatory environment variable.

        Args:
            env_key (str): Name of the environment variable to look up.

        Returns:
            str: The non-empty value of the variable.

        Raises:
            ValueError: If the variable is unset or set to an empty string.
        """
        # A missing variable and an empty one are treated the same way.
        value = os.environ.get(env_key, "")
        if value:
            return value
        raise ValueError(f"Env variable '{env_key}' is required")
15 changes: 15 additions & 0 deletions backend/utils/file_storage/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from enum import Enum


class FileStorageKeys:
    """Names of the environment variables that configure file storage."""

    # Env variable holding the storage provider.
    FILE_STORAGE_PROVIDER = "FILE_STORAGE_PROVIDER"
    # Env variable holding the credentials for that provider.
    FILE_STORAGE_CREDENTIALS = "FILE_STORAGE_CREDENTIALS"


class FileStorageType(Enum):
    """Kinds of file storage that can be requested."""

    PERMANENT = "permanent"
    TEMPORARY = "temporary"


class FileStorageConstants:
    """Constants used by the file-storage helpers."""

    # Name of the env variable that holds the prompt studio base path.
    PROMPT_STUDIO_FILE_PATH = "PROMPT_STUDIO_FILE_PATH"
6 changes: 6 additions & 0 deletions backend/utils/file_storage/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from rest_framework.exceptions import APIException


class OrgIdNotValid(APIException):
    """API error reported with HTTP 400 when an organization ID is not valid."""

    default_detail = "Organization ID is not valid"
    status_code = 400
42 changes: 42 additions & 0 deletions backend/utils/file_storage/helpers/common_file_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Any

from unstract.sdk.file_storage import FileStorageProvider
harini-venkataraman marked this conversation as resolved.
Show resolved Hide resolved
from unstract.sdk.file_storage.fs_impl import FileStorage
from unstract.sdk.file_storage.fs_permanent import PermanentFileStorage
from unstract.sdk.file_storage.fs_shared_temporary import SharedTemporaryFileStorage
from utils.file_storage.common_utils import FileStorageUtil
from utils.file_storage.constants import FileStorageKeys, FileStorageType


class FileStorageHelper:
    """Builds SDK file-storage instances from environment configuration."""

    # TODO : Optimize this to a singleton class
    @staticmethod
    def initialize_file_storage(type: FileStorageType) -> FileStorage:
        """Create the file-storage instance for the requested storage type.

        Args:
            type (FileStorageType): Kind of storage to initialize.

        Returns:
            FileStorage: A ``PermanentFileStorage`` for ``PERMANENT``, a
                ``SharedTemporaryFileStorage`` for ``TEMPORARY``, and a
                local ``FileStorage`` for any other type.

        Raises:
            ValueError: If a required file-storage env variable is missing
                or empty.
        """
        provider_data = FileStorageHelper.load_file_storage_envs()
        provider = provider_data[FileStorageKeys.FILE_STORAGE_PROVIDER]
        credentials = provider_data[FileStorageKeys.FILE_STORAGE_CREDENTIALS]
        # Return from each branch. Previously the local fallback below was
        # assigned unconditionally after the checks, so the permanent and
        # temporary instances were always thrown away.
        if type.value == FileStorageType.PERMANENT.value:
            return PermanentFileStorage(provider=provider, credentials=credentials)
        if type.value == FileStorageType.TEMPORARY.value:
            return SharedTemporaryFileStorage(
                provider=provider, credentials=credentials
            )
        # Fallback for any storage type not handled above.
        return FileStorage(
            provider=FileStorageProvider.Local, credentials=credentials
        )

    @staticmethod
    def load_file_storage_envs() -> dict[str, Any]:
        """Read the file-storage provider and credentials from the environment.

        Returns:
            dict[str, Any]: Mapping keyed by
                ``FileStorageKeys.FILE_STORAGE_PROVIDER`` and
                ``FileStorageKeys.FILE_STORAGE_CREDENTIALS`` holding the raw
                env values.

        Raises:
            ValueError: If either env variable is missing or empty.
        """
        # NOTE(review): credentials are passed on exactly as read from the
        # env (a string) -- confirm the SDK accepts that form.
        return {
            FileStorageKeys.FILE_STORAGE_PROVIDER: FileStorageUtil.get_env_or_die(
                env_key=FileStorageKeys.FILE_STORAGE_PROVIDER
            ),
            FileStorageKeys.FILE_STORAGE_CREDENTIALS: FileStorageUtil.get_env_or_die(
                env_key=FileStorageKeys.FILE_STORAGE_CREDENTIALS
            ),
        }
96 changes: 96 additions & 0 deletions backend/utils/file_storage/helpers/prompt_studio_file_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from typing import Any, Union

from file_management.exceptions import OrgIdNotValid
from utils.file_storage.common_utils import FileStorageUtil
from utils.file_storage.constants import FileStorageConstants, FileStorageType
from utils.file_storage.helpers.common_file_helper import FileStorageHelper


class PromptStudioFileHelper:
    """File operations for prompt studio backed by permanent file storage."""

    @staticmethod
    def handle_sub_directory_for_prompt_studio(
        org_id: str, user_id: str, tool_id: str, is_create: bool
    ) -> str:
        """Resolves a directory path meant for a user running prompt studio.

        Args:
            org_id (str): Organization ID
            user_id (str): User ID
            tool_id (str): ID of the prompt studio tool
            is_create (bool): Flag to create the directory along with its
                ``extract`` and ``summarize`` sub-directories

        Returns:
            str: The path to the directory meant for prompt studio

        Raises:
            OrgIdNotValid: If ``org_id`` is empty.
            ValueError: If a required env variable is missing or empty.
        """
        if not org_id:
            raise OrgIdNotValid()
        base_path = FileStorageUtil.get_env_or_die(
            env_key=FileStorageConstants.PROMPT_STUDIO_FILE_PATH
        )
        file_path = f"{base_path}/{org_id}/{user_id}/{tool_id}"
        if is_create:
            fs_instance = FileStorageHelper.initialize_file_storage(
                type=FileStorageType.PERMANENT
            )
            for directory in (
                file_path,
                f"{file_path}/extract",
                f"{file_path}/summarize",
            ):
                fs_instance.mkdir(directory, create_parents=True)
        return file_path

    @staticmethod
    def upload_for_ide(
        org_id: str, user_id: str, tool_id: str, uploaded_file: Any
    ) -> None:
        """Write an uploaded file into the tool's prompt studio directory.

        Args:
            org_id (str): Organization ID
            user_id (str): User ID
            tool_id (str): ID of the prompt studio tool
            uploaded_file (Any): Uploaded file object; its ``name`` and
                ``read()`` are used.
        """
        fs_instance = FileStorageHelper.initialize_file_storage(
            type=FileStorageType.PERMANENT
        )
        file_system_path = (
            PromptStudioFileHelper.handle_sub_directory_for_prompt_studio(
                org_id=org_id,
                is_create=True,
                user_id=user_id,
                tool_id=str(tool_id),
            )
        )
        file_path = f"{file_system_path}/{uploaded_file.name}"
        fs_instance.write(path=file_path, mode="wb", data=uploaded_file.read())

    @staticmethod
    def fetch_file_contents(
        org_id: str, user_id: str, tool_id: str, file_name: str
    ) -> Union[bytes, str]:
        """Read a document from the tool's prompt studio directory.

        Args:
            org_id (str): Organization ID
            user_id (str): User ID
            tool_id (str): ID of the prompt studio tool
            file_name (str): Name of the file, relative to the tool directory

        Returns:
            Union[bytes, str]: Raw bytes for PDF files, text for plain-text
                files.

        Raises:
            ValueError: If the file's MIME type is neither
                ``application/pdf`` nor ``text/plain``.
        """
        fs_instance = FileStorageHelper.initialize_file_storage(
            type=FileStorageType.PERMANENT
        )
        # Resolve the path without creating it. Creating the directory first
        # made the existence check below always succeed, so the legacy
        # fallback could never run, and a read should not create directories.
        file_system_path = (
            PromptStudioFileHelper.handle_sub_directory_for_prompt_studio(
                org_id=org_id,
                is_create=False,
                user_id=user_id,
                tool_id=str(tool_id),
            )
        )
        file_path = f"{file_system_path}/{file_name}"
        # TODO : Handle this with proper fix
        # Temporary Hack for frictionless onboarding as the user id will be empty
        if not fs_instance.exists(file_path):
            legacy_file_system_path = (
                PromptStudioFileHelper.handle_sub_directory_for_prompt_studio(
                    org_id=org_id,
                    is_create=False,
                    user_id="",
                    tool_id=str(tool_id),
                )
            )
            file_path = f"{legacy_file_system_path}/{file_name}"
        file_content_type = fs_instance.mime_type(file_path)
        if file_content_type == "application/pdf":
            # PDFs are returned as raw bytes.
            return fs_instance.read(path=file_path, mode="rb")
        if file_content_type == "text/plain":
            return fs_instance.read(path=file_path, mode="r")
        # Fail explicitly instead of hitting an unbound local variable.
        raise ValueError(
            f"Unsupported file type '{file_content_type}' for file '{file_name}'"
        )
Loading
Loading