Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Remote File Storage for Prompt Studio #827

Merged
merged 45 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
9374a79
Add more tests
gaya3-zipstack Oct 22, 2024
5a2dbd0
Commit pdm.lock changes
gaya3-zipstack Oct 22, 2024
3a1b94b
Check in with improvements
gaya3-zipstack Oct 23, 2024
945b215
Merge branch 'feature/remote_storage' of https://github.com/Zipstack/…
gaya3-zipstack Oct 24, 2024
a90e2ed
Commit pdm.lock changes
gaya3-zipstack Oct 24, 2024
83e32b6
Add permanent storage here
gaya3-zipstack Oct 24, 2024
ffdaf65
Merge branch 'feature/remote_storage' of https://github.com/Zipstack/…
gaya3-zipstack Oct 24, 2024
f6e2bb8
Commit pdm.lock changes
gaya3-zipstack Oct 24, 2024
c64cb0a
Add skeleton for temporary fs
gaya3-zipstack Oct 25, 2024
d070bb1
Merge branch 'feature/remote_storage' of https://github.com/Zipstack/…
gaya3-zipstack Oct 25, 2024
107476b
Initial commit for Prompt studio file storage changes
harini-venkataraman Oct 29, 2024
d4ee2a8
Prompt studio changes
harini-venkataraman Nov 5, 2024
d146896
Merge branch 'main' into feat/remote-storage-prompt-studio
kirtimanmishrazipstack Nov 12, 2024
7381cf2
Commit pdm.lock changes
kirtimanmishrazipstack Nov 12, 2024
eb0e1aa
Merge branch 'main' into feat/remote-storage-prompt-studio
kirtimanmishrazipstack Nov 13, 2024
6eb9ef0
Commit pdm.lock changes
kirtimanmishrazipstack Nov 13, 2024
e852d70
Merge branch 'main' into feat/remote-storage-prompt-studio
kirtimanmishrazipstack Nov 19, 2024
1007977
Commit pdm.lock changes
kirtimanmishrazipstack Nov 19, 2024
ebf5412
Handling legacy file path
harini-venkataraman Nov 20, 2024
b068cea
Remove redundant tests
harini-venkataraman Nov 20, 2024
7af6b78
Remove redundant tests
harini-venkataraman Nov 20, 2024
c4c4afa
PDM Lock files
harini-venkataraman Nov 20, 2024
984411d
Encoding fixes for prompt studio k8 changes
harini-venkataraman Nov 26, 2024
6aee781
Reverting changes for v2
harini-venkataraman Nov 26, 2024
fe493cd
Merge branch 'main' into feat/remote-storage-prompt-studio
harini-venkataraman Nov 26, 2024
96d331c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
8bf1357
Commit pdm.lock changes
pre-commit-ci[bot] Nov 26, 2024
2590ac1
Revert unused utils
harini-venkataraman Nov 26, 2024
b19fc6d
Bump version to use rc2
harini-venkataraman Nov 26, 2024
be05eb6
Change to legacy envs
harini-venkataraman Nov 26, 2024
15aa1f9
Change to legacy envs
harini-venkataraman Nov 26, 2024
2b716cc
Remove unused exceptions
harini-venkataraman Nov 26, 2024
dd2c671
Revert unused changes
harini-venkataraman Nov 26, 2024
76f4d21
Enable feature flag for indexing
harini-venkataraman Nov 26, 2024
59479a1
Commit pdm.lock changes
harini-venkataraman Nov 26, 2024
a8341bf
Adding feature flags to dynamic indexer
harini-venkataraman Nov 26, 2024
c9c83db
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
e9b7111
Addition of doc strings
harini-venkataraman Nov 26, 2024
8ad905c
Changing imports for Feature Flags
harini-venkataraman Nov 26, 2024
eb412f4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
e12caae
Reverting SDK to rc1
harini-venkataraman Dec 2, 2024
85a2df6
Commit pdm.lock changes
harini-venkataraman Dec 2, 2024
f2f1620
Merge branch 'main' into feat/remote-storage-prompt-studio
gaya3-zipstack Dec 2, 2024
1e362d8
Roll back pdm lock file changes
harini-venkataraman Dec 2, 2024
9781f5e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/file_management/file_management_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import magic
from connector_v2.models import ConnectorInstance
from deprecated import deprecated
from django.conf import settings
from django.http import StreamingHttpResponse
from file_management.exceptions import (
Expand Down Expand Up @@ -141,6 +142,7 @@ def upload_file(
remote_file.write(file.read())

@staticmethod
@deprecated(reason="Use remote FS APIs from SDK")
harini-venkataraman marked this conversation as resolved.
Show resolved Hide resolved
def fetch_file_contents(file_system: UnstractFileSystem, file_path: str) -> Any:
fs = file_system.get_fsspec_fs()
try:
Expand Down
536 changes: 275 additions & 261 deletions backend/pdm.lock

Large diffs are not rendered by default.

47 changes: 35 additions & 12 deletions backend/prompt_studio/prompt_studio_core_v2/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
from django.db.models import QuerySet
from file_management.file_management_helper import FileManagerHelper
from prompt_studio.prompt_studio_core_v2.constants import DefaultPrompts
from utils.constants import FeatureFlag
from utils.file_storage.constants import FileStorageType
from utils.file_storage.helpers.common_file_helper import FileStorageHelper
from utils.file_storage.helpers.prompt_studio_file_helper import PromptStudioFileHelper
from utils.models.base_model import BaseModel
from utils.models.organization_mixin import (
DefaultOrganizationManagerMixin,
DefaultOrganizationMixin,
)

from unstract.flags.feature_flag import check_feature_flag_status

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -133,20 +139,37 @@ class CustomTool(DefaultOrganizationMixin, BaseModel):

def delete(self, organization_id=None, *args, **kwargs):
# Delete the documents associated with the tool
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
organization_id,
is_create=False,
user_id=self.created_by.user_id,
tool_id=str(self.tool_id),
)
if organization_id:
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
file_path = FileManagerHelper.handle_sub_directory_for_prompt_studio(
organization_id,
is_create=False,
user_id=self.created_by.user_id,
tool_id=str(self.tool_id),
)
if organization_id:
try:
shutil.rmtree(file_path)
except FileNotFoundError:
logger.error(f"The folder {file_path} does not exist.")
except OSError as e:
logger.error(f"Error: {file_path} : {e.strerror}")
# Continue with the deletion of the tool
else:
fs_instance = FileStorageHelper.initialize_file_storage(
type=FileStorageType.PERMANENT
)
file_path = PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
organization_id,
is_create=False,
user_id=self.created_by.user_id,
tool_id=str(self.tool_id),
)
try:
shutil.rmtree(file_path)
fs_instance.rm(file_path, True)
except FileNotFoundError:
logger.error(f"The folder {file_path} does not exist.")
except OSError as e:
logger.error(f"Error: {file_path} : {e.strerror}")
# Continue with the deletion of the tool
# Suppressed to handle cases when the remote
# file is missing or already deleted
pass
super().delete(*args, **kwargs)

class Meta:
Expand Down
117 changes: 85 additions & 32 deletions backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,18 @@
from prompt_studio.prompt_studio_v2.models import ToolStudioPrompt
from unstract.sdk.constants import LogLevel
from unstract.sdk.exceptions import IndexingError, SdkError
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
from unstract.sdk.index import Index
from unstract.sdk.prompt import PromptTool
from unstract.sdk.utils.tool_utils import ToolUtils
from utils.constants import FeatureFlag
from utils.file_storage.constants import FileStorageType
from utils.file_storage.helpers.common_file_helper import FileStorageHelper
from utils.file_storage.helpers.prompt_studio_file_helper import PromptStudioFileHelper
from utils.local_context import StateStore

from unstract.core.pubsub_helper import LogPublisher
from unstract.flags.feature_flag import check_feature_flag_status

CHOICES_JSON = "/static/select_choices.json"
ERROR_MSG = "User %s doesn't have access to adapter %s"
Expand Down Expand Up @@ -332,12 +338,22 @@ def index_document(
file_path = file_name
else:
default_profile = ProfileManager.get_default_llm_profile(tool)
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id,
is_create=False,
user_id=user_id,
tool_id=tool_id,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id,
is_create=False,
user_id=user_id,
tool_id=tool_id,
)
else:
file_path = (
PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
org_id,
is_create=False,
user_id=user_id,
tool_id=tool_id,
)
)
file_path = str(Path(file_path) / file_name)

if not tool:
Expand Down Expand Up @@ -615,24 +631,40 @@ def _execute_prompts_in_single_pass(

@staticmethod
def _get_document_path(org_id, user_id, tool_id, doc_name):
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
else:
doc_path = PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
return str(Path(doc_path) / doc_name)

@staticmethod
def _get_extract_or_summary_document_path(
org_id, user_id, tool_id, doc_name, doc_type
) -> str:
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
else:
doc_path = PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
extracted_doc_name = Path(doc_name).stem + TSPKeys.TXT_EXTENTION
return str(Path(doc_path) / doc_type / extracted_doc_name)

Expand Down Expand Up @@ -706,7 +738,6 @@ def _fetch_response(
monitor_llm: Optional[str] = None
challenge_llm_instance: Optional[AdapterInstance] = tool.challenge_llm
challenge_llm: Optional[str] = None

if monitor_llm_instance:
monitor_llm = str(monitor_llm_instance.id)
else:
Expand Down Expand Up @@ -734,17 +765,34 @@ def _fetch_response(
x2text = str(profile_manager.x2text.id)
if not profile_manager:
raise DefaultProfileError()
index_result = PromptStudioHelper.dynamic_indexer(
profile_manager=profile_manager,
file_path=doc_path,
tool_id=str(tool.tool_id),
org_id=org_id,
document_id=document_id,
is_summary=tool.summarize_as_source,
run_id=run_id,
user_id=user_id,
process_text=process_text,
)
if check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
harini-venkataraman marked this conversation as resolved.
Show resolved Hide resolved
fs_instance = FileStorageHelper.initialize_file_storage(
type=FileStorageType.PERMANENT
)
index_result = PromptStudioHelper.dynamic_indexer(
profile_manager=profile_manager,
file_path=doc_path,
tool_id=str(tool.tool_id),
org_id=org_id,
document_id=document_id,
is_summary=tool.summarize_as_source,
run_id=run_id,
user_id=user_id,
process_text=process_text,
fs=fs_instance,
)
else:
index_result = PromptStudioHelper.dynamic_indexer(
profile_manager=profile_manager,
file_path=doc_path,
tool_id=str(tool.tool_id),
org_id=org_id,
document_id=document_id,
is_summary=tool.summarize_as_source,
run_id=run_id,
user_id=user_id,
process_text=process_text,
)
if index_result.get("status") == IndexingStatus.PENDING_STATUS.value:
return {
"status": IndexingStatus.PENDING_STATUS.value,
Expand Down Expand Up @@ -814,8 +862,10 @@ def _fetch_response(
tool_settings[TSPKeys.PLATFORM_POSTAMBLE] = getattr(
settings, TSPKeys.PLATFORM_POSTAMBLE.upper(), ""
)

file_hash = ToolUtils.get_hash_from_file(file_path=doc_path)
if check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
file_hash = ToolUtils.get_hash_from_file(file_path=doc_path, fs=fs_instance)
else:
file_hash = ToolUtils.get_hash_from_file(file_path=doc_path)

payload = {
TSPKeys.TOOL_SETTINGS: tool_settings,
Expand Down Expand Up @@ -895,6 +945,7 @@ def dynamic_indexer(
reindex: bool = False,
run_id: str = None,
process_text: Optional[Callable[[str], str]] = None,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> Any:
"""Used to index a file based on the passed arguments.

Expand Down Expand Up @@ -941,6 +992,7 @@ def dynamic_indexer(
chunk_overlap=str(profile_manager.chunk_overlap),
file_path=file_path,
file_hash=None,
fs=fs,
harini-venkataraman marked this conversation as resolved.
Show resolved Hide resolved
)
if not reindex:
indexed_doc_id = DocumentIndexingService.get_indexed_document_id(
Expand Down Expand Up @@ -976,6 +1028,7 @@ def dynamic_indexer(
output_file_path=extract_file_path,
usage_kwargs=usage_kwargs.copy(),
process_text=process_text,
fs=fs,
harini-venkataraman marked this conversation as resolved.
Show resolved Hide resolved
)

PromptStudioIndexHelper.handle_index_manager(
Expand Down
Loading