Skip to content

Commit

Permalink
[FEAT] Remote File Storage for Prompt Studio (#827)
Browse files Browse the repository at this point in the history
* Add more tests

* Commit pdm.lock changes

* Check in with improvements

* Commit pdm.lock changes

* Add permanent storage here

* Commit pdm.lock changes

* Add skeleton for temporary fs

* Initial commit for Prompt studio file storage changes

* Prompt studio changes

* Commit pdm.lock changes

* Commit pdm.lock changes

* Commit pdm.lock changes

* Handling legacy file path

* Remove redundant tests

* Remove redundant tests

* PDM Lock files

* Encoding fixes for prompt studio k8 changes

* Reverting changes for v2

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Commit pdm.lock changes

* Revert unused utils

* Bump version to use rc2

* Change to legacy envs

* Change to legacy envs

* Remove unused exceptions

* Revert unused changes

* Enable feature flag for indexing

* Commit pdm.lock changes

* Adding feature flags to dynamic indexer

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Addition of doc strings

* Changing imports for Feature Flags

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Reverting SDK to rc1

* Commit pdm.lock changes

* Roll back pdm lock file changes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: harini-venkataraman <[email protected]>
Co-authored-by: gayathrivijayakumar <[email protected]>
Co-authored-by: gaya3-zipstack <[email protected]>
Co-authored-by: Kirtiman Mishra <[email protected]>
Co-authored-by: kirtimanmishrazipstack <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: harini-venkataraman <[email protected]>
Co-authored-by: Gayathri <[email protected]>
  • Loading branch information
9 people authored Dec 2, 2024
1 parent 2ea6cf0 commit 6a0a856
Show file tree
Hide file tree
Showing 15 changed files with 1,642 additions and 1,203 deletions.
2 changes: 2 additions & 0 deletions backend/file_management/file_management_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import magic
from connector_v2.models import ConnectorInstance
from deprecated import deprecated
from django.conf import settings
from django.http import StreamingHttpResponse
from file_management.exceptions import (
Expand Down Expand Up @@ -141,6 +142,7 @@ def upload_file(
remote_file.write(file.read())

@staticmethod
@deprecated(reason="Use remote FS APIs from SDK")
def fetch_file_contents(file_system: UnstractFileSystem, file_path: str) -> Any:
fs = file_system.get_fsspec_fs()
try:
Expand Down
764 changes: 394 additions & 370 deletions backend/pdm.lock

Large diffs are not rendered by default.

47 changes: 35 additions & 12 deletions backend/prompt_studio/prompt_studio_core_v2/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
from django.db.models import QuerySet
from file_management.file_management_helper import FileManagerHelper
from prompt_studio.prompt_studio_core_v2.constants import DefaultPrompts
from utils.file_storage.constants import FileStorageType
from utils.file_storage.helpers.common_file_helper import FileStorageHelper
from utils.file_storage.helpers.prompt_studio_file_helper import PromptStudioFileHelper
from utils.models.base_model import BaseModel
from utils.models.organization_mixin import (
DefaultOrganizationManagerMixin,
DefaultOrganizationMixin,
)

from backend.constants import FeatureFlag
from unstract.flags.feature_flag import check_feature_flag_status

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -133,20 +139,37 @@ class CustomTool(DefaultOrganizationMixin, BaseModel):

def delete(self, organization_id=None, *args, **kwargs):
# Delete the documents associated with the tool
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
organization_id,
is_create=False,
user_id=self.created_by.user_id,
tool_id=str(self.tool_id),
)
if organization_id:
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
file_path = FileManagerHelper.handle_sub_directory_for_prompt_studio(
organization_id,
is_create=False,
user_id=self.created_by.user_id,
tool_id=str(self.tool_id),
)
if organization_id:
try:
shutil.rmtree(file_path)
except FileNotFoundError:
logger.error(f"The folder {file_path} does not exist.")
except OSError as e:
logger.error(f"Error: {file_path} : {e.strerror}")
# Continue with the deletion of the tool
else:
fs_instance = FileStorageHelper.initialize_file_storage(
type=FileStorageType.PERMANENT
)
file_path = PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
organization_id,
is_create=False,
user_id=self.created_by.user_id,
tool_id=str(self.tool_id),
)
try:
shutil.rmtree(file_path)
fs_instance.rm(file_path, True)
except FileNotFoundError:
logger.error(f"The folder {file_path} does not exist.")
except OSError as e:
logger.error(f"Error: {file_path} : {e.strerror}")
# Continue with the deletion of the tool
# Supressed to handle cases when the remote
# file is missing or already deleted
pass
super().delete(*args, **kwargs)

class Meta:
Expand Down
187 changes: 133 additions & 54 deletions backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,18 @@
from prompt_studio.prompt_studio_v2.models import ToolStudioPrompt
from unstract.sdk.constants import LogLevel
from unstract.sdk.exceptions import IndexingError, SdkError
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
from unstract.sdk.index import Index
from unstract.sdk.prompt import PromptTool
from unstract.sdk.utils.tool_utils import ToolUtils
from utils.file_storage.constants import FileStorageType
from utils.file_storage.helpers.common_file_helper import FileStorageHelper
from utils.file_storage.helpers.prompt_studio_file_helper import PromptStudioFileHelper
from utils.local_context import StateStore

from backend.constants import FeatureFlag
from unstract.core.pubsub_helper import LogPublisher
from unstract.flags.feature_flag import check_feature_flag_status

CHOICES_JSON = "/static/select_choices.json"
ERROR_MSG = "User %s doesn't have access to adapter %s"
Expand Down Expand Up @@ -332,12 +338,22 @@ def index_document(
file_path = file_name
else:
default_profile = ProfileManager.get_default_llm_profile(tool)
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id,
is_create=False,
user_id=user_id,
tool_id=tool_id,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
file_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id,
is_create=False,
user_id=user_id,
tool_id=tool_id,
)
else:
file_path = (
PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
org_id,
is_create=False,
user_id=user_id,
tool_id=tool_id,
)
)
file_path = str(Path(file_path) / file_name)

if not tool:
Expand Down Expand Up @@ -615,24 +631,40 @@ def _execute_prompts_in_single_pass(

@staticmethod
def _get_document_path(org_id, user_id, tool_id, doc_name):
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
else:
doc_path = PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
return str(Path(doc_path) / doc_name)

@staticmethod
def _get_extract_or_summary_document_path(
org_id, user_id, tool_id, doc_name, doc_type
) -> str:
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
doc_path = FileManagerHelper.handle_sub_directory_for_tenants(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
else:
doc_path = PromptStudioFileHelper.get_or_create_prompt_studio_subdirectory(
org_id=org_id,
user_id=user_id,
tool_id=tool_id,
is_create=False,
)
extracted_doc_name = Path(doc_name).stem + TSPKeys.TXT_EXTENTION
return str(Path(doc_path) / doc_type / extracted_doc_name)

Expand Down Expand Up @@ -706,7 +738,6 @@ def _fetch_response(
monitor_llm: Optional[str] = None
challenge_llm_instance: Optional[AdapterInstance] = tool.challenge_llm
challenge_llm: Optional[str] = None

if monitor_llm_instance:
monitor_llm = str(monitor_llm_instance.id)
else:
Expand Down Expand Up @@ -734,17 +765,34 @@ def _fetch_response(
x2text = str(profile_manager.x2text.id)
if not profile_manager:
raise DefaultProfileError()
index_result = PromptStudioHelper.dynamic_indexer(
profile_manager=profile_manager,
file_path=doc_path,
tool_id=str(tool.tool_id),
org_id=org_id,
document_id=document_id,
is_summary=tool.summarize_as_source,
run_id=run_id,
user_id=user_id,
process_text=process_text,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
index_result = PromptStudioHelper.dynamic_indexer(
profile_manager=profile_manager,
file_path=doc_path,
tool_id=str(tool.tool_id),
org_id=org_id,
document_id=document_id,
is_summary=tool.summarize_as_source,
run_id=run_id,
user_id=user_id,
process_text=process_text,
)
else:
fs_instance = FileStorageHelper.initialize_file_storage(
type=FileStorageType.PERMANENT
)
index_result = PromptStudioHelper.dynamic_indexer(
profile_manager=profile_manager,
file_path=doc_path,
tool_id=str(tool.tool_id),
org_id=org_id,
document_id=document_id,
is_summary=tool.summarize_as_source,
run_id=run_id,
user_id=user_id,
process_text=process_text,
fs=fs_instance,
)
if index_result.get("status") == IndexingStatus.PENDING_STATUS.value:
return {
"status": IndexingStatus.PENDING_STATUS.value,
Expand Down Expand Up @@ -814,8 +862,10 @@ def _fetch_response(
tool_settings[TSPKeys.PLATFORM_POSTAMBLE] = getattr(
settings, TSPKeys.PLATFORM_POSTAMBLE.upper(), ""
)

file_hash = ToolUtils.get_hash_from_file(file_path=doc_path)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
file_hash = ToolUtils.get_hash_from_file(file_path=doc_path)
else:
file_hash = ToolUtils.get_hash_from_file(file_path=doc_path, fs=fs_instance)

payload = {
TSPKeys.TOOL_SETTINGS: tool_settings,
Expand Down Expand Up @@ -895,6 +945,7 @@ def dynamic_indexer(
reindex: bool = False,
run_id: str = None,
process_text: Optional[Callable[[str], str]] = None,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> Any:
"""Used to index a file based on the passed arguments.
Expand Down Expand Up @@ -933,15 +984,27 @@ def dynamic_indexer(
usage_kwargs["file_name"] = filename
util = PromptIdeBaseTool(log_level=LogLevel.INFO, org_id=org_id)
tool_index = Index(tool=util)
doc_id_key = tool_index.generate_index_key(
vector_db=vector_db,
embedding=embedding_model,
x2text=x2text_adapter,
chunk_size=str(profile_manager.chunk_size),
chunk_overlap=str(profile_manager.chunk_overlap),
file_path=file_path,
file_hash=None,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
doc_id_key = tool_index.generate_index_key(
vector_db=vector_db,
embedding=embedding_model,
x2text=x2text_adapter,
chunk_size=str(profile_manager.chunk_size),
chunk_overlap=str(profile_manager.chunk_overlap),
file_path=file_path,
file_hash=None,
)
else:
doc_id_key = tool_index.generate_index_key(
vector_db=vector_db,
embedding=embedding_model,
x2text=x2text_adapter,
chunk_size=str(profile_manager.chunk_size),
chunk_overlap=str(profile_manager.chunk_overlap),
file_path=file_path,
file_hash=None,
fs=fs,
)
if not reindex:
indexed_doc_id = DocumentIndexingService.get_indexed_document_id(
org_id=org_id, user_id=user_id, doc_id_key=doc_id_key
Expand All @@ -964,19 +1027,35 @@ def dynamic_indexer(
DocumentIndexingService.set_document_indexing(
org_id=org_id, user_id=user_id, doc_id_key=doc_id_key
)
doc_id: str = tool_index.index(
tool_id=tool_id,
embedding_instance_id=embedding_model,
vector_db_instance_id=vector_db,
x2text_instance_id=x2text_adapter,
file_path=file_path,
chunk_size=profile_manager.chunk_size,
chunk_overlap=profile_manager.chunk_overlap,
reindex=reindex,
output_file_path=extract_file_path,
usage_kwargs=usage_kwargs.copy(),
process_text=process_text,
)
if not check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE):
doc_id: str = tool_index.index(
tool_id=tool_id,
embedding_instance_id=embedding_model,
vector_db_instance_id=vector_db,
x2text_instance_id=x2text_adapter,
file_path=file_path,
chunk_size=profile_manager.chunk_size,
chunk_overlap=profile_manager.chunk_overlap,
reindex=reindex,
output_file_path=extract_file_path,
usage_kwargs=usage_kwargs.copy(),
process_text=process_text,
)
else:
doc_id: str = tool_index.index(
tool_id=tool_id,
embedding_instance_id=embedding_model,
vector_db_instance_id=vector_db,
x2text_instance_id=x2text_adapter,
file_path=file_path,
chunk_size=profile_manager.chunk_size,
chunk_overlap=profile_manager.chunk_overlap,
reindex=reindex,
output_file_path=extract_file_path,
usage_kwargs=usage_kwargs.copy(),
process_text=process_text,
fs=fs,
)

PromptStudioIndexHelper.handle_index_manager(
document_id=document_id,
Expand Down
Loading

0 comments on commit 6a0a856

Please sign in to comment.