Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge main to colbert #3

Open
wants to merge 26 commits into
base: colbert
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
experiments
colbert.ipynb
test.ipynb
.venv
__pycache__
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
__pycache__
.env
.venv
src/prepline_sec_filings/__pycache__
experiments
colbert-env
sec-earnings-call
sec-earnings-call
sec-earnings-call-openai
17 changes: 17 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Image that serves the Streamlit demo (Intro.py) on port 8501.
FROM python:3.11-slim-bullseye

# The slim base image does not ship curl, which the HEALTHCHECK below relies on.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Requirements are copied and installed before the application source is added.
COPY ./requirements.txt .

RUN pip3 install --upgrade pip

RUN pip3 --no-cache-dir install -r requirements.txt

WORKDIR /src
COPY . /src
EXPOSE 8501

# Probe Streamlit's health endpoint; --fail makes curl exit non-zero on HTTP errors.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT [ "streamlit", "run" ]
CMD [ "Intro.py", "--server.fileWatcherType", "none", "--browser.gatherUsageStats", "false", "--server.address", "0.0.0.0"]
17 changes: 17 additions & 0 deletions Dockerfile_Streamlit
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Image that serves the Streamlit demo (Intro.py) on port 8501.
FROM python:3.11-slim-bullseye

# The slim base image does not ship curl, which the HEALTHCHECK below relies on.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Requirements are copied and installed before the application source is added.
COPY ./requirements.txt .

RUN pip3 install --upgrade pip

RUN pip3 --no-cache-dir install -r requirements.txt

WORKDIR /src
COPY . /src
EXPOSE 8501

# Probe Streamlit's health endpoint; --fail makes curl exit non-zero on HTTP errors.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT [ "streamlit", "run" ]
CMD [ "Intro.py", "--server.fileWatcherType", "none", "--browser.gatherUsageStats", "false", "--server.address", "0.0.0.0"]
16 changes: 13 additions & 3 deletions Intro.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,38 @@
import streamlit as st
from src.vectorDatabase import create_database
from src.vectorDatabaseDocker import create_database
from datetime import datetime

curr_year = datetime.now().year
ticker = st.text_input(label="Ticker")
year = st.text_input(label="Year")

if year != "":
int_year = int(float(year))

if ticker != "" and year != "":
submit_button = st.button(label="Submit")
if ticker != "" and year != "" and submit_button:
if curr_year == int_year:
curr_year_bool = True
else:
curr_year_bool = False
(
qdrant_client,
encoder,
speakers_list_1,
speakers_list_2,
speakers_list_3,
speakers_list_4,
sec_form_names,
earnings_call_quarter_vals,
) = create_database(ticker=ticker, year=int_year)
st.write("Created the database")

st.session_state["ticker"] = ticker
st.session_state["year"] = str(year)
st.session_state["qdrant_client"] = qdrant_client
st.session_state["encoder"] = encoder
st.session_state["speaker_list_1"] = speakers_list_1
st.session_state["speaker_list_2"] = speakers_list_2
st.session_state["speaker_list_3"] = speakers_list_3
st.session_state["speaker_list_4"] = speakers_list_4
st.session_state["sec_form_names"] = sec_form_names
st.session_state["earnings_call_quarter_vals"] = earnings_call_quarter_vals
1,375 changes: 1,375 additions & 0 deletions Modal.ipynb

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
## INSTRUCTIONS TO RUN

1. Start the Qdrant Docker container by running the command below (see the [Qdrant installation guide](https://qdrant.tech/documentation/guides/installation/#docker-and-docker-compose) for more options):
```
docker run -p 6333:6333 \
-v $(pwd)/path/to/data:/qdrant/storage \
qdrant/qdrant
```

2. Install the required packages with `pip install -r requirements.txt`.

3. Launch the Streamlit demo:

```
streamlit run Intro.py
```


29 changes: 28 additions & 1 deletion colbert.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,33 @@
"sd = build_index(\"AAPL\",2023)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"y\n"
]
}
],
"source": [
"import os\n",
"from src.config import *\n",
"from qdrant_client import QdrantClient\n",
"\n",
"QdrantClient(\n",
" \n",
")\n",
"ticker = \"AAPL\"\n",
"year = 2023\n",
"\n",
"if os.path.exists(f\"{DATABASE_FOLDER}/{ticker}-{year}-db\"):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -1406,7 +1433,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
"version": "3.8.0"
}
},
"nbformat": 4,
Expand Down
117 changes: 117 additions & 0 deletions modal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import pathlib

import modal

# Read requirements.txt at import time. The path is relative to the current
# working directory, so importing this module from elsewhere raises
# FileNotFoundError.
with open("requirements.txt") as f:
    r = f.read()

# One entry per line of the file; blank and comment lines are kept as-is.
# NOTE(review): packages_list is not referenced anywhere else in this file --
# the image definition below lists its pins by hand. Confirm it is still needed.
packages_list = r.split("\n")

# Container image for the Modal deployment: Debian slim with git (required by
# the git+https install at the end) and the pinned Python dependencies.
image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .pip_install(
        "aiohttp==3.8.5",
        "fastapi==0.109.2",
        "langchain==0.1.5",
        "nltk==3.8.1",
        "numpy==1.25.0",
        "openai==0.27.8",
        "pandas==2.0.3",
        # pydantic==1.10.12 is disabled. It has to be a real Python comment:
        # the string "# pydantic==1.10.12" passed as an argument is handed to
        # pip as a package name, which is not a valid requirement specifier.
        "python-dotenv==1.0.1",
        "qdrant_client==1.7.3",
        "ratelimit==2.2.1",
        "Requests==2.31.0",
        "scikit_learn==1.3.0",
        "sentence_transformers==2.3.1",
        "starlette==0.36.3",
        "streamlit==1.24.1",
        "tenacity==8.2.2",
        "torch==2.0.1",
        "tqdm==4.65.0",
        "typing_extensions==4.9.0",
        "unstructured==0.8.1",
        "uvicorn",
        "streamlit-feedback==0.1.3",
    )
    # Use fork until https://github.com/valohai/asgiproxy/pull/11 is merged.
    .pip_install("git+https://github.com/modal-labs/asgiproxy.git")
)

# Modal stub the web function below is registered on, created with `image`.
stub = modal.Stub(name="Financial-Docs-LLM", image=image)

# Where the Streamlit entry script lives locally (next to this file) and the
# path it is mounted at remotely.
streamlit_script_local_path = pathlib.Path(__file__).parent / "Intro.py"
streamlit_script_remote_path = pathlib.Path("/root/Intro.py")

# Fail at import time, with an actionable message, if the script is missing.
if not streamlit_script_local_path.exists():
    raise RuntimeError(
        "Intro.py not found! Place the script with your streamlit app in the same directory."
    )

# Maps the local Intro.py onto the remote path.
# NOTE(review): only this single file is mounted; if Intro.py imports project
# modules (e.g. from src/), confirm they are available in the container.
streamlit_script_mount = modal.Mount.from_local_file(
    streamlit_script_local_path,
    streamlit_script_remote_path,
)

# Address the Streamlit subprocess is started with and the proxy forwards to.
# PORT is a string because it is passed straight through as a CLI argument.
HOST = "127.0.0.1"
PORT = "8000"


def spawn_server():
    """Start Streamlit as a subprocess and block until it accepts connections.

    Returns:
        The ``subprocess.Popen`` handle of the running Streamlit process.

    Raises:
        RuntimeError: If the Streamlit process exits before it starts
            accepting connections on ``HOST:PORT``.
    """
    import socket
    import subprocess
    import time

    process = subprocess.Popen(
        [
            "streamlit",
            "run",
            str(streamlit_script_remote_path),
            "--browser.serverAddress",
            HOST,
            "--server.port",
            PORT,
            "--browser.serverPort",
            PORT,
            "--server.enableCORS",
            "false",
        ]
    )

    # Poll until webserver accepts connections before running inputs.
    while True:
        try:
            socket.create_connection((HOST, int(PORT)), timeout=1).close()
        except (socket.timeout, ConnectionRefusedError):
            # Check if launcher webserving process has exited.
            # If so, a connection can never be made.
            retcode = process.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
            # A refused connection fails immediately, so pause briefly between
            # attempts instead of spinning a CPU core while Streamlit boots.
            time.sleep(0.1)
        else:
            print("Webserver ready!")
            return process


@stub.function(
    # Allows 100 concurrent requests per container.
    allow_concurrent_inputs=100,
    mounts=[streamlit_script_mount],
)
@modal.asgi_app()
def run():
    """Launch the Streamlit subprocess and return an ASGI app that proxies to it."""
    from asgiproxy.config import BaseURLProxyConfigMixin, ProxyConfig
    from asgiproxy.context import ProxyContext
    from asgiproxy.simple_proxy import make_simple_proxy_app

    # The upstream server must be accepting connections before the proxy is built.
    spawn_server()

    class Config(BaseURLProxyConfigMixin, ProxyConfig):
        # Forward every request to the local Streamlit server and present it
        # with the Host header it expects.
        upstream_base_url = f"http://{HOST}:{PORT}"
        rewrite_host_header = f"{HOST}:{PORT}"

    return make_simple_proxy_app(ProxyContext(Config()))
14 changes: 13 additions & 1 deletion pages/EarningsChat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from src.chat_earnings_call import get_openai_answer_earnings_call
from src.queryDatabase import query_database_earnings_call

from streamlit_feedback import streamlit_feedback
import streamlit as st
from dotenv import load_dotenv
import openai
Expand All @@ -17,10 +17,15 @@
speaker_list_1 = st.session_state["speaker_list_1"]
speakers_list_2 = st.session_state["speaker_list_2"]
speakers_list_3 = st.session_state["speaker_list_3"]
speakers_list_4 = st.session_state["speaker_list_4"]
earnings_call_quarter_vals = st.session_state["earnings_call_quarter_vals"]
quarter = st.selectbox("Quarter Name", tuple(earnings_call_quarter_vals))

st.session_state["quarter"] = quarter
ticker = st.session_state["ticker"]
year = st.session_state["year"]

st.title(f"{ticker}-{year}")


def generate_response(input_text):
Expand All @@ -31,6 +36,8 @@ def generate_response(input_text):
speakers_list = speakers_list_2
elif quarter == "Q3":
speakers_list = speakers_list_3
elif quarter == "Q4":
speakers_list = speakers_list_4

relevant_text = query_database_earnings_call(
input_text, quarter, qdrant_client, encoder, speakers_list
Expand Down Expand Up @@ -64,5 +71,10 @@ def generate_response(input_text):
st.write(docs)
expander = st.expander("See relevant sources")
expander.write(relevant_text)
feedback = streamlit_feedback(
feedback_type="thumbs",
optional_text_label="Please describe the feedback in detail",
)
# print(feedback)
message = {"role": "assistant", "content": docs}
st.session_state.messages.append(message)
10 changes: 9 additions & 1 deletion pages/SECChat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from src.chat_sec import get_openai_answer_sec
from src.queryDatabase import query_database_sec

from streamlit_feedback import streamlit_feedback
import streamlit as st
from dotenv import load_dotenv
import openai
Expand All @@ -16,6 +16,10 @@
sec_form_names = st.session_state["sec_form_names"]
form_name = st.selectbox("Form Name", tuple(sec_form_names))
st.session_state["form_name"] = form_name
ticker = st.session_state["ticker"]
year = st.session_state["year"]

st.title(f"{ticker}-{year}")


def generate_response(input_text):
Expand Down Expand Up @@ -52,5 +56,9 @@ def generate_response(input_text):
st.write(docs)
expander = st.expander("See relevant sources")
expander.write(relevant_text)
feedback = streamlit_feedback(
feedback_type="thumbs",
optional_text_label="Please describe the feedback in detail",
)
message = {"role": "assistant", "content": docs}
st.session_state.messages.append(message)
10 changes: 5 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
aiohttp==3.8.4
aiohttp==3.8.5
fastapi==0.109.2
langchain==0.1.5
nltk==3.8.1
numpy==1.25.0
openai==0.27.8
openai==1.26.0
pandas==2.0.3
pydantic==1.10.12
pydantic_settings==2.1.0
# pydantic==1.10.12
python-dotenv==1.0.1
qdrant_client==1.7.3
ratelimit==2.2.1
Requests==2.31.0
scikit_learn==1.3.0
sentence_transformers==2.3.1
starlette==0.37.0
starlette==0.36.3
streamlit==1.24.1
tenacity==8.2.2
torch==2.0.1
tqdm==4.65.0
typing_extensions==4.9.0
unstructured==0.8.1
uvicorn
streamlit-feedback==0.1.3
Loading