Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from datasets import load_dataset
|
3 |
+
import chromadb
|
4 |
+
import string
|
5 |
+
|
6 |
+
from openai import OpenAI
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
|
11 |
+
from scipy.spatial.distance import cosine
|
12 |
+
|
13 |
+
from typing import Dict, List
|
14 |
+
|
15 |
+
def merge_dataframes(dataframes):
    """Stack several dataframes row-wise and keep only the Q&A columns.

    Args:
        dataframes: The dataframes to combine, passed straight to ``pd.concat``.

    Returns:
        A dataframe with a fresh integer index containing exactly the
        columns "context", "questions" and "answers", in that order.
    """
    wanted_columns = ["context", "questions", "answers"]

    # ignore_index=True discards the per-frame indices and renumbers the rows.
    stacked = pd.concat(dataframes, ignore_index=True)

    # Selecting with a list drops every other column and fixes the column order.
    return stacked[wanted_columns]
|
23 |
+
|
24 |
+
def call_chatgpt(prompt: str) -> str:
    """
    Uses the OpenAI API to generate an AI response to a prompt.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so a
    real credential never has to live in the source file. When the variable is
    unset, the original placeholder key is used, so behaviour is unchanged.

    Args:
        prompt: A string representing the prompt to send to the OpenAI API.

    Returns:
        A string representing the AI's generated response.
    """
    import os  # function-scope import keeps this block self-contained

    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "123"))

    # Single-turn chat: a fixed system message followed by the caller's prompt.
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    # No `n` is passed, so the API's default choice count applies; use the first.
    return completion.choices[0].message.content
|
52 |
+
|
53 |
+
def openai_text_embedding(prompt: str) -> List[float]:
    """Return the OpenAI embedding vector for a piece of text.

    The original body referenced the module name ``openai``, which this file
    never imports (only ``from openai import OpenAI``), and used the pre-1.0
    ``openai.Embedding.create`` call. This version goes through the same
    ``OpenAI`` client class that ``call_chatgpt`` uses.

    Args:
        prompt: The text to embed.

    Returns:
        The embedding of ``prompt`` produced by the
        "text-embedding-ada-002" model.
    """
    import os  # function-scope import keeps this block self-contained

    # Same key policy as the chat helper: environment variable first, then the
    # file's placeholder key.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "123"))

    response = client.embeddings.create(
        input=prompt,
        model="text-embedding-ada-002",
    )

    # One input string was sent, so the first data entry is its embedding.
    return response.data[0].embedding
|
57 |
+
|
58 |
+
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """Score how similar two sentences are using their OpenAI embeddings.

    Args:
        sentence1: First sentence to compare.
        sentence2: Second sentence to compare.

    Returns:
        One minus the cosine distance between the two embedding vectors,
        i.e. their cosine similarity.
    """
    # Embed each sentence (first, then second) and turn the result into an array.
    vector_a, vector_b = (
        np.asarray(openai_text_embedding(text)) for text in (sentence1, sentence2)
    )

    # scipy's `cosine` is a distance, so subtract it from 1 to get a similarity.
    distance = cosine(vector_a, vector_b)
    return 1 - distance
|
71 |
+
|
72 |
+
def add_dist_score_column(
    dataframe: pd.DataFrame, sentence: str,
) -> pd.DataFrame:
    """Rank the rows of ``dataframe`` by similarity of their question to ``sentence``.

    A "stsopenai" column is written onto ``dataframe`` itself (the caller's
    object is modified), holding the similarity score of each row's
    "questions" value against ``sentence``.

    Args:
        dataframe: Dataframe with a "questions" column.
        sentence: The text every question is compared against.

    Returns:
        The five highest-scoring rows, best first (fewer if the dataframe
        has fewer than five rows).
    """

    def _score(question) -> float:
        # Questions are coerced to str before being scored.
        return calculate_sts_openai_score(str(question), sentence)

    dataframe["stsopenai"] = dataframe["questions"].apply(_score)

    ranked = dataframe.sort_values(by="stsopenai", ascending=False)

    top_five = ranked.iloc[:5, :]
    return top_five
|
83 |
+
|
84 |
+
def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
    """
    Flattens a question/answer DataFrame into a list of chat-style messages.

    Each row contributes two dictionaries, in this order:
    ``{"role": "user", "content": <questions value>}`` followed by
    ``{"role": "assistant", "content": <answers value>}``.

    Args:
        df: A pandas DataFrame with columns named 'questions' and 'answers'.

    Returns:
        A list with two dictionaries per row of ``df``, in row order; an
        empty list when ``df`` has no rows.
    """
    return [
        message
        for _, row in df.iterrows()
        # Emit the question first, then its answer, for every row.
        for message in (
            {"role": "user", "content": row["questions"]},
            {"role": "assistant", "content": row["answers"]},
        )
    ]
|
110 |
+
|
111 |
+
# ---- Sidebar: organisation / topic pickers and retrieval settings ----
st.sidebar.markdown("""This is an app to help you navigate the websites of YSA/Larkin Street""")

org = st.sidebar.selectbox("Which website do you want to ask?", ("YSA", "Larkin"))

# The topic list depends on the chosen organisation. Using `else` for the
# second organisation guarantees `domain` is bound on every path.
if org == "YSA":
    domain = st.sidebar.selectbox("What do you want to learn about?", ("About Us: Our Mission and Programs", "The Tiny House Empowerment Village", "How to Qualify/Apply to the Tiny House Village", "Our Team and Youth Leaders", "Our Supporters"))
else:
    domain = st.sidebar.selectbox("What do you want to learn about?", ("Domain1", "Domain2"))

# Retrieved rows whose distance is below this value are treated as relevant.
special_threshold = st.sidebar.number_input(
    "Insert a threshold for distances score to filter data (default 0.2):",
    value=0.2,
    placeholder="Type a number...",
)

# Number of nearest questions to retrieve per user prompt. The minimum is 1
# because the vector-store query rejects a non-positive result count.
n_results = st.sidebar.slider(
    "Insert n-results (default 5)",
    1, 10, 5
)

clear_button = st.sidebar.button("Clear Conversation", key="clear")

# Clearing the conversation resets the stored chat history.
if clear_button:
    st.session_state.messages = []
|
135 |
+
|
136 |
+
# Load the dataset from a provided source.
|
137 |
+
if domain == "About Us: Our Mission and Programs":
|
138 |
+
dataset = load_dataset(
|
139 |
+
"KeshavRa/About_YSA_Database"
|
140 |
+
)
|
141 |
+
elif domain == "The Tiny House Empowerment Village":
|
142 |
+
dataset = load_dataset(
|
143 |
+
"KeshavRa/Tiny_House_Village_Database"
|
144 |
+
)
|
145 |
+
elif domain == "How to Qualify/Apply for the Tiny House Village":
|
146 |
+
dataset = load_dataset(
|
147 |
+
"KeshavRa/Qualify_Apply_For_Village_Database"
|
148 |
+
)
|
149 |
+
elif domain == "Our Team and Youth Leaders":
|
150 |
+
dataset = load_dataset(
|
151 |
+
"KeshavRa/Our_Team_Youth_Leaders_Database"
|
152 |
+
)
|
153 |
+
elif domain == "Our Supporters":
|
154 |
+
dataset = load_dataset(
|
155 |
+
"KeshavRa/YSA_Supporters_Database"
|
156 |
+
)
|
157 |
+
else:
|
158 |
+
dataset = load_dataset(
|
159 |
+
"eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted"
|
160 |
+
)
|
161 |
+
initial_input = "Tell me about YSA"
|
162 |
+
|
163 |
+
# Initialize a new client for ChromaDB.
client = chromadb.Client()

# Generate a random integer in [1 billion, 10 billion). Integer bounds and an
# explicit 64-bit dtype are used because 10**10 does not fit in a 32-bit
# integer, which is the default integer type on some platforms.
random_number: int = int(np.random.randint(low=10**9, high=10**10, dtype=np.int64))

# Generate a random string consisting of 10 uppercase letters and digits.
random_string: str = "".join(
    np.random.choice(list(string.ascii_uppercase + string.digits), size=10)
)

# Combine the random number and random string into one identifier.
combined_string: str = f"{random_number}{random_string}"

# Create a new collection in ChromaDB with the combined string as its name.
collection = client.create_collection(combined_string)

# Store every training question in the collection.
with st.spinner("Loading, please be patient with us ... 🙏"):
    # Read the column once and reuse it for both the length and the documents.
    questions = list(dataset["train"]["questions"])
    L = len(questions)
    collection.add(
        ids=[str(i) for i in range(L)],  # IDs are the row positions as strings
        documents=questions,  # the questions are what gets searched
        metadatas=[{"type": "support"} for _ in range(L)],
    )

# Alias for the collection; not read anywhere in the visible code.
db = collection
|
190 |
+
|
191 |
+
st.title("Youth Homelessness Chatbot")

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# React to user input
if prompt := st.chat_input("Tell me about YSA"):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    question = prompt

    # Retrieve the stored questions closest to the user's question.
    results = collection.query(query_texts=question, n_results=n_results)

    # Collection ids are the stringified row positions, so convert them back
    # to integers to index into the dataset columns.
    idx = [int(i) for i in results["ids"][0]]

    # Read each column once instead of once per retrieved hit.
    train_split = dataset["train"]
    all_questions = train_split["questions"]
    all_answers = train_split["answers"]

    ref = pd.DataFrame(
        {
            "idx": idx,
            "questions": [all_questions[i] for i in idx],
            "answers": [all_answers[i] for i in idx],
            "distances": results["distances"][0],
        }
    )

    # Keep only hits closer than the sidebar threshold. If none qualify, fall
    # back to every retrieved hit and warn the user.
    filtered_ref = ref[ref["distances"] < special_threshold]
    if filtered_ref.shape[0] > 0:
        st.success("There are highly relevant information in our database.")
        ref_from_db_search = filtered_ref["answers"].str.cat(sep=" ")
        final_ref = filtered_ref
    else:
        st.warning(
            "The database may not have relevant information to help your question so please be aware of hallucinations."
        )
        ref_from_db_search = ref["answers"].str.cat(sep=" ")
        final_ref = ref

    # The retrieved answers are passed to the model as context for the question.
    engineered_prompt = f"""
        Based on the context: {ref_from_db_search},
        answer the user question: {question}.
    """

    answer = call_chatgpt(engineered_prompt)

    response = answer
    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        st.markdown(response)
        # Show the rows that were used as context.
        with st.expander("See reference:"):
            st.table(final_ref)
    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})
|