eagle0504 committed on
Commit 7f85f4f · verified · 1 Parent(s): f849363

Create app.py

Files changed (1): app.py +251 -0
app.py ADDED
@@ -0,0 +1,251 @@
import os
import string
from typing import Dict, List

import chromadb
import numpy as np
import pandas as pd
import streamlit as st
from datasets import load_dataset
from openai import OpenAI
from scipy.spatial.distance import cosine
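
# Overall flow of this app: load a question-answer dataset for the selected
# domain, index the questions in an in-memory ChromaDB collection, retrieve the
# stored questions closest to the user's query, and pass the matching answers
# to the OpenAI chat model as context for the final response.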

def merge_dataframes(dataframes):
    # Concatenate the list of dataframes
    combined_dataframe = pd.concat(dataframes, ignore_index=True)

    # Ensure that the resulting dataframe only contains the columns "context", "questions", "answers"
    combined_dataframe = combined_dataframe[["context", "questions", "answers"]]

    return combined_dataframe

def call_chatgpt(prompt: str) -> str:
    """
    Uses the OpenAI API to generate an AI response to a prompt.

    Args:
        prompt: A string representing the prompt to send to the OpenAI API.

    Returns:
        A string representing the AI's generated response.
    """

    # Use the OpenAI API to generate a response based on the input prompt.
    # Assumes the OPENAI_API_KEY environment variable (e.g., a Space secret) is set.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    # Extract the text from the first (and only) choice in the response output.
    ans = completion.choices[0].message.content

    # Return the generated AI response.
    return ans

def openai_text_embedding(prompt: str) -> List[float]:
    # Request a text embedding from the OpenAI v1 client API.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.embeddings.create(input=prompt, model="text-embedding-ada-002")
    return response.data[0].embedding

def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    # Compute sentence embeddings
    embedding1 = openai_text_embedding(sentence1)
    embedding2 = openai_text_embedding(sentence2)

    # Convert to arrays
    embedding1 = np.asarray(embedding1)
    embedding2 = np.asarray(embedding2)

    # Calculate cosine similarity between the embeddings
    similarity_score = 1 - cosine(embedding1, embedding2)

    return similarity_score

def add_dist_score_column(
    dataframe: pd.DataFrame,
    sentence: str,
) -> pd.DataFrame:
    # Score every stored question against the query sentence, then keep the top 5.
    dataframe["stsopenai"] = dataframe["questions"].apply(
        lambda x: calculate_sts_openai_score(str(x), sentence)
    )

    sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)

    return sorted_dataframe.iloc[:5, :]

def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
    """
    Reads in a pandas DataFrame and produces a list of chat-style message
    dictionaries, each with a 'role' and a 'content' key.

    Args:
        df: A pandas DataFrame with columns named 'questions' and 'answers'.

    Returns:
        A list of dictionaries, alternating between a user message holding the
        question and an assistant message holding the answer.
    """

    # Initialize an empty list to store the dictionaries
    result = []

    # Loop through each row of the DataFrame
    for index, row in df.iterrows():
        # Create message dictionaries for the current question and answer
        qa_dict_quest = {"role": "user", "content": row["questions"]}
        qa_dict_ans = {"role": "assistant", "content": row["answers"]}

        # Add the dictionaries to the result list
        result.append(qa_dict_quest)
        result.append(qa_dict_ans)

    # Return the list of dictionaries
    return result
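
# Note: merge_dataframes, openai_text_embedding, calculate_sts_openai_score,
# add_dist_score_column, and convert_to_list_of_dict are utility helpers that
# are not invoked in the Streamlit flow below; retrieval is handled by ChromaDB.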

st.sidebar.markdown(
    """This is an app to help you navigate the websites of YSA/Larkin Street."""
)

org = st.sidebar.selectbox("Which website do you want to ask?", ("YSA", "Larkin"))

if org == "YSA":
    domain = st.sidebar.selectbox(
        "What do you want to learn about?",
        (
            "About Us: Our Mission and Programs",
            "The Tiny House Empowerment Village",
            "How to Qualify/Apply for the Tiny House Village",
            "Our Team and Youth Leaders",
            "Our Supporters",
        ),
    )
elif org == "Larkin":
    domain = st.sidebar.selectbox(
        "What do you want to learn about?", ("Domain1", "Domain2")
    )

special_threshold = st.sidebar.number_input(
    "Distance-score threshold used to filter retrieved results (default 0.2):",
    value=0.2,
    placeholder="Type a number...",
)

n_results = st.sidebar.slider(
    "Number of results to retrieve (default 5)",
    1, 10, 5
)

clear_button = st.sidebar.button("Clear Conversation", key="clear")

if clear_button:
    st.session_state.messages = []

# Load the dataset from a provided source.
if domain == "About Us: Our Mission and Programs":
    dataset = load_dataset("KeshavRa/About_YSA_Database")
elif domain == "The Tiny House Empowerment Village":
    dataset = load_dataset("KeshavRa/Tiny_House_Village_Database")
elif domain == "How to Qualify/Apply for the Tiny House Village":
    dataset = load_dataset("KeshavRa/Qualify_Apply_For_Village_Database")
elif domain == "Our Team and Youth Leaders":
    dataset = load_dataset("KeshavRa/Our_Team_Youth_Leaders_Database")
elif domain == "Our Supporters":
    dataset = load_dataset("KeshavRa/YSA_Supporters_Database")
else:
    dataset = load_dataset(
        "eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted"
    )
    initial_input = "Tell me about YSA"
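
# Each dataset is expected to expose a "train" split with parallel "questions"
# and "answers" columns; the code below indexes the questions and looks the
# answers up by row position.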

# Initialize a new client for ChromaDB.
client = chromadb.Client()

# Generate a random number between 1 billion and 10 billion.
random_number: int = np.random.randint(low=1e9, high=1e10)

# Generate a random string consisting of 10 uppercase letters and digits.
random_string: str = "".join(
    np.random.choice(list(string.ascii_uppercase + string.digits), size=10)
)

# Combine the random number and random string into one identifier.
combined_string: str = f"{random_number}{random_string}"

# Create a new collection in ChromaDB with the combined string as its name.
collection = client.create_collection(combined_string)

# Embed and store the questions for this demo.
with st.spinner("Loading, please be patient with us ... 🙏"):
    L = len(dataset["train"]["questions"])
    collection.add(
        ids=[str(i) for i in range(0, L)],  # IDs must be strings
        documents=dataset["train"]["questions"],  # index the questions as documents
        metadatas=[{"type": "support"} for _ in range(0, L)],
    )
    db = collection
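
# Because create_collection was called without an embedding_function, ChromaDB
# embeds these documents with its built-in default embedding model (an
# all-MiniLM-L6-v2 sentence transformer) rather than the OpenAI embedding
# helper defined above.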

st.title("Youth Homelessness Chatbot")

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# React to user input
if prompt := st.chat_input("Tell me about YSA"):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    question = prompt

    # Retrieve the n_results stored questions closest to the user's query.
    results = collection.query(query_texts=question, n_results=n_results)

    # Map the returned string IDs back to integer row positions in the dataset.
    idx = results["ids"][0]
    idx = [int(i) for i in idx]
    ref = pd.DataFrame(
        {
            "idx": idx,
            "questions": [dataset["train"]["questions"][i] for i in idx],
            "answers": [dataset["train"]["answers"][i] for i in idx],
            "distances": results["distances"][0],
        }
    )
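    # ChromaDB reports smaller distances for closer matches, so the threshold
    # below keeps only the most relevant question-answer pairs.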
    filtered_ref = ref[ref["distances"] < special_threshold]
    if filtered_ref.shape[0] > 0:
        st.success("There is highly relevant information in our database.")
        ref_from_db_search = filtered_ref["answers"].str.cat(sep=" ")
        final_ref = filtered_ref
    else:
        st.warning(
            "The database may not contain information relevant to your question, so please be aware of possible hallucinations."
        )
        ref_from_db_search = ref["answers"].str.cat(sep=" ")
        final_ref = ref

    engineered_prompt = f"""
    Based on the context: {ref_from_db_search},
    answer the user question: {question}.
    """

    answer = call_chatgpt(engineered_prompt)

    response = answer
    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        st.markdown(response)
        with st.expander("See reference:"):
            st.table(final_ref)
    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})
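
# To run this app, make sure the OPENAI_API_KEY environment variable is set
# (for example, as a Hugging Face Spaces secret) and launch it with:
#     streamlit run app.py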