Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,313 +1,7 @@
|
|
1 |
import os
|
2 |
-
import re
|
3 |
-
import logging
|
4 |
-
import requests
|
5 |
-
import pandas as pd
|
6 |
-
from bs4 import BeautifulSoup
|
7 |
-
from langdetect import detect, DetectorFactory
|
8 |
-
from langdetect.lang_detect_exception import LangDetectException
|
9 |
-
import langid
|
10 |
-
from deep_translator import GoogleTranslator
|
11 |
-
import gradio as gr
|
12 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
-
from langchain_community.vectorstores import Chroma
|
14 |
-
from langchain.docstore.document import Document
|
15 |
-
from langchain_community.vectorstores.utils import filter_complex_metadata
|
16 |
-
from langchain_core.prompts import ChatPromptTemplate
|
17 |
-
from langchain_core.pydantic_v1 import BaseModel, Field
|
18 |
-
from langchain_openai import ChatOpenAI
|
19 |
-
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
20 |
-
from langchain_core.output_parsers import StrOutputParser
|
21 |
-
from operator import itemgetter
|
22 |
-
from langchain_community.tools.tavily_search import TavilySearchResults
|
23 |
-
from typing import List
|
24 |
-
from typing_extensions import TypedDict
|
25 |
-
from langgraph.graph import END, StateGraph
|
26 |
-
from langchain_openai import OpenAIEmbeddings
|
27 |
-
from langchain_community.document_loaders import UnstructuredURLLoader
|
28 |
-
from langchain_community.vectorstores import FAISS
|
29 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings
|
30 |
-
from langchain.memory import ConversationBufferMemory
|
31 |
-
from langchain.chains import create_retrieval_chain
|
32 |
-
from langchain.chains.combine_documents import create_stuff_documents_chain
|
33 |
-
from langchain.chains import create_history_aware_retriever
|
34 |
-
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
35 |
-
from langchain_core.messages import HumanMessage
|
36 |
|
37 |
-
#
|
38 |
-
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
# The Hugging Face token must be present in the environment; start-up is
# aborted with a ValueError when it is missing.
hf_api_token = os.environ.get('HF_API_TOKEN')
if hf_api_token is None:
    raise ValueError("HF_API_TOKEN environment variable not set")

# Pin langdetect's seed so repeated detections of the same text are reproducible.
DetectorFactory.seed = 0
|
52 |
-
|
53 |
-
# Function to translate text based on detected language
|
54 |
-
def translate_content(text):
    """Translate *text* between French and English based on its detected language.

    French input is translated to English and English input to French. Text
    detected as any other language is returned unchanged.

    Args:
        text: The string to translate.

    Returns:
        The translated string, or ``text`` itself when it is empty,
        whitespace-only or not a string, when the detected language is neither
        French nor English, when the translator returns nothing, or when
        detection or translation raises.
    """
    # Nothing can be detected in empty or non-string input (e.g. a missing
    # spreadsheet cell), so skip the detector and the translation call.
    if not isinstance(text, str) or not text.strip():
        return text

    # Target language for each supported source language.
    targets = {'fr': 'en', 'en': 'fr'}
    try:
        detected_lang = detect(text)
        target_lang = targets.get(detected_lang)
        if target_lang is None:
            return text
        translated = GoogleTranslator(source=detected_lang, target=target_lang).translate(text)
    except Exception as e:
        # Deliberate best effort: a detection or translation failure must not
        # stop the caller, so report it and fall back to the original text.
        logging.warning("Error detecting language or translating: %s", e)
        return text

    # Callers compare the result with the input; never hand them None.
    return translated if translated else text
|
66 |
-
|
67 |
-
# Function to chunk content
|
68 |
-
def chunk_content(content, chunk_size=1250, overlap=250):
    """Split *content* into fixed-size chunks that overlap their neighbours.

    Args:
        content: The sequence (normally a string) to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of slices of ``content`` in order. Consecutive chunks start
        ``chunk_size - overlap`` positions apart; empty content gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Either would make the start position
            stop advancing and the loop never terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    return [content[start:start + chunk_size] for start in range(0, len(content), step)]
|
77 |
-
|
78 |
-
# Chunk records collected from every source; each record is a dict with the
# keys "url", "language" and "chunk".
chunked_web_doc = []

# Load the Excel file and merge its 'prompt' and 'reference' columns. Missing
# cells become empty strings so one blank cell cannot break the string join
# performed further down.
df = pd.read_excel("UNTEanswers.xlsx")
df['merged_content'] = (
    df['prompt'].fillna("").astype(str) + " " + df['reference'].fillna("").astype(str)
)


def _detect_language(text):
    """Return the langdetect code for *text*, or "unknown" when detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


def _excel_chunk_records(text):
    """Chunk *text* into records tagged with the Excel source and its language."""
    # Detect once per text rather than once per chunk.
    language = _detect_language(text)
    return [
        {
            "url": "UNTEanswers.xlsx",  # Mark as coming from the Excel file
            "language": language,
            "chunk": chunk,
        }
        for chunk in chunk_content(text)
    ]


# Single pass over the rows: every row is translated exactly once, and the
# original plus its translation feed both the flat text list and the chunk
# records, in the same order as before (original first, translation second).
text_entries = []
for merged_content in df['merged_content']:
    if not merged_content.strip():
        continue  # row with no usable text

    text_entries.append(merged_content)
    chunked_web_doc.extend(_excel_chunk_records(merged_content))

    translated_content = translate_content(merged_content)
    if translated_content and translated_content != merged_content:
        text_entries.append(translated_content)
        chunked_web_doc.extend(_excel_chunk_records(translated_content))

# Convert the list of text entries into a single string
excel_text = "\n".join(text_entries)
|
126 |
-
|
127 |
-
# Read the previously fetched web content from disk (UTF-8 text).
# NOTE(review): the file name 'fetched_contentt.txt' (double "t") looks like a
# typo — confirm it matches the file that is actually deployed.
with open('fetched_contentt.txt', 'r', encoding='utf-8') as fetched_file:
    fetched_content = fetched_file.read()

# Fetched content first, then the Excel text, separated by a single newline.
content = "\n".join((fetched_content, excel_text))

# Keep a copy of the combined text on disk.
with open('merged_content.txt', 'w', encoding='utf-8') as merged_file:
    merged_file.write(content)
|
137 |
-
|
138 |
-
|
139 |
-
def _parse_web_block(block):
    """Extract the URL and the per-language text from one fetched-content block.

    A line starting with "URL:" sets the URL. The exact lines
    "English Content:" and "French Content:" switch the current language.
    Every other line (except "Title:" lines) is added to the current
    language's text; lines seen before any language header are ignored.

    Returns:
        A ``(url, sections)`` tuple where ``sections`` maps "en" and "fr" to
        the stripped text collected for that language ("" when there is none).
    """
    url = ""
    collected = {"en": [], "fr": []}
    language = None

    for line in block.strip().splitlines():
        if line.startswith("URL:"):
            # maxsplit=1 keeps the whole value even if it contains "URL:" again.
            url = line.split("URL:", 1)[1].strip()
        elif line.startswith("Title:"):
            continue  # the title is recognised but not indexed
        elif line == "English Content:":
            language = "en"
        elif line == "French Content:":
            language = "fr"
        elif language is not None:
            collected[language].append(line)

    sections = {lang: "\n".join(lines).strip() for lang, lines in collected.items()}
    return url, sections


# Blocks are separated by a line of 80 dashes followed by a blank line.
# NOTE(review): `content` also carries the Excel text appended after the
# fetched content, so that text ends up inside the last block here — confirm
# this is intended, since the Excel rows are already chunked separately.
web_contents = content.split("-" * 80 + "\n\n")

for block in web_contents:
    if not block.strip():
        continue

    url, sections = _parse_web_block(block)

    # English chunks first, then French, matching the original record order.
    for language in ("en", "fr"):
        section_text = sections[language]
        if not section_text:
            continue
        chunked_web_doc.extend(
            {"url": url, "language": language, "chunk": chunk}
            for chunk in chunk_content(section_text)
        )
|
182 |
-
|
183 |
-
# Embedding model, loaded on the CPU.
model_id = 'sentence-transformers/all-MiniLM-L6-v2'
model_kwargs = dict(device='cpu')
embeddings = HuggingFaceEmbeddings(model_name=model_id, model_kwargs=model_kwargs)

# Wrap every chunk record in a Document that carries its source URL and
# language as metadata.
documents = []
for record in chunked_web_doc:
    record_metadata = {"url": record['url'], "language": record['language']}
    documents.append(Document(page_content=record['chunk'], metadata=record_metadata))

# Index the documents in a Chroma collection stored under ./web_db, using
# cosine as the HNSW space.
# NOTE(review): if ./web_db survives restarts, this presumably adds every
# document again on each start — confirm.
chroma_db = Chroma.from_documents(
    documents=documents,
    collection_name='rag_web_db',
    embedding=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="./web_db",
)

# Retriever configured with k=3 and a score threshold of 0.3.
retriever_search_kwargs = {"k": 3, "score_threshold": 0.3}
similarity_threshold_retriever = chroma_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retriever_search_kwargs,
)

# Chat model shared by the chains built later in the file.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
207 |
-
|
208 |
-
|
209 |
-
################ history_aware_retriever ###################

from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instruction for the rewriting step: turn the latest user question
# into a standalone question without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message order: system instruction, prior conversation, then the new input.
contextualize_q_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(contextualize_q_messages)

# Retriever built from the chat model, the threshold retriever and the
# contextualising prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm,
    similarity_threshold_retriever,
    contextualize_q_prompt,
)
|
229 |
-
|
230 |
-
|
231 |
-
################ question_answer_chain #####################

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# System prompt for the answering step. The retrieved documents are inserted
# at {context}; a blank line separates them from the instructions so the
# context is no longer glued onto the final instruction sentence.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)


################ rag_chain #####################

# Full pipeline: history-aware retrieval feeding the question-answering chain.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# NOTE(review): ask() receives its history as an argument, so this
# module-level list looks unused — confirm before removing.
chat_history = []
|
258 |
-
|
259 |
-
def ask(question, chat_history):
    """Answer *question* with the RAG chain and update the conversation.

    Args:
        question: The user's question as typed in the UI.
        chat_history: List of ``(role, content)`` tuples with roles "user" and
            "assistant". It is extended in place with the new exchange.

    Returns:
        A 3-tuple ``(display_chat_history, chat_history, "")``: the history
        formatted for the chatbot component (with the source URLs appended to
        the latest answer), the updated internal history, and an empty string
        that clears the input box. A blank question returns the current
        conversation unchanged without calling the chain.
    """

    def build_display(history):
        # NOTE(review): gr.Chatbot pairs are documented as
        # (user_message, bot_message); here the question goes in the second
        # slot and the answer in the first. The placement is kept as it was —
        # confirm it is intended.
        return [
            (None, content) if role == "user" else (content, None)
            for role, content in history
        ]

    # A blank submission has nothing to answer: skip the LLM call entirely.
    if not question or not question.strip():
        return build_display(chat_history), chat_history, ""

    # Prepend a phrase to the question to ensure relevance to Moodle
    prepended_phrase = "using platform Moodle :"
    modified_question = prepended_phrase + question

    # The chain sees the history as it was before this exchange.
    ai_message = rag_chain.invoke({"input": modified_question, "chat_history": chat_history})
    answer = ai_message["answer"]

    # Store the exchange without sources, and with the user's wording rather
    # than the prefixed question.
    chat_history.append(("user", question))
    chat_history.append(("assistant", answer))

    # Source URLs of the retrieved documents: non-empty, de-duplicated, in
    # first-seen order (several chunks often come from the same page).
    document_links = list(dict.fromkeys(
        doc.metadata['url']
        for doc in ai_message.get('context', [])
        if doc.metadata.get('url')
    ))

    display_chat_history = build_display(chat_history)

    # Add sources to the last assistant message for display purposes only
    if document_links:
        document_links_text = "\n".join(document_links)
        last_answer = display_chat_history[-1][0]
        display_chat_history[-1] = (last_answer + f"\nSources: {document_links_text}", None)

    # Return display history for the UI, and the actual chat history for internal use
    return display_chat_history, chat_history, ""
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
def _reset_chat():
    """Return cleared values for the chatbot, the stored history and the textbox."""
    return [], [], ""


# Chat UI: a chatbot pane, a clear button, a question box and per-session
# history state.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot()
    clear_button = gr.Button("Clear")
    question = gr.Textbox(placeholder="Ask me anything about Moodle...")
    chat_history = gr.State([])

    # Submitting the textbox runs ask(); its three return values update the
    # chatbot, the stored history and the textbox respectively.
    question.submit(
        fn=ask,
        inputs=[question, chat_history],
        outputs=[chatbot, chat_history, question],
    )
    # The clear button resets the same three components.
    clear_button.click(
        fn=_reset_chat,
        inputs=None,
        outputs=[chatbot, chat_history, question],
        queue=False,
    )

demo.queue()
demo.launch(share=False)
|
|
|
1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
# Retrieve the application source stored in the "sec" environment variable.
code = os.getenv("sec")

# Fail with an explicit message instead of the opaque TypeError that
# exec(None) would raise when the secret is missing.
if not code:
    raise ValueError("sec environment variable not set")

# SECURITY NOTE(review): exec() runs whatever the "sec" variable contains with
# this process's full privileges, so anyone able to set that variable controls
# the application. Kept as-is because it is the file's whole purpose — confirm
# that access to the secret is restricted.
# Compiling under the pseudo-filename "<sec>" makes tracebacks name the source.
exec(compile(code, "<sec>", "exec"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|