Create app.py
app.py
ADDED
@@ -0,0 +1,180 @@
import asyncio
import os
import nest_asyncio
import requests
import pandas as pd
import re
import numpy as np
import faiss
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM
#from langchain_ollama import OllamaEmbeddings
from langchain_groq import ChatGroq
from itertools import chain
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS


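# Pipeline overview:
#   1. process_urls()     - fetch pages with a headless Chromium browser and convert HTML to text
#   2. clean_text()       - strip whitespace and bracketed noise from each chunk
#   3. embed_text()       - encode text with the nomic-embed-text sentence-transformer
#   4. store_embeddings() - build a FAISS inner-product index over normalized embeddings
#   5. search_faiss()     - retrieve the most similar chunks for a query
#   6. query_llm()        - ask a Groq-hosted Llama model to answer from the retrieved chunks
# AsyncChromiumLoader drives Chromium through Playwright, so the browser must be installed
# (e.g. `playwright install chromium`), and query_llm() expects a GROQ_API_KEY environment variable.
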
async def process_urls(urls):
    # Load multiple URLs asynchronously with a headless Chromium browser
    loader = AsyncChromiumLoader(urls)
    docs = await loader.aload()

    # Transform the raw HTML into plain text
    text_transformer = Html2TextTransformer()
    transformed_docs = text_transformer.transform_documents(docs)

    # Split the text into chunks, splitting each document separately so metadata is retained
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
    split_docs_nested = [text_splitter.split_documents([doc]) for doc in transformed_docs]
    split_docs = list(chain.from_iterable(split_docs_nested))

    # Attach the source URL to each split document
    for doc in split_docs:
        doc.metadata["source_url"] = doc.metadata.get("source", "Unknown")  # Ensure URL metadata exists

    return split_docs

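# Note: RecursiveCharacterTextSplitter measures chunk_size and chunk_overlap in characters by
# default, so the splitter above produces roughly 5000-character chunks with 500 characters of overlap.
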
def clean_text(text):
    """Remove unnecessary whitespace, line breaks, and bracketed noise."""
    text = re.sub(r'\s+', ' ', text).strip()      # Collapse excessive whitespace
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)   # Remove bracketed/parenthesized text (e.g., [advert])
    return text

def embed_text(text_list):
    """Encode a list of strings into embedding vectors."""
    model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
    embeddings = model.encode(text_list)
    if embeddings is None or len(embeddings) == 0:
        raise ValueError("Embedding function returned an empty result.")
    return embeddings

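# Note: embed_text() instantiates the SentenceTransformer on every call; loading the model once
# (e.g. at module level) would avoid repeated initialization when embedding documents and queries.
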
def store_embeddings(docs):
    """Convert document text into embeddings and store them in FAISS."""
    # Keep only documents that actually carry text, so texts and sources stay aligned
    docs = [doc for doc in docs if getattr(doc, "page_content", None)]
    all_text = [clean_text(doc.page_content) for doc in docs]
    text_sources = [doc.metadata.get("source_url", "Unknown") for doc in docs]

    embeddings = embed_text(all_text)
    if embeddings is None or embeddings.size == 0:
        raise ValueError("Embedding function returned None or an empty array.")

    embeddings = np.array(embeddings, dtype=np.float32)
    # Normalize embeddings so the inner-product index behaves like cosine similarity
    faiss.normalize_L2(embeddings)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)  # Inner product (cosine similarity on normalized vectors)
    index.add(embeddings)

    return index, all_text, text_sources

def search_faiss(index, query_embedding, text_data, text_sources, top_k=5, min_score=0.5):
    """Return the top_k chunks whose similarity to the query is at least min_score."""
    # FAISS expects a 2-D float32 array of query vectors
    query_embedding = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_embedding)  # Normalize the query so scores are cosine similarities

    distances, indices = index.search(query_embedding, top_k)

    results = []
    if indices.size > 0:
        for i in range(len(indices[0])):
            if distances[0][i] >= min_score:  # Ignore weakly related results
                idx = indices[0][i]
                if idx < len(text_data):
                    results.append({"source": text_sources[idx], "content": text_data[idx]})

    return results

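# Because the stored vectors and the query are L2-normalized and the index uses inner product,
# the "distances" returned by index.search() are cosine similarities in [-1, 1];
# min_score=0.5 therefore filters out weakly related chunks.
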
def query_llm(index, text_data, text_sources, query):
    """Retrieve relevant chunks from FAISS and ask the Groq-hosted LLM to answer the query."""
    # Read the Groq API key from the environment rather than hard-coding it in the source
    groq_api = os.environ.get("GROQ_API_KEY")
    chat = ChatGroq(model="llama-3.2-1b-preview", groq_api_key=groq_api, temperature=0)

    # Embed the query
    query_embedding = embed_text([query])[0]

    # Search FAISS for relevant documents
    relevant_docs = search_faiss(index, query_embedding, text_data, text_sources, top_k=3)

    # If no relevant docs, return a default message
    if not relevant_docs:
        return "No relevant information found."

    # Query the LLM once per retrieved chunk
    responses = []
    for doc in relevant_docs:
        if isinstance(doc, dict) and "source" in doc and "content" in doc:
            source_url = doc["source"]
            content = doc["content"][:10000]
        else:
            print(f"Unexpected doc format: {doc}")  # Debugging print
            continue

        prompt = f"""
Based on the following content, answer the question: "{query}"

Content (from {source_url}):
{content}
"""
        response = chat.invoke(prompt)
        responses.append({"source": source_url, "response": response})

    return responses


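# Note: chat.invoke() returns a LangChain message object (an AIMessage); the answer text itself
# is available on its .content attribute if plain strings are preferred in the responses list.
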
# Example manual run, for debugging individual steps:
# urls = ["https://edition.cnn.com/", "https://www.bbc.com/", "https://www.vanguardngr.com/"]
# split_docs = await process_urls(urls)
# index, text_data, text_sources = store_embeddings(split_docs)
# query_embedding = embed_text([query])[0]
# relevant_docs = search_faiss(index, query_embedding, text_data, text_sources, top_k=3)
# response = query_llm(index, text_data, text_sources, query)
# print(response)


query = "Where is Nigeria located"

async def main():
    urls = ["https://en.wikipedia.org/wiki/Nigeria", "https://en.wikipedia.org/wiki/Ghana"]
    split_docs = await process_urls(urls)

    # Build the FAISS index once the documents are available
    index, text_data, text_sources = store_embeddings(split_docs)

    # query_llm() embeds the query itself, so no separate query embedding is needed here
    response = query_llm(index, text_data, text_sources, query)
    print(response)

# Allow nested event loops (e.g. when a loop is already running in a notebook), then run main()
nest_asyncio.apply()
asyncio.run(main())