Samizie committed on
Commit
00be01a
verified
1 Parent(s): b62ad69

Create app.py

Files changed (1)
  1. app.py +180 -0
app.py ADDED
@@ -0,0 +1,180 @@
import asyncio
import os
import re
from itertools import chain

import faiss
import nest_asyncio
import numpy as np
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from sentence_transformers import SentenceTransformer


async def process_urls(urls):
    """Load multiple URLs with a headless browser, convert the HTML to text, and split into chunks."""
    # Load multiple URLs asynchronously
    loader = AsyncChromiumLoader(urls)
    docs = await loader.aload()

    # Transform HTML to plain text
    text_transformer = Html2TextTransformer()
    transformed_docs = text_transformer.transform_documents(docs)

    # Split the text into chunks and retain metadata
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
    split_docs_nested = [text_splitter.split_documents([doc]) for doc in transformed_docs]
    split_docs = list(chain.from_iterable(split_docs_nested))

    # Attach the source URL to each split document
    for doc in split_docs:
        doc.metadata["source_url"] = doc.metadata.get("source", "Unknown")  # Ensure URL metadata exists

    return split_docs

def clean_text(text):
    """Remove unnecessary whitespace, line breaks, and special characters."""
    text = re.sub(r'\s+', ' ', text).strip()     # Collapse excessive whitespace
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove bracketed text (e.g., [advert])
    return text

def embed_text(text_list):
    """Embed a list of strings with the nomic-embed-text SentenceTransformer model."""
    model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
    embeddings = model.encode(text_list)
    if embeddings is None or len(embeddings) == 0:
        raise ValueError("Embedding function returned an empty result.")
    return embeddings

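# Note: embed_text reloads the SentenceTransformer on every call. If that becomes a
# bottleneck, one option is to cache the model in a module-level variable, e.g. via an
# illustrative helper like get_model() below (a sketch only, not used elsewhere in this file):
#
#     _model = None
#     def get_model():
#         global _model
#         if _model is None:
#             _model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
#         return _model
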
def store_embeddings(docs):
    """Convert document text into embeddings and store them in a FAISS index."""
    docs = [doc for doc in docs if getattr(doc, "page_content", None)]
    all_text = [clean_text(doc.page_content) for doc in docs]
    text_sources = [doc.metadata.get("source_url", "Unknown") for doc in docs]

    embeddings = embed_text(all_text)
    embeddings = np.array(embeddings, dtype=np.float32)

    # Normalize embeddings so inner-product search behaves like cosine similarity
    faiss.normalize_L2(embeddings)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)  # Inner product (cosine similarity on normalized vectors)
    index.add(embeddings)

    return index, all_text, text_sources

def search_faiss(index, query_embedding, text_data, text_sources, top_k=5, min_score=0.5):
    """Return up to top_k chunks whose similarity to the query is at least min_score."""
    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_embedding)  # Normalize query embedding for cosine similarity

    distances, indices = index.search(query_embedding, top_k)

    results = []
    for score, idx in zip(distances[0], indices[0]):
        if idx != -1 and score >= min_score and idx < len(text_data):  # Ignore irrelevant results
            results.append({"source": text_sources[idx], "content": text_data[idx]})

    return results

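# search_faiss returns a list of dicts shaped like
#     [{"source": "<source_url>", "content": "<matching chunk text>"}, ...]
# (an empty list if nothing clears min_score), which query_llm below consumes directly.
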
def query_llm(index, text_data, text_sources, query):
    """Retrieve relevant chunks from FAISS and ask the Groq-hosted LLM about each one."""
    # Read the Groq API key from the environment instead of hard-coding it in source control
    groq_api = os.environ.get("GROQ_API_KEY")
    chat = ChatGroq(model="llama-3.2-1b-preview", groq_api_key=groq_api, temperature=0)

    # Embed the query
    query_embedding = embed_text([query])[0]

    # Search FAISS for relevant documents
    relevant_docs = search_faiss(index, query_embedding, text_data, text_sources, top_k=3)

    # If no relevant docs, return a default message
    if not relevant_docs:
        return "No relevant information found."

    # Query the LLM with each retrieved chunk
    responses = []
    for doc in relevant_docs:
        source_url = doc["source"]
        content = doc["content"][:10000]

        prompt = f"""
        Based on the following content, answer the question: "{query}"

        Content (from {source_url}):
        {content}
        """
        response = chat.invoke(prompt)
        responses.append({"source": source_url, "response": response.content})

    return responses
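# query_llm therefore returns either the fallback string "No relevant information found."
# or a list of {"source": <url>, "response": <answer text>} entries, one per retrieved chunk.
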

# Other example sources:
# urls = ["https://edition.cnn.com/", "https://www.bbc.com/", "https://www.vanguardngr.com/"]

query = "Where is Nigeria located"

async def main():
    urls = ["https://en.wikipedia.org/wiki/Nigeria", "https://en.wikipedia.org/wiki/Ghana"]
    split_docs = await process_urls(urls)

    # Build the FAISS index once the documents are available
    index, text_data, text_sources = store_embeddings(split_docs)

    # query_llm embeds the query itself, so no separate query embedding is needed here
    response = query_llm(index, text_data, text_sources, query)
    print(response)

# Run the async pipeline (nest_asyncio lets asyncio.run work even when an event loop
# is already running, e.g. in a notebook)
nest_asyncio.apply()
asyncio.run(main())
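
# A minimal way to run this locally (assumptions: AsyncChromiumLoader needs Playwright's
# Chromium browser, GROQ_API_KEY must be set for ChatGroq, and exact package pins may differ):
#
#     pip install langchain langchain-community langchain-groq sentence-transformers faiss-cpu nest-asyncio playwright
#     playwright install chromium
#     export GROQ_API_KEY="..."
#     python app.py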