import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Explicitly download the missing resource
nltk.download('averaged_perceptron_tagger_eng')
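# These NLTK resources are needed by the `unstructured` partitioners that
# UnstructuredURLLoader relies on for sentence tokenization and part-of-speech tagging.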
import os
import streamlit as st
import pickle
import faiss
import time
import numpy as np
from langchain_openai import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.schema import Document
import requests
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env (especially openai api key)
# api_key = os.getenv("OPENAI_API_KEY")
# if not api_key:
# st.error("OpenAI API Key not found. Ensure it's set in the environment variables.")
# else:
# st.write("OpenAI API Key loaded successfully.")
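# To run the app, use the Streamlit CLI, e.g. `streamlit run app.py`
# (substitute the actual filename of this script).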
st.title("Article/News Research Tool")
st.sidebar.title("Article URLs...")
# Initialize session state for Q&A history
if "qa_history" not in st.session_state:
    st.session_state.qa_history = []
# Ask the user how many URLs they want to input
num_urls = st.sidebar.number_input("How many URLs do you want to process?", min_value=1, max_value=10, value=2)
urls = []
for i in range(num_urls):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)
# urls = []
# for i in range(3):
# url = st.sidebar.text_input(f"URL {i+1}")
# urls.append(url)
process_url_clicked = st.sidebar.button("Process Article URLs")
# file_path = "faiss_store_openai.pkl"
#
main_placeholder = st.empty()
llm = OpenAI(temperature=0.5, max_tokens=500)
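# OpenAI() here is the completion-style LLM from langchain_openai; it reads OPENAI_API_KEY
# from the environment populated by load_dotenv() above.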
index_path = "faiss_index.bin"
docs_path = "docs.pkl"
index_to_docstore_id_path = "index_to_docstore_id.pkl"
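# Persistence layout: the raw FAISS index goes into a binary file, while the chunked
# documents and the index -> docstore-id mapping are pickled separately and recombined
# into a FAISS vector store at query time.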
if process_url_clicked:
    for url in urls:
        try:
            response = requests.get(url)
            print(f"Status for {url}: {response.status_code}")
        except Exception as e:
            print(f"Error accessing {url}: {e}")
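    # The requests check above is only a best-effort reachability probe; failures are
    # printed to the terminal and do not stop processing.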
    # load data
    loader = UnstructuredURLLoader(urls=urls)
    main_placeholder.text("Data Loading...Initiated...")
    try:
        data = loader.load()
        if not data:
            st.error("No data loaded from the provided URLs. Please check the URLs.")
            st.stop()  # Nothing to index, so end this run early
        else:
            for i, doc in enumerate(data):
                print(f"Document {i}: {doc.page_content[:100]}")  # Preview first 100 characters
    except Exception as e:
        print(f"Error loading URLs: {e}")
        st.error(f"Error loading URLs: {e}")
        st.stop()  # Without this, the splitter below would fail on undefined data
    # split data
    text_splitter = RecursiveCharacterTextSplitter(
        # separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
        chunk_overlap=200
    )
    main_placeholder.text("Text Splitter...Initiated...")
    docs = text_splitter.split_documents(data)
    print(f"Number of chunks created: {len(docs)}")
    for i, doc in enumerate(docs[:5]):  # Limit output
        print(f"Chunk {i}: {doc.page_content[:100]}")  # Preview first 100 characters
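    # Chunks are roughly 1000 characters each with 200 characters of overlap, so context
    # that spans a chunk boundary is not lost.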
    # create embeddings and save them to a FAISS index
    embeddings = OpenAIEmbeddings()
    embedding_dimension = 1536
    docstore_dict = {str(i): doc for i, doc in enumerate(docs)}
    docstore = InMemoryDocstore(docstore_dict)
    # Create FAISS vector index
    index = faiss.IndexFlatL2(embedding_dimension)
    # Initialize the FAISS vector store with a correct mapping
    index_to_docstore_id = {i: str(i) for i in range(len(docs))}
    vector_store = FAISS(embedding_function=embeddings, index=index, docstore=docstore,
                         index_to_docstore_id=index_to_docstore_id)
    # Add documents to the FAISS index
    for i, vector in enumerate([embeddings.embed_query(doc.page_content) for doc in docs]):
        print(f"Adding vector {i} to FAISS index.")
        index.add(np.array([vector], dtype="float32"))  # FAISS expects float32 vectors
    # vector_store.add_documents(docs)
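    # Note: chunks are embedded one at a time with embed_query; embeddings.embed_documents()
    # could batch these API calls, but the loop above is kept for per-vector progress logging.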
    main_placeholder.text("Embedding Vector Building Initiated...")
    # Save the FAISS index and documents separately
    # index_path = "faiss_index.bin"
    faiss.write_index(vector_store.index, index_path)
    # docs_path = "docs.pkl"
    with open(docs_path, "wb") as f:
        pickle.dump(docs, f)
    # Save the index_to_docstore_id mapping
    # index_to_docstore_id_path = "index_to_docstore_id.pkl"
    with open(index_to_docstore_id_path, "wb") as f:
        pickle.dump(vector_store.index_to_docstore_id, f)
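# The question box below reuses main_placeholder, so it takes over the slot used for the
# status messages shown during processing.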
query = main_placeholder.text_input("Question: ")
if query:
    if not process_url_clicked:
        # Load the FAISS index and documents
        if os.path.exists(index_path) and os.path.exists(docs_path) and os.path.exists(index_to_docstore_id_path):
            # st.write("Files loaded successfully.")
            # else:
            #     st.write("One or more files are missing:")
            #     if not os.path.exists(index_path):
            #         st.write(f"Missing: {index_path}")
            #     if not os.path.exists(docs_path):
            #         st.write(f"Missing: {docs_path}")
            #     if not os.path.exists(index_to_docstore_id_path):
            #         st.write(f"Missing: {index_to_docstore_id_path}")
            # st.write("Loading precomputed FAISS index and documents...")
            index = faiss.read_index(index_path)
            with open(docs_path, "rb") as f:
                docs = pickle.load(f)
            with open(index_to_docstore_id_path, "rb") as f:
                index_to_docstore_id = pickle.load(f)
            docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs)})
            # print(f"Loaded document store keys: {list(docstore._dict.keys())[:10]}")  # Debug output
            embeddings = OpenAIEmbeddings()  # Recreate embeddings object
            vector_store = FAISS(embedding_function=embeddings, index=index, docstore=docstore,
                                 index_to_docstore_id=index_to_docstore_id)
        else:
            st.error("Precomputed files are missing. Please process the URLs first.")
            st.stop()  # vector_store does not exist yet, so the chain below cannot be built
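    # RetrievalQAWithSourcesChain retrieves the most relevant chunks via the FAISS retriever
    # and has the LLM compose an answer along with the sources it drew from.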
    chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vector_store.as_retriever())
    result = chain.invoke({"question": query}, return_only_outputs=True)
    # st.write(f"Raw result: {result}")
    # result is a dictionary of the form {"answer": "...", "sources": "..."}
    # Extract and display the result
    answer = result.get("answer", "No answer found.")
    sources = result.get("sources", "")
    # Add to session state history
    st.session_state.qa_history.append({
        "question": query,
        "answer": answer,
        "sources": sources if sources else "No sources available.",
    })
    st.subheader("Response:")
    st.write(answer)
    # Display sources, if available
    if sources:
        st.subheader("Sources:")
        sources_list = sources.split("\n")  # Split the sources by newline
        for source in sources_list:
            st.write(source)
# Display all questions and answers from the session
if st.session_state.qa_history:
    st.write("---------------------------------------------------------------------")
    st.subheader("History:")
    for entry in st.session_state.qa_history:
        st.write(f"**Q:** {entry['question']}")
        st.write(f"**A:** {entry['answer']}")
        st.write(f"**Sources:** {entry['sources']}")