# rag_crawler / app.py
import os
import re
import time
from collections import deque
from urllib.parse import urljoin

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAI

# Crawling function: breadth-first walk of pages under the CUDA docs root.
def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1):
    visited = set()
    results = []
    queue = deque([(start_url, 0)])
    crawled_urls = []
    while queue:
        url, depth = queue.popleft()
        if depth > max_depth or url in visited:
            continue
        visited.add(url)
        crawled_urls.append(url)
        try:
            time.sleep(delay)  # be polite between requests
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace
            results.append((url, text))
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    if next_url.startswith('https://docs.nvidia.com/cuda/') and next_url not in visited:
                        queue.append((next_url, depth + 1))
                    if len(queue) > 10:  # cap the frontier to keep the demo quick
                        break
        except Exception as e:
            print(f"Error crawling {url}: {e}")
    return results, crawled_urls
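
# Hypothetical invocation (output shape only, not real data), to show what
# the two return values look like:
#   pages, urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
#   pages -> [(url, page_text), ...]    urls -> list of URLs actually visited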

# Text chunking function: greedily pack whole sentences into chunks of at
# most max_chunk_size characters.
def chunk_text(text: str, max_chunk_size: int = 1000):
    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
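
# Worked example: chunk boundaries fall on sentence ends, so a tiny budget
# yields one sentence per chunk:
#   chunk_text("A. B. C.", max_chunk_size=4)  ->  ["A.", "B.", "C."]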

# Streamlit UI
st.title("CUDA Documentation QA System")

# Initialize session state
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None
if 'documents_loaded' not in st.session_state:
    st.session_state.documents_loaded = False

# Crawl and process the documentation
if st.button('Crawl CUDA Documentation'):
    with st.spinner('Crawling CUDA documentation...'):
        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
        st.write(f"Processed {len(crawled_data)} pages.")
        texts = []
        for url, text in crawled_data:
            chunks = chunk_text(text, max_chunk_size=1024)
            texts.extend(chunks)
        st.success("Crawling and processing completed.")

        # Create embeddings on CPU with a small sentence-transformers model
        embeddings = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'},
        )

        # Store embeddings in a FAISS index kept in session state
        st.session_state.vector_store = FAISS.from_texts(texts, embeddings)
        st.session_state.documents_loaded = True
        st.write("Embeddings stored in FAISS.")

# Ask questions
query = st.text_input("Enter your question about CUDA:")
if query and st.session_state.documents_loaded:
    with st.spinner('Searching for an answer...'):
        # Initialize Google Generative AI; read the key from the environment
        # rather than hard-coding a credential in source.
        llm = GoogleGenerativeAI(model='gemini-1.0-pro',
                                 google_api_key=os.environ["GOOGLE_API_KEY"])

        # PromptTemplate for the QA chain
        qa_prompt = PromptTemplate(
            template=("Answer the following question based on the context "
                      "provided:\n\nContext: {context}\n\nQuestion: {question}"
                      "\n\nAnswer:"),
            input_variables=["context", "question"],
        )

        # Create the retrieval QA chain. RetrievalQA expects its input under
        # the "query" key and returns the answer under "result".
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=st.session_state.vector_store.as_retriever(),
            chain_type_kwargs={"prompt": qa_prompt},
            return_source_documents=True,
        )
        response = qa_chain({"query": query})
        st.write("**Answer:**")
        st.write(response['result'])
        st.write("**Sources:**")
        for doc in response['source_documents']:
            st.write(doc.page_content[:200])
elif query:
    st.warning("Please crawl the CUDA documentation first.")
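
# To run locally (assumes streamlit and the langchain packages are installed):
#   export GOOGLE_API_KEY=...   # your Gemini API key
#   streamlit run app.py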