Spaces:

pankajsingh3012
/

rag_crawler

Sleeping

App Files Files Community

pankajsingh3012 committed on Jul 16, 2024

Commit

f47dd18

verified ·

1 Parent(s): 258e61c

Upload 2 files

Browse files

Files changed (2) hide show

app.py +124 -0
requirements.txt +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import streamlit as st
+from langchain_community.vectorstores import FAISS
+from langchain_community.document_loaders import TextLoader
+from langchain_huggingface  import HuggingFaceEmbeddings
+from langchain.chains import RetrievalQA
+from langchain_google_genai import GoogleGenerativeAI
+from langchain.prompts import PromptTemplate
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+import re
+from collections import deque
+import time
+import numpy as np
+# Crawling function
def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1,
          allowed_prefix: str = 'https://docs.nvidia.com/cuda/',
          max_queue_size: int = 10, timeout: float = 10.0):
    """Breadth-first crawl of pages reachable from *start_url*.

    Args:
        start_url: First page to fetch (depth 0).
        max_depth: Deepest link level to fetch; links are only followed from
            pages whose depth is below this value.
        delay: Seconds to sleep before every HTTP request.
        allowed_prefix: Only links whose absolute URL starts with this prefix
            are followed.
        max_queue_size: Link extraction for a page stops once the pending
            queue holds more than this many URLs, which bounds crawl size.
        timeout: Per-request timeout in seconds, so one unresponsive host
            cannot stall the whole crawl.

    Returns:
        A ``(results, crawled_urls)`` tuple. ``results`` is a list of
        ``(url, text)`` pairs for pages fetched successfully, with whitespace
        collapsed to single spaces. ``crawled_urls`` lists every URL that was
        attempted, including those that failed.
    """
    # Imported locally so this function does not depend on an edit to the
    # module-level import list.
    from urllib.parse import urldefrag

    results = []
    crawled_urls = []
    # Every URL ever queued; prevents both re-crawling and duplicate queue
    # entries that would waste the small queue budget.
    seen = {start_url}
    queue = deque([(start_url, 0)])

    while queue:
        url, depth = queue.popleft()
        if depth > max_depth:
            continue
        crawled_urls.append(url)

        try:
            time.sleep(delay)
            response = requests.get(url, timeout=timeout)
            # Treat 4xx/5xx as failures instead of indexing the error page.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error crawling {url}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        text = re.sub(r'\s+', ' ', soup.get_text()).strip()
        results.append((url, text))

        if depth >= max_depth:
            continue

        for link in soup.find_all('a', href=True):
            try:
                # Drop "#fragment" so in-page anchors map to one page.
                next_url = urldefrag(urljoin(url, link['href'])).url
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip the link.
                continue
            if next_url.startswith(allowed_prefix) and next_url not in seen:
                seen.add(next_url)
                queue.append((next_url, depth + 1))
            if len(queue) > max_queue_size:
                break

    return results, crawled_urls
+# Text chunking function
def chunk_text(text: str, max_chunk_size: int = 1000):
    """Split *text* into chunks of whole sentences.

    Sentences are detected by whitespace following ``.``, ``!`` or ``?`` and
    are packed greedily, joined by single spaces, so that each chunk is at
    most ``max_chunk_size`` characters long. A single sentence longer than
    ``max_chunk_size`` is not split; it becomes its own oversized chunk.

    Args:
        text: The text to split.
        max_chunk_size: Target maximum chunk length in characters.

    Returns:
        A list of non-empty chunk strings; empty or whitespace-only input
        yields an empty list.
    """
    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # len(" ".join(pending)), tracked incrementally

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Length of the chunk if this sentence were appended (one separator
        # space is needed only when the chunk already has content).
        candidate_len = pending_len + len(sentence) + (1 if pending else 0)
        if pending and candidate_len > max_chunk_size:
            chunks.append(" ".join(pending))
            pending = [sentence]
            pending_len = len(sentence)
        else:
            pending.append(sentence)
            pending_len = candidate_len

    if pending:
        chunks.append(" ".join(pending))
    return chunks
# Streamlit UI
import os  # needed only by this script section, to read the API key

st.title("CUDA Documentation QA System")

# Streamlit re-executes this script from the top on every widget interaction.
# A plain module-level variable would therefore be reset to None between the
# "crawl" click and the later question, so the index lives in session state.
if "vector_store" not in st.session_state:
    st.session_state["vector_store"] = None
vector_store = st.session_state["vector_store"]

# Crawling and processing the data
if st.button('Crawl CUDA Documentation'):
    with st.spinner('Crawling CUDA documentation...'):
        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
        st.write(f"Processed {len(crawled_data)} pages.")
        texts = []
        metadatas = []  # parallel to texts: remembers which page a chunk came from
        for url, text in crawled_data:
            for chunk in chunk_text(text, max_chunk_size=1024):
                if chunk:  # never index empty chunks
                    texts.append(chunk)
                    metadatas.append({"source": url})
        if not texts:
            # Building a FAISS index from zero texts fails; report it instead.
            st.error("No text could be extracted from the crawled pages.")
        else:
            st.success("Crawling and processing completed.")
            # Create embeddings
            embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                               model_kwargs={'device': 'cpu'})
            # Store embeddings in FAISS and keep the index across reruns
            vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
            st.session_state["vector_store"] = vector_store
            st.write("Embeddings stored in FAISS.")

# Asking questions
query = st.text_input("Enter your question about CUDA:")
if query:
    # SECURITY: the API key must never be committed to source control. It is
    # read from the environment (Hugging Face Spaces exposes repository
    # secrets as environment variables). Any key that was previously
    # hard-coded here should be treated as leaked and revoked.
    google_api_key = os.environ.get("GOOGLE_API_KEY")
    if vector_store is None:
        st.warning("Please crawl the CUDA documentation before asking a question.")
    elif not google_api_key:
        st.error("The GOOGLE_API_KEY environment variable is not set.")
    else:
        with st.spinner('Searching for an answer...'):
            # Initialize Google Generative AI
            llm = GoogleGenerativeAI(model='gemini-1.0-pro', google_api_key=google_api_key)
            # Create a PromptTemplate for the QA chain
            qa_prompt = PromptTemplate(
                template="Answer the following question based on the context provided:\n\n{context}\n\nQuestion: {question}\nAnswer:",
                input_variables=["context", "question"])
            # Create the retrieval QA chain. The "stuff" chain type inserts
            # the retrieved chunks into {context}; a custom prompt is passed
            # through chain_type_kwargs, not as the documents chain itself.
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=vector_store.as_retriever(),
                chain_type_kwargs={"prompt": qa_prompt},
                return_source_documents=True,
            )
            # RetrievalQA takes its input under "query" and returns the
            # answer under "result" (plus "source_documents" when requested).
            response = qa_chain.invoke({"query": query})
            st.write("**Answer:**")
            st.write(response['result'])
            st.write("**Source:**")
            source_urls = sorted({doc.metadata.get("source", "unknown")
                                  for doc in response['source_documents']})
            for source_url in source_urls:
                st.write(source_url)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+beautifulsoup4==4.10.0
+faiss-cpu==1.7.0
+numpy==1.21.5
+requests==2.26.0
+sentence-transformers==2.1.0
+streamlit==1.4.1
+torch==1.10.0
+transformers==4.13.0
+langchain==0.0.1
+langchain-community==0.0.1
+langchain-google-genai==0.0.1