Update app.py
app.py
CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import TextLoader
-from
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
 from langchain_google_genai import GoogleGenerativeAI
 from langchain.prompts import PromptTemplate

The only change in this commit is line 4: the previous version left the import truncated at a bare "from", which is a SyntaxError, so the app could not start at all. The new version completes it to import HuggingFaceEmbeddings, which the embedding step below requires.
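The restored import can be smoke-tested on its own; a minimal sketch, assuming sentence-transformers is installed (all-MiniLM-L6-v2 produces 384-dimensional vectors):

from langchain_community.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                            model_kwargs={'device': 'cpu'})
vec = emb.embed_query("What is a CUDA kernel?")
print(len(vec))  # 384 for this model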
The complete updated app.py, with the bugs in the committed version fixed (each fix is flagged in a comment):

import os  # added so the API key can come from the environment instead of source code
import streamlit as st
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from collections import deque
import time
import numpy as np  # note: numpy and TextLoader are imported but never used

# Crawling function: breadth-first crawl from start_url, collecting page text
def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1):
    visited = set()
    results = []
    queue = deque([(start_url, 0)])
    crawled_urls = []

    while queue:
        url, depth = queue.popleft()

        if depth > max_depth or url in visited:
            continue

        visited.add(url)
        crawled_urls.append(url)

        try:
            time.sleep(delay)  # small pause between requests to be polite
            response = requests.get(url, timeout=10)  # timeout added so one slow page cannot hang the crawl
            soup = BeautifulSoup(response.text, 'html.parser')

            # Flatten the page to plain text and collapse whitespace runs
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()

            results.append((url, text))

            # Enqueue in-domain links until the small demo frontier cap is hit
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    if next_url.startswith('https://docs.nvidia.com/cuda/') and next_url not in visited:
                        queue.append((next_url, depth + 1))
                    if len(queue) > 10:
                        break
        except Exception as e:
            print(f"Error crawling {url}: {e}")

    return results, crawled_urls
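# Illustration (not part of the commit): the crawler can be sanity-checked
# standalone; with max_depth=1 and the 10-link frontier cap it visits at most
# about a dozen pages, e.g.:
#   pages, urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1)
#   print(len(urls), urls[:3])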
# Text chunking function: greedily pack sentences up to max_chunk_size characters
def chunk_text(text: str, max_chunk_size: int = 1000):
    chunks = []
    current_chunk = ""

    # Split on whitespace that follows sentence-ending punctuation
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
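# Illustration (not part of the commit): the chunker keeps whole sentences
# together, so with a tiny limit each sentence becomes its own chunk:
#   chunk_text("A. B. C.", max_chunk_size=4)  ->  ["A.", "B.", "C."]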
# Streamlit UI
st.title("CUDA Documentation QA System")

# Fix: keep state in st.session_state. Streamlit reruns the whole script on
# every interaction, so the original module-level `vector_store = None` was
# wiped out between the crawl button press and the question below.
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None
    st.session_state.documents_loaded = False

# Crawling and processing the data
if st.button('Crawl CUDA Documentation'):
    with st.spinner('Crawling CUDA documentation...'):
        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
        st.write(f"Processed {len(crawled_data)} pages.")

        texts = []
        metadatas = []
        for url, text in crawled_data:
            chunks = chunk_text(text, max_chunk_size=1024)
            texts.extend(chunks)
            metadatas.extend({'source': url} for _ in chunks)  # remember where each chunk came from
        st.success("Crawling and processing completed.")

        # Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                           model_kwargs={'device': 'cpu'})

        # Store embeddings in FAISS, keeping the source URL as per-chunk metadata
        st.session_state.vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        st.session_state.documents_loaded = True
        st.write("Embeddings stored in FAISS.")
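# Illustration (not part of the commit): once the index exists, raw retrieval
# can be inspected without the LLM, which helps debug bad answers:
#   docs = st.session_state.vector_store.similarity_search("What is a CUDA stream?", k=4)
#   for d in docs:
#       print(d.metadata.get('source'), d.page_content[:80])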
# Asking questions
query = st.text_input("Enter your question about CUDA:")
if query:
    if st.session_state.vector_store is None:
        st.warning("Crawl the documentation first so there is an index to search.")
    else:
        with st.spinner('Searching for an answer...'):
            # Initialize Google Generative AI. Fix: read the key from the
            # environment; the original committed a live API key in source,
            # which should be revoked and never hard-coded.
            llm = GoogleGenerativeAI(model='gemini-1.0-pro',
                                     google_api_key=os.environ["GOOGLE_API_KEY"])

            # Create a PromptTemplate for the QA chain
            qa_prompt = PromptTemplate(
                template="Answer the following question based on the context provided:\n\n{context}\n\nQuestion: {question}\nAnswer:",
                input_variables=["context", "question"])

            # Create the retrieval QA chain. Fix: the original passed the
            # PromptTemplate as combine_documents_chain (which expects a chain,
            # not a prompt) and used chain_type="map_rerank", whose prompt must
            # also emit a parsable score. A "stuff" chain with the custom prompt
            # passed via chain_type_kwargs is the supported wiring.
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=st.session_state.vector_store.as_retriever(),
                chain_type_kwargs={"prompt": qa_prompt},
                return_source_documents=True
            )

            # Fix: RetrievalQA takes its input under "query" and returns the
            # answer under "result", not "question"/"answer" as the original assumed
            response = qa_chain({"query": query})
            st.write("**Answer:**")
            st.write(response['result'])
            st.write("**Source:**")
            st.write(sorted({doc.metadata.get('source') for doc in response['source_documents']}))
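One further improvement worth considering, since the index above lives only in memory and is lost whenever the Space goes to sleep: LangChain's FAISS wrapper can persist to disk. A minimal sketch, with a directory name of my own choosing; recent langchain_community versions require allow_dangerous_deserialization=True when loading pickled metadata, so only load indexes you built yourself:

import os
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

INDEX_DIR = "faiss_cuda_index"  # hypothetical path, not part of the commit

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

if os.path.isdir(INDEX_DIR):
    # Reuse a previously built index instead of re-crawling
    vector_store = FAISS.load_local(INDEX_DIR, embeddings,
                                    allow_dangerous_deserialization=True)
else:
    # In the app this would be FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    vector_store = FAISS.from_texts(["placeholder chunk"], embeddings)
    vector_store.save_local(INDEX_DIR)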