Update app.py
app.py
CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import TextLoader
-from
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
 from langchain_google_genai import GoogleGenerativeAI
 from langchain.prompts import PromptTemplate

The only change in this commit is line 4: the previous version left the import truncated at a bare "from", which is a SyntaxError, so the app could not start at all. The new version completes it to import HuggingFaceEmbeddings, which the embedding step below requires.
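The restored import can be smoke-tested on its own; a minimal sketch, assuming sentence-transformers is installed (all-MiniLM-L6-v2 produces 384-dimensional vectors):

from langchain_community.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                            model_kwargs={'device': 'cpu'})
vec = emb.embed_query("What is a CUDA kernel?")
print(len(vec))  # 384 for this model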
The complete updated app.py, with the bugs in the committed version fixed (each fix is flagged in a comment):

import os  # added so the API key can come from the environment instead of source code
import streamlit as st
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from collections import deque
import time
import numpy as np  # note: numpy and TextLoader are imported but never used

# Crawling function: breadth-first crawl from start_url, collecting page text
def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1):
    visited = set()
    results = []
    queue = deque([(start_url, 0)])
    crawled_urls = []

    while queue:
        url, depth = queue.popleft()

        if depth > max_depth or url in visited:
            continue

        visited.add(url)
        crawled_urls.append(url)

        try:
            time.sleep(delay)  # small pause between requests to be polite
            response = requests.get(url, timeout=10)  # timeout added so one slow page cannot hang the crawl
            soup = BeautifulSoup(response.text, 'html.parser')

            # Flatten the page to plain text and collapse whitespace runs
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()

            results.append((url, text))

            # Enqueue in-domain links until the small demo frontier cap is hit
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    if next_url.startswith('https://docs.nvidia.com/cuda/') and next_url not in visited:
                        queue.append((next_url, depth + 1))
                    if len(queue) > 10:
                        break
        except Exception as e:
            print(f"Error crawling {url}: {e}")

    return results, crawled_urls
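# Illustration (not part of the commit): the crawler can be sanity-checked
# standalone; with max_depth=1 and the 10-link frontier cap it visits at most
# about a dozen pages, e.g.:
#   pages, urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1)
#   print(len(urls), urls[:3])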
# Text chunking function: greedily pack sentences up to max_chunk_size characters
def chunk_text(text: str, max_chunk_size: int = 1000):
    chunks = []
    current_chunk = ""

    # Split on whitespace that follows sentence-ending punctuation
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
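# Illustration (not part of the commit): the chunker keeps whole sentences
# together, so with a tiny limit each sentence becomes its own chunk:
#   chunk_text("A. B. C.", max_chunk_size=4)  ->  ["A.", "B.", "C."]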
# Streamlit UI
st.title("CUDA Documentation QA System")

# Fix: keep state in st.session_state. Streamlit reruns the whole script on
# every interaction, so the original module-level `vector_store = None` was
# wiped out between the crawl button press and the question below.
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None
    st.session_state.documents_loaded = False

# Crawling and processing the data
if st.button('Crawl CUDA Documentation'):
    with st.spinner('Crawling CUDA documentation...'):
        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
        st.write(f"Processed {len(crawled_data)} pages.")

        texts = []
        metadatas = []
        for url, text in crawled_data:
            chunks = chunk_text(text, max_chunk_size=1024)
            texts.extend(chunks)
            metadatas.extend({'source': url} for _ in chunks)  # remember where each chunk came from
        st.success("Crawling and processing completed.")

        # Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                           model_kwargs={'device': 'cpu'})

        # Store embeddings in FAISS, keeping the source URL as per-chunk metadata
        st.session_state.vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        st.session_state.documents_loaded = True
        st.write("Embeddings stored in FAISS.")
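# Illustration (not part of the commit): once the index exists, raw retrieval
# can be inspected without the LLM, which helps debug bad answers:
#   docs = st.session_state.vector_store.similarity_search("What is a CUDA stream?", k=4)
#   for d in docs:
#       print(d.metadata.get('source'), d.page_content[:80])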
# Asking questions
query = st.text_input("Enter your question about CUDA:")
if query:
    if st.session_state.vector_store is None:
        st.warning("Crawl the documentation first so there is an index to search.")
    else:
        with st.spinner('Searching for an answer...'):
            # Initialize Google Generative AI. Fix: read the key from the
            # environment; the original committed a live API key in source,
            # which should be revoked and never hard-coded.
            llm = GoogleGenerativeAI(model='gemini-1.0-pro',
                                     google_api_key=os.environ["GOOGLE_API_KEY"])

            # Create a PromptTemplate for the QA chain
            qa_prompt = PromptTemplate(
                template="Answer the following question based on the context provided:\n\n{context}\n\nQuestion: {question}\nAnswer:",
                input_variables=["context", "question"])

            # Create the retrieval QA chain. Fix: the original passed the
            # PromptTemplate as combine_documents_chain (which expects a chain,
            # not a prompt) and used chain_type="map_rerank", whose prompt must
            # also emit a parsable score. A "stuff" chain with the custom prompt
            # passed via chain_type_kwargs is the supported wiring.
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=st.session_state.vector_store.as_retriever(),
                chain_type_kwargs={"prompt": qa_prompt},
                return_source_documents=True
            )

            # Fix: RetrievalQA takes its input under "query" and returns the
            # answer under "result", not "question"/"answer" as the original assumed
            response = qa_chain({"query": query})
            st.write("**Answer:**")
            st.write(response['result'])
            st.write("**Source:**")
            st.write(sorted({doc.metadata.get('source') for doc in response['source_documents']}))
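One further improvement worth considering, since the index above lives only in memory and is lost whenever the Space goes to sleep: LangChain's FAISS wrapper can persist to disk. A minimal sketch, with a directory name of my own choosing; recent langchain_community versions require allow_dangerous_deserialization=True when loading pickled metadata, so only load indexes you built yourself:

import os
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

INDEX_DIR = "faiss_cuda_index"  # hypothetical path, not part of the commit

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

if os.path.isdir(INDEX_DIR):
    # Reuse a previously built index instead of re-crawling
    vector_store = FAISS.load_local(INDEX_DIR, embeddings,
                                    allow_dangerous_deserialization=True)
else:
    # In the app this would be FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    vector_store = FAISS.from_texts(["placeholder chunk"], embeddings)
    vector_store.save_local(INDEX_DIR)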