pankajsingh3012 committed
Commit 6ef000b · verified · 1 Parent(s): d400246

Update app.py

Files changed (1)
  1. app.py +124 -124
app.py CHANGED
@@ -1,124 +1,124 @@
import streamlit as st
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
- from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os  # added so the API key can be read from the environment
import re
from collections import deque
import time
import numpy as np


# Breadth-first crawl of the CUDA docs, collecting page text up to max_depth.
def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1):
    visited = set()
    results = []
    queue = deque([(start_url, 0)])
    crawled_urls = []

    while queue:
        url, depth = queue.popleft()

        if depth > max_depth or url in visited:
            continue

        visited.add(url)
        crawled_urls.append(url)

        try:
            time.sleep(delay)  # throttle requests to be polite to the server
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Flatten the page to plain text and collapse whitespace runs.
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()

            results.append((url, text))

            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    if next_url.startswith('https://docs.nvidia.com/cuda/') and next_url not in visited:
                        queue.append((next_url, depth + 1))
                    if len(queue) > 10:  # cap the frontier so the demo stays quick
                        break
        except Exception as e:
            print(f"Error crawling {url}: {e}")

    return results, crawled_urls


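# Example of the return shape (hypothetical values):
#   results, urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1)
#   results -> [("https://docs.nvidia.com/cuda/", "CUDA Toolkit ..."), ...]
#   urls    -> the URLs actually visited; with the frontier capped at 10
#              queued links, roughly a dozen pages are fetched.
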
# Greedy sentence packer: split on sentence boundaries and fill each chunk
# up to max_chunk_size characters.
def chunk_text(text: str, max_chunk_size: int = 1000):
    chunks = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


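# Worked example (hypothetical input): chunk_text("A. B. C.", max_chunk_size=5)
# returns ["A. B.", "C."] -- sentences are packed greedily until the next one
# would overflow the cap.
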
# Streamlit UI
st.title("CUDA Documentation QA System")

# Keep state in st.session_state: module-level globals are re-created on
# every Streamlit rerun, so a plain `vector_store = None` would discard the
# index as soon as the user typed a question.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
if "documents_loaded" not in st.session_state:
    st.session_state.documents_loaded = False

# Crawl and process the documentation.
if st.button('Crawl CUDA Documentation'):
    with st.spinner('Crawling CUDA documentation...'):
        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
        st.write(f"Processed {len(crawled_data)} pages.")

        # Keep each chunk's source URL so answers can cite it later.
        texts = []
        metadatas = []
        for url, text in crawled_data:
            chunks = chunk_text(text, max_chunk_size=1024)
            texts.extend(chunks)
            metadatas.extend({"source": url} for _ in chunks)
        st.success("Crawling and processing completed.")

        # Create embeddings with a small sentence-transformers model on CPU.
        embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                           model_kwargs={'device': 'cpu'})

        # Store the chunk embeddings in a FAISS index, kept in session state.
        st.session_state.vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        st.session_state.documents_loaded = True
        st.write("Embeddings stored in FAISS.")
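
        # Optional: persist the index so later sessions can reload it without
        # re-crawling ("faiss_index" is a hypothetical local path):
        #   st.session_state.vector_store.save_local("faiss_index")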

# Ask a question against the crawled index.
query = st.text_input("Enter your question about CUDA:")
if query and st.session_state.vector_store is not None:
    with st.spinner('Searching for an answer...'):
        # Initialize Google Generative AI; the key is read from the
        # environment instead of being hardcoded in the source.
        llm = GoogleGenerativeAI(model='gemini-1.0-pro',
                                 google_api_key=os.environ["GOOGLE_API_KEY"])

        # Prompt used to answer from the retrieved context.
        qa_prompt = PromptTemplate(
            template="Answer the following question based on the context provided:\n\n{context}\n\nQuestion: {question}\nAnswer:",
            input_variables=["context", "question"])

        # Build the retrieval QA chain. from_chain_type has no
        # combine_documents_chain parameter; the custom prompt is passed via
        # chain_type_kwargs, and "stuff" is the chain type that accepts a
        # plain {context}/{question} prompt.
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=st.session_state.vector_store.as_retriever(),
            chain_type_kwargs={"prompt": qa_prompt},
            return_source_documents=True
        )

        # RetrievalQA expects its input under "query" and returns the
        # answer under "result".
        response = qa_chain({"query": query})
        st.write("**Answer:**")
        st.write(response['result'])
        st.write("**Sources:**")
        for doc in response['source_documents']:
            st.write(doc.metadata.get("source", "unknown"))
elif query:
    st.warning("Crawl the documentation first so there is an index to search.")
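
# To run the app locally (assuming the packages imported above are installed
# and GOOGLE_API_KEY is set in the environment):
#   streamlit run app.py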